Diffstat (limited to 'fs')
-rw-r--r-- fs/Kconfig | 3
-rw-r--r-- fs/Makefile | 2
-rw-r--r-- fs/autofs4/autofs_i.h | 72
-rw-r--r-- fs/autofs4/dev-ioctl.c | 57
-rw-r--r-- fs/autofs4/expire.c | 84
-rw-r--r-- fs/autofs4/init.c | 10
-rw-r--r-- fs/autofs4/inode.c | 52
-rw-r--r-- fs/autofs4/root.c | 165
-rw-r--r-- fs/autofs4/symlink.c | 11
-rw-r--r-- fs/autofs4/waitq.c | 78
-rw-r--r-- fs/block_dev.c | 6
-rw-r--r-- fs/btrfs/backref.c | 12
-rw-r--r-- fs/btrfs/check-integrity.c | 12
-rw-r--r-- fs/btrfs/compression.h | 9
-rw-r--r-- fs/btrfs/ctree.c | 36
-rw-r--r-- fs/btrfs/ctree.h | 87
-rw-r--r-- fs/btrfs/delayed-inode.c | 10
-rw-r--r-- fs/btrfs/delayed-ref.c | 12
-rw-r--r-- fs/btrfs/dev-replace.c | 134
-rw-r--r-- fs/btrfs/dev-replace.h | 7
-rw-r--r-- fs/btrfs/disk-io.c | 73
-rw-r--r-- fs/btrfs/extent-tree.c | 40
-rw-r--r-- fs/btrfs/extent_io.c | 40
-rw-r--r-- fs/btrfs/extent_io.h | 5
-rw-r--r-- fs/btrfs/extent_map.c | 8
-rw-r--r-- fs/btrfs/file-item.c | 103
-rw-r--r-- fs/btrfs/file.c | 158
-rw-r--r-- fs/btrfs/inode-map.c | 3
-rw-r--r-- fs/btrfs/inode.c | 326
-rw-r--r-- fs/btrfs/ioctl.c | 35
-rw-r--r-- fs/btrfs/ordered-data.c | 6
-rw-r--r-- fs/btrfs/print-tree.c | 23
-rw-r--r-- fs/btrfs/props.c | 1
-rw-r--r-- fs/btrfs/reada.c | 268
-rw-r--r-- fs/btrfs/root-tree.c | 2
-rw-r--r-- fs/btrfs/scrub.c | 32
-rw-r--r-- fs/btrfs/send.c | 37
-rw-r--r-- fs/btrfs/super.c | 52
-rw-r--r-- fs/btrfs/tests/btrfs-tests.c | 9
-rw-r--r-- fs/btrfs/tests/free-space-tree-tests.c | 1
-rw-r--r-- fs/btrfs/tests/inode-tests.c | 1
-rw-r--r-- fs/btrfs/transaction.c | 13
-rw-r--r-- fs/btrfs/tree-log.c | 102
-rw-r--r-- fs/btrfs/tree-log.h | 2
-rw-r--r-- fs/btrfs/volumes.c | 51
-rw-r--r-- fs/btrfs/xattr.c | 67
-rw-r--r-- fs/buffer.c | 24
-rw-r--r-- fs/cachefiles/daemon.c | 13
-rw-r--r-- fs/cachefiles/interface.c | 11
-rw-r--r-- fs/cachefiles/internal.h | 4
-rw-r--r-- fs/cachefiles/namei.c | 28
-rw-r--r-- fs/ceph/addr.c | 324
-rw-r--r-- fs/ceph/caps.c | 11
-rw-r--r-- fs/ceph/dir.c | 69
-rw-r--r-- fs/ceph/export.c | 13
-rw-r--r-- fs/ceph/file.c | 15
-rw-r--r-- fs/ceph/inode.c | 53
-rw-r--r-- fs/ceph/mds_client.c | 7
-rw-r--r-- fs/ceph/snap.c | 16
-rw-r--r-- fs/ceph/super.c | 47
-rw-r--r-- fs/ceph/super.h | 23
-rw-r--r-- fs/ceph/xattr.c | 78
-rw-r--r-- fs/cifs/cifs_debug.c | 56
-rw-r--r-- fs/cifs/cifs_debug.h | 2
-rw-r--r-- fs/cifs/cifsencrypt.c | 32
-rw-r--r-- fs/cifs/cifsfs.c | 10
-rw-r--r-- fs/cifs/cifsglob.h | 4
-rw-r--r-- fs/cifs/smbencrypt.c | 26
-rw-r--r-- fs/compat_ioctl.c | 22
-rw-r--r-- fs/configfs/dir.c | 53
-rw-r--r-- fs/configfs/inode.c | 20
-rw-r--r-- fs/configfs/item.c | 1
-rw-r--r-- fs/coredump.c | 30
-rw-r--r-- fs/crypto/Kconfig | 18
-rw-r--r-- fs/crypto/Makefile | 3
-rw-r--r-- fs/crypto/crypto.c | 555
-rw-r--r-- fs/crypto/fname.c (renamed from fs/f2fs/crypto_fname.c) | 276
-rw-r--r-- fs/crypto/keyinfo.c | 272
-rw-r--r-- fs/crypto/policy.c | 229
-rw-r--r-- fs/dax.c | 9
-rw-r--r-- fs/dcache.c | 177
-rw-r--r-- fs/direct-io.c | 12
-rw-r--r-- fs/dlm/config.c | 41
-rw-r--r-- fs/dlm/lowcomms.c | 74
-rw-r--r-- fs/ecryptfs/crypto.c | 134
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h | 13
-rw-r--r-- fs/ecryptfs/inode.c | 12
-rw-r--r-- fs/ecryptfs/keystore.c | 224
-rw-r--r-- fs/ecryptfs/main.c | 1
-rw-r--r-- fs/ecryptfs/mmap.c | 1
-rw-r--r-- fs/ecryptfs/super.c | 1
-rw-r--r-- fs/eventfd.c | 42
-rw-r--r-- fs/eventpoll.c | 2
-rw-r--r-- fs/exec.c | 100
-rw-r--r-- fs/ext2/ext2.h | 3
-rw-r--r-- fs/ext2/super.c | 25
-rw-r--r-- fs/ext2/xattr.c | 139
-rw-r--r-- fs/ext2/xattr.h | 21
-rw-r--r-- fs/ext4/crypto.c | 24
-rw-r--r-- fs/ext4/crypto_fname.c | 32
-rw-r--r-- fs/ext4/crypto_key.c | 42
-rw-r--r-- fs/ext4/dir.c | 2
-rw-r--r-- fs/ext4/ext4.h | 75
-rw-r--r-- fs/ext4/ext4_crypto.h | 2
-rw-r--r-- fs/ext4/ext4_extents.h | 2
-rw-r--r-- fs/ext4/extents.c | 128
-rw-r--r-- fs/ext4/extents_status.c | 4
-rw-r--r-- fs/ext4/file.c | 129
-rw-r--r-- fs/ext4/ialloc.c | 2
-rw-r--r-- fs/ext4/indirect.c | 29
-rw-r--r-- fs/ext4/inline.c | 8
-rw-r--r-- fs/ext4/inode.c | 402
-rw-r--r-- fs/ext4/mballoc.c | 81
-rw-r--r-- fs/ext4/mballoc.h | 12
-rw-r--r-- fs/ext4/migrate.c | 2
-rw-r--r-- fs/ext4/mmp.c | 34
-rw-r--r-- fs/ext4/page-io.c | 14
-rw-r--r-- fs/ext4/super.c | 39
-rw-r--r-- fs/ext4/xattr.c | 166
-rw-r--r-- fs/ext4/xattr.h | 3
-rw-r--r-- fs/f2fs/Kconfig | 12
-rw-r--r-- fs/f2fs/Makefile | 2
-rw-r--r-- fs/f2fs/checkpoint.c | 77
-rw-r--r-- fs/f2fs/crypto.c | 491
-rw-r--r-- fs/f2fs/crypto_key.c | 254
-rw-r--r-- fs/f2fs/crypto_policy.c | 209
-rw-r--r-- fs/f2fs/data.c | 428
-rw-r--r-- fs/f2fs/dir.c | 94
-rw-r--r-- fs/f2fs/extent_cache.c | 176
-rw-r--r-- fs/f2fs/f2fs.h | 315
-rw-r--r-- fs/f2fs/f2fs_crypto.h | 151
-rw-r--r-- fs/f2fs/file.c | 114
-rw-r--r-- fs/f2fs/gc.c | 245
-rw-r--r-- fs/f2fs/inline.c | 43
-rw-r--r-- fs/f2fs/inode.c | 15
-rw-r--r-- fs/f2fs/namei.c | 168
-rw-r--r-- fs/f2fs/node.c | 223
-rw-r--r-- fs/f2fs/node.h | 26
-rw-r--r-- fs/f2fs/recovery.c | 14
-rw-r--r-- fs/f2fs/segment.c | 386
-rw-r--r-- fs/f2fs/segment.h | 5
-rw-r--r-- fs/f2fs/super.c | 204
-rw-r--r-- fs/f2fs/trace.c | 6
-rw-r--r-- fs/f2fs/xattr.c | 6
-rw-r--r-- fs/f2fs/xattr.h | 3
-rw-r--r-- fs/fat/Kconfig | 18
-rw-r--r-- fs/fat/inode.c | 4
-rw-r--r-- fs/fhandle.c | 2
-rw-r--r-- fs/fs-writeback.c | 37
-rw-r--r-- fs/fuse/cuse.c | 4
-rw-r--r-- fs/fuse/file.c | 56
-rw-r--r-- fs/fuse/fuse_i.h | 9
-rw-r--r-- fs/gfs2/aops.c | 2
-rw-r--r-- fs/gfs2/dir.c | 6
-rw-r--r-- fs/gfs2/export.c | 2
-rw-r--r-- fs/gfs2/glock.c | 10
-rw-r--r-- fs/gfs2/incore.h | 1
-rw-r--r-- fs/gfs2/inode.c | 71
-rw-r--r-- fs/gfs2/inode.h | 5
-rw-r--r-- fs/gfs2/ops_fstype.c | 2
-rw-r--r-- fs/gfs2/super.c | 26
-rw-r--r-- fs/jbd2/commit.c | 49
-rw-r--r-- fs/jbd2/journal.c | 43
-rw-r--r-- fs/jbd2/recovery.c | 31
-rw-r--r-- fs/jbd2/revoke.c | 60
-rw-r--r-- fs/jbd2/transaction.c | 22
-rw-r--r-- fs/jffs2/gc.c | 64
-rw-r--r-- fs/jffs2/jffs2_fs_sb.h | 2
-rw-r--r-- fs/jffs2/nodemgmt.c | 4
-rw-r--r-- fs/jffs2/wbuf.c | 6
-rw-r--r-- fs/kernfs/dir.c | 210
-rw-r--r-- fs/kernfs/mount.c | 69
-rw-r--r-- fs/mbcache.c | 1093
-rw-r--r-- fs/mpage.c | 3
-rw-r--r-- fs/namei.c | 311
-rw-r--r-- fs/nfs/blocklayout/blocklayout.c | 72
-rw-r--r-- fs/nfs/blocklayout/blocklayout.h | 14
-rw-r--r-- fs/nfs/blocklayout/dev.c | 144
-rw-r--r-- fs/nfs/blocklayout/extent_tree.c | 44
-rw-r--r-- fs/nfs/blocklayout/rpc_pipefs.c | 2
-rw-r--r-- fs/nfs/callback.h | 3
-rw-r--r-- fs/nfs/callback_proc.c | 69
-rw-r--r-- fs/nfs/callback_xdr.c | 12
-rw-r--r-- fs/nfs/dir.c | 12
-rw-r--r-- fs/nfs/file.c | 12
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2
-rw-r--r-- fs/nfs/inode.c | 2
-rw-r--r-- fs/nfs/internal.h | 6
-rw-r--r-- fs/nfs/nfs4file.c | 33
-rw-r--r-- fs/nfs/nfs4proc.c | 76
-rw-r--r-- fs/nfs/nfs4session.c | 54
-rw-r--r-- fs/nfs/nfs4session.h | 8
-rw-r--r-- fs/nfs/pnfs_nfs.c | 16
-rw-r--r-- fs/nfsd/Kconfig | 28
-rw-r--r-- fs/nfsd/Makefile | 4
-rw-r--r-- fs/nfsd/blocklayout.c | 298
-rw-r--r-- fs/nfsd/blocklayoutxdr.c | 77
-rw-r--r-- fs/nfsd/blocklayoutxdr.h | 14
-rw-r--r-- fs/nfsd/nfs3proc.c | 7
-rw-r--r-- fs/nfsd/nfs4layouts.c | 31
-rw-r--r-- fs/nfsd/nfs4proc.c | 9
-rw-r--r-- fs/nfsd/nfs4recover.c | 29
-rw-r--r-- fs/nfsd/nfs4state.c | 29
-rw-r--r-- fs/nfsd/nfs4xdr.c | 26
-rw-r--r-- fs/nfsd/pnfs.h | 8
-rw-r--r-- fs/nfsd/vfs.c | 4
-rw-r--r-- fs/nfsd/vfs.h | 19
-rw-r--r-- fs/nilfs2/page.c | 2
-rw-r--r-- fs/ocfs2/Makefile | 3
-rw-r--r-- fs/ocfs2/alloc.c | 105
-rw-r--r-- fs/ocfs2/aops.c | 1141
-rw-r--r-- fs/ocfs2/aops.h | 19
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 14
-rw-r--r-- fs/ocfs2/cluster/nodemanager.c | 22
-rw-r--r-- fs/ocfs2/dlm/dlmcommon.h | 26
-rw-r--r-- fs/ocfs2/dlm/dlmconvert.c | 30
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 13
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c | 127
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 41
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c | 13
-rw-r--r-- fs/ocfs2/file.c | 165
-rw-r--r-- fs/ocfs2/filecheck.c | 606
-rw-r--r-- fs/ocfs2/filecheck.h | 49
-rw-r--r-- fs/ocfs2/inode.c | 228
-rw-r--r-- fs/ocfs2/inode.h | 9
-rw-r--r-- fs/ocfs2/journal.c | 8
-rw-r--r-- fs/ocfs2/localalloc.c | 4
-rw-r--r-- fs/ocfs2/mmap.c | 4
-rw-r--r-- fs/ocfs2/ocfs2.h | 8
-rw-r--r-- fs/ocfs2/ocfs2_trace.h | 20
-rw-r--r-- fs/ocfs2/quota_global.c | 27
-rw-r--r-- fs/ocfs2/resize.c | 2
-rw-r--r-- fs/ocfs2/stackglue.c | 3
-rw-r--r-- fs/ocfs2/stackglue.h | 2
-rw-r--r-- fs/ocfs2/super.c | 49
-rw-r--r-- fs/ocfs2/super.h | 2
-rw-r--r-- fs/open.c | 6
-rw-r--r-- fs/orangefs/Kconfig | 6
-rw-r--r-- fs/orangefs/Makefile | 10
-rw-r--r-- fs/orangefs/acl.c | 175
-rw-r--r-- fs/orangefs/dcache.c | 138
-rw-r--r-- fs/orangefs/devorangefs-req.c | 943
-rw-r--r-- fs/orangefs/dir.c | 400
-rw-r--r-- fs/orangefs/downcall.h | 133
-rw-r--r-- fs/orangefs/file.c | 717
-rw-r--r-- fs/orangefs/inode.c | 475
-rw-r--r-- fs/orangefs/namei.c | 462
-rw-r--r-- fs/orangefs/orangefs-bufmap.c | 556
-rw-r--r-- fs/orangefs/orangefs-bufmap.h | 36
-rw-r--r-- fs/orangefs/orangefs-cache.c | 161
-rw-r--r-- fs/orangefs/orangefs-debug.h | 92
-rw-r--r-- fs/orangefs/orangefs-debugfs.c | 455
-rw-r--r-- fs/orangefs/orangefs-debugfs.h | 3
-rw-r--r-- fs/orangefs/orangefs-dev-proto.h | 62
-rw-r--r-- fs/orangefs/orangefs-kernel.h | 623
-rw-r--r-- fs/orangefs/orangefs-mod.c | 293
-rw-r--r-- fs/orangefs/orangefs-sysfs.c | 1772
-rw-r--r-- fs/orangefs/orangefs-sysfs.h | 2
-rw-r--r-- fs/orangefs/orangefs-utils.c | 1048
-rw-r--r-- fs/orangefs/protocol.h | 452
-rw-r--r-- fs/orangefs/super.c | 559
-rw-r--r-- fs/orangefs/symlink.c | 19
-rw-r--r-- fs/orangefs/upcall.h | 246
-rw-r--r-- fs/orangefs/waitqueue.c | 357
-rw-r--r-- fs/orangefs/xattr.c | 545
-rw-r--r-- fs/overlayfs/copy_up.c | 35
-rw-r--r-- fs/overlayfs/dir.c | 61
-rw-r--r-- fs/overlayfs/overlayfs.h | 1
-rw-r--r-- fs/overlayfs/readdir.c | 53
-rw-r--r-- fs/overlayfs/super.c | 18
-rw-r--r-- fs/proc/base.c | 71
-rw-r--r-- fs/proc/meminfo.c | 31
-rw-r--r-- fs/proc/namespaces.c | 3
-rw-r--r-- fs/proc/page.c | 8
-rw-r--r-- fs/proc/task_mmu.c | 14
-rw-r--r-- fs/proc/vmcore.c | 7
-rw-r--r-- fs/proc_namespace.c | 2
-rw-r--r-- fs/pstore/ram.c | 4
-rw-r--r-- fs/quota/dquot.c | 58
-rw-r--r-- fs/quota/quota.c | 70
-rw-r--r-- fs/quota/quota_tree.c | 67
-rw-r--r-- fs/quota/quota_v2.c | 6
-rw-r--r-- fs/read_write.c | 197
-rw-r--r-- fs/reiserfs/super.c | 1
-rw-r--r-- fs/select.c | 8
-rw-r--r-- fs/splice.c | 5
-rw-r--r-- fs/ubifs/Makefile | 1
-rw-r--r-- fs/ubifs/misc.c | 57
-rw-r--r-- fs/ubifs/ubifs.h | 41
-rw-r--r-- fs/ubifs/xattr.c | 1
-rw-r--r-- fs/udf/dir.c | 13
-rw-r--r-- fs/udf/namei.c | 29
-rw-r--r-- fs/udf/super.c | 38
-rw-r--r-- fs/udf/udfdecl.h | 21
-rw-r--r-- fs/udf/unicode.c | 630
-rw-r--r-- fs/xfs/Makefile | 3
-rw-r--r-- fs/xfs/libxfs/xfs_alloc_btree.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_attr_sf.h | 16
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c | 172
-rw-r--r-- fs/xfs/libxfs/xfs_bmap_btree.c | 5
-rw-r--r-- fs/xfs/libxfs/xfs_btree.c | 32
-rw-r--r-- fs/xfs/libxfs/xfs_da_format.h | 16
-rw-r--r-- fs/xfs/libxfs/xfs_dir2.c | 12
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_node.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.c | 12
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.c | 170
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.h | 38
-rw-r--r-- fs/xfs/libxfs/xfs_inode_fork.c | 3
-rw-r--r-- fs/xfs/libxfs/xfs_log_format.h | 19
-rw-r--r-- fs/xfs/libxfs/xfs_quota_defs.h | 3
-rw-r--r-- fs/xfs/libxfs/xfs_rtbitmap.c | 32
-rw-r--r-- fs/xfs/libxfs/xfs_sb.h | 1
-rw-r--r-- fs/xfs/libxfs/xfs_shared.h | 1
-rw-r--r-- fs/xfs/xfs_aops.c | 1027
-rw-r--r-- fs/xfs/xfs_aops.h | 4
-rw-r--r-- fs/xfs/xfs_attr_list.c | 19
-rw-r--r-- fs/xfs/xfs_bmap_util.c | 8
-rw-r--r-- fs/xfs/xfs_buf.c | 2
-rw-r--r-- fs/xfs/xfs_buf.h | 26
-rw-r--r-- fs/xfs/xfs_buf_item.c | 10
-rw-r--r-- fs/xfs/xfs_dir2_readdir.c | 2
-rw-r--r-- fs/xfs/xfs_discard.c | 2
-rw-r--r-- fs/xfs/xfs_dquot.c | 129
-rw-r--r-- fs/xfs/xfs_export.c | 4
-rw-r--r-- fs/xfs/xfs_file.c | 88
-rw-r--r-- fs/xfs/xfs_filestream.c | 4
-rw-r--r-- fs/xfs/xfs_fsops.h | 1
-rw-r--r-- fs/xfs/xfs_icache.c | 43
-rw-r--r-- fs/xfs/xfs_inode.c | 174
-rw-r--r-- fs/xfs/xfs_inode.h | 10
-rw-r--r-- fs/xfs/xfs_inode_item.c | 82
-rw-r--r-- fs/xfs/xfs_ioctl.c | 121
-rw-r--r-- fs/xfs/xfs_iops.c | 59
-rw-r--r-- fs/xfs/xfs_itable.c | 22
-rw-r--r-- fs/xfs/xfs_log.c | 152
-rw-r--r-- fs/xfs/xfs_log_recover.c | 97
-rw-r--r-- fs/xfs/xfs_mount.c | 24
-rw-r--r-- fs/xfs/xfs_mount.h | 31
-rw-r--r-- fs/xfs/xfs_ondisk.h | 117
-rw-r--r-- fs/xfs/xfs_pnfs.h | 2
-rw-r--r-- fs/xfs/xfs_qm.c | 55
-rw-r--r-- fs/xfs/xfs_qm.h | 48
-rw-r--r-- fs/xfs/xfs_qm_syscalls.c | 27
-rw-r--r-- fs/xfs/xfs_quotaops.c | 36
-rw-r--r-- fs/xfs/xfs_rtalloc.c | 2
-rw-r--r-- fs/xfs/xfs_super.c | 528
-rw-r--r-- fs/xfs/xfs_super.h | 4
-rw-r--r-- fs/xfs/xfs_sysfs.c | 78
-rw-r--r-- fs/xfs/xfs_trace.h | 9
-rw-r--r-- fs/xfs/xfs_trans.c | 4
-rw-r--r-- fs/xfs/xfs_trans.h | 1
-rw-r--r-- fs/xfs/xfs_trans_ail.c | 5
-rw-r--r-- fs/xfs/xfs_trans_buf.c | 10
-rw-r--r-- fs/xfs/xfs_trans_dquot.c | 15
-rw-r--r-- fs/xfs/xfs_trans_inode.c | 14
356 files changed, 23417 insertions, 9241 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 9adee0d7536e..6725f59c18e6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -84,6 +84,8 @@ config MANDATORY_FILE_LOCKING
 
 	  To the best of my knowledge this is dead code that no one cares about.
 
+source "fs/crypto/Kconfig"
+
 source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
@@ -207,6 +209,7 @@ menuconfig MISC_FILESYSTEMS
 
 if MISC_FILESYSTEMS
 
+source "fs/orangefs/Kconfig"
 source "fs/adfs/Kconfig"
 source "fs/affs/Kconfig"
 source "fs/ecryptfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 79f522575cba..85b6e13b62d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
30obj-$(CONFIG_USERFAULTFD) += userfaultfd.o 30obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
31obj-$(CONFIG_AIO) += aio.o 31obj-$(CONFIG_AIO) += aio.o
32obj-$(CONFIG_FS_DAX) += dax.o 32obj-$(CONFIG_FS_DAX) += dax.o
33obj-$(CONFIG_FS_ENCRYPTION) += crypto/
33obj-$(CONFIG_FILE_LOCKING) += locks.o 34obj-$(CONFIG_FILE_LOCKING) += locks.o
34obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o 35obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
35obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o 36obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
@@ -105,6 +106,7 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4/
105obj-$(CONFIG_ADFS_FS) += adfs/ 106obj-$(CONFIG_ADFS_FS) += adfs/
106obj-$(CONFIG_FUSE_FS) += fuse/ 107obj-$(CONFIG_FUSE_FS) += fuse/
107obj-$(CONFIG_OVERLAY_FS) += overlayfs/ 108obj-$(CONFIG_OVERLAY_FS) += overlayfs/
109obj-$(CONFIG_ORANGEFS_FS) += orangefs/
108obj-$(CONFIG_UDF_FS) += udf/ 110obj-$(CONFIG_UDF_FS) += udf/
109obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ 111obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
110obj-$(CONFIG_OMFS_FS) += omfs/ 112obj-$(CONFIG_OMFS_FS) += omfs/
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c37149b929be..f0d268b97d19 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -1,15 +1,11 @@
-/* -*- c -*- ------------------------------------------------------------- *
- *
- * linux/fs/autofs/autofs_i.h
- *
+/*
  * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
  * Copyright 2005-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
 
 /* Internal header file for autofs */
 
@@ -35,28 +31,23 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 /* #define DEBUG */
 
-#define DPRINTK(fmt, ...)				\
-	pr_debug("pid %d: %s: " fmt "\n",		\
-		current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_WARN(fmt, ...)				\
-	printk(KERN_WARNING "pid %d: %s: " fmt "\n",	\
-		current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_ERROR(fmt, ...)				\
-	printk(KERN_ERR "pid %d: %s: " fmt "\n",	\
-		current->pid, __func__, ##__VA_ARGS__)
-
-/* Unified info structure. This is pointed to by both the dentry and
-   inode structures. Each file in the filesystem has an instance of this
-   structure. It holds a reference to the dentry, so dentries are never
-   flushed while the file exists. All name lookups are dealt with at the
-   dentry level, although the filesystem can interfere in the validation
-   process. Readdir is implemented by traversing the dentry lists. */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
+
+/*
+ * Unified info structure. This is pointed to by both the dentry and
+ * inode structures. Each file in the filesystem has an instance of this
+ * structure. It holds a reference to the dentry, so dentries are never
+ * flushed while the file exists. All name lookups are dealt with at the
+ * dentry level, although the filesystem can interfere in the validation
+ * process. Readdir is implemented by traversing the dentry lists.
+ */
 struct autofs_info {
 	struct dentry *dentry;
 	struct inode *inode;
@@ -78,7 +69,7 @@ struct autofs_info {
 	kgid_t gid;
 };
 
-#define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_EXPIRING	(1<<0) /* dentry in the process of expiring */
 #define AUTOFS_INF_NO_RCU	(1<<1) /* the dentry is being considered
 					* for expiry, so RCU_walk is
 					* not permitted
@@ -140,10 +131,11 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
 }
 
 /* autofs4_oz_mode(): do we see the man behind the curtain? (The
- processes which do manipulations for us in user space sees the raw
- filesystem without "magic".) */
-
-static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
+ * processes which do manipulations for us in user space sees the raw
+ * filesystem without "magic".)
+ */
+static inline int autofs4_oz_mode(struct autofs_sb_info *sbi)
+{
 	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
 }
 
@@ -154,12 +146,12 @@ void autofs4_free_ino(struct autofs_info *);
 int is_autofs4_dentry(struct dentry *);
 int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
-			struct autofs_sb_info *,
-			struct autofs_packet_expire __user *);
+		       struct autofs_sb_info *,
+		       struct autofs_packet_expire __user *);
 int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-			struct autofs_sb_info *sbi, int when);
+			    struct autofs_sb_info *sbi, int when);
 int autofs4_expire_multi(struct super_block *, struct vfsmount *,
-			struct autofs_sb_info *, int __user *);
+			 struct autofs_sb_info *, int __user *);
 struct dentry *autofs4_expire_direct(struct super_block *sb,
 				     struct vfsmount *mnt,
 				     struct autofs_sb_info *sbi, int how);
@@ -224,8 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
 
 /* Queue management functions */
 
-int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
-int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
+int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
 
 static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
@@ -242,37 +234,37 @@ static inline void __autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
 	if (ino) {
 		if (list_empty(&ino->expiring))
 			list_add(&ino->expiring, &sbi->expiring_list);
 	}
-	return;
 }
 
 static inline void autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		if (list_empty(&ino->expiring))
 			list_add(&ino->expiring, &sbi->expiring_list);
 		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 static inline void autofs4_del_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		if (!list_empty(&ino->expiring))
 			list_del_init(&ino->expiring);
		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ac7d921ed984..c7fcc7438843 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -72,13 +72,13 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
 {
 	int err = 0;
 
-	if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
-	    (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
-		AUTOFS_WARN("ioctl control interface version mismatch: "
-			"kernel(%u.%u), user(%u.%u), cmd(%d)",
+	if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
+	    (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
+		pr_warn("ioctl control interface version mismatch: "
+			"kernel(%u.%u), user(%u.%u), cmd(%d)\n",
 			AUTOFS_DEV_IOCTL_VERSION_MAJOR,
 			AUTOFS_DEV_IOCTL_VERSION_MINOR,
 			param->ver_major, param->ver_minor, cmd);
 		err = -EINVAL;
 	}
 
@@ -93,7 +93,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
  * Copy parameter control struct, including a possible path allocated
  * at the end of the struct.
  */
-static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+static struct autofs_dev_ioctl *
+		copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 {
 	struct autofs_dev_ioctl tmp, *res;
 
@@ -116,7 +117,6 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
 {
 	kfree(param);
-	return;
 }
 
 /*
@@ -129,24 +129,24 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 
 	err = check_dev_ioctl_version(cmd, param);
 	if (err) {
-		AUTOFS_WARN("invalid device control module version "
-			"supplied for cmd(0x%08x)", cmd);
+		pr_warn("invalid device control module version "
+			"supplied for cmd(0x%08x)\n", cmd);
 		goto out;
 	}
 
 	if (param->size > sizeof(*param)) {
 		err = invalid_str(param->path, param->size - sizeof(*param));
 		if (err) {
-			AUTOFS_WARN(
-			  "path string terminator missing for cmd(0x%08x)",
+			pr_warn(
+			  "path string terminator missing for cmd(0x%08x)\n",
 			  cmd);
 			goto out;
 		}
 
 		err = check_name(param->path);
 		if (err) {
-			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
+			pr_warn("invalid path supplied for cmd(0x%08x)\n",
 				cmd);
 			goto out;
 		}
 	}
@@ -197,7 +197,9 @@ static int find_autofs_mount(const char *pathname,
 			     void *data)
 {
 	struct path path;
-	int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+	int err;
+
+	err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
 	if (err)
 		return err;
 	err = -ENOENT;
@@ -225,6 +227,7 @@ static int test_by_dev(struct path *path, void *p)
 static int test_by_type(struct path *path, void *p)
 {
 	struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+
 	return ino && ino->sbi->type & *(unsigned *)p;
 }
 
@@ -370,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 	new_pid = get_task_pid(current, PIDTYPE_PGID);
 
 	if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
-		AUTOFS_WARN("Not allowed to change PID namespace");
+		pr_warn("not allowed to change PID namespace\n");
 		err = -EINVAL;
 		goto out;
 	}
@@ -456,8 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		err = 0;
 		autofs4_expire_wait(path.dentry, 0);
 		spin_lock(&sbi->fs_lock);
-		param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
-		param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
+		param->requester.uid =
+			from_kuid_munged(current_user_ns(), ino->uid);
+		param->requester.gid =
+			from_kgid_munged(current_user_ns(), ino->gid);
 		spin_unlock(&sbi->fs_lock);
 	}
 	path_put(&path);
@@ -619,7 +624,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
 }
 
 /* ioctl dispatcher */
-static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+static int _autofs_dev_ioctl(unsigned int command,
+			     struct autofs_dev_ioctl __user *user)
 {
 	struct autofs_dev_ioctl *param;
 	struct file *fp;
@@ -655,7 +661,7 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
 
 	fn = lookup_dev_ioctl(cmd);
 	if (!fn) {
-		AUTOFS_WARN("unknown command 0x%08x", command);
+		pr_warn("unknown command 0x%08x\n", command);
 		return -ENOTTY;
 	}
 
@@ -711,6 +717,7 @@ out:
 static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
 {
 	int err;
+
 	err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
 	return (long) err;
 }
@@ -733,8 +740,8 @@ static const struct file_operations _dev_ioctl_fops = {
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
 	.minor		= AUTOFS_MINOR,
-	.name		= AUTOFS_DEVICE_NAME,
-	.fops		= &_dev_ioctl_fops
+	.name		= AUTOFS_DEVICE_NAME,
+	.fops		= &_dev_ioctl_fops
 };
 
 MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
@@ -747,7 +754,7 @@ int __init autofs_dev_ioctl_init(void)
 
 	r = misc_register(&_autofs_dev_ioctl_misc);
 	if (r) {
-		AUTOFS_ERROR("misc_register failed for control device");
+		pr_err("misc_register failed for control device\n");
 		return r;
 	}
 
@@ -757,6 +764,4 @@ int __init autofs_dev_ioctl_init(void)
 void autofs_dev_ioctl_exit(void)
 {
 	misc_deregister(&_autofs_dev_ioctl_misc);
-	return;
 }
-
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1cebc3c52fa5..9510d8d2e9cd 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/expire.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include "autofs_i.h"
 
@@ -18,7 +14,7 @@ static unsigned long now;
 
 /* Check if a dentry can be expired */
 static inline int autofs4_can_expire(struct dentry *dentry,
-				     unsigned long timeout, int do_now)
+				     unsigned long timeout, int do_now)
 {
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 
@@ -41,7 +37,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	struct path path = {.mnt = mnt, .dentry = dentry};
 	int status = 1;
 
-	DPRINTK("dentry %p %pd", dentry, dentry);
+	pr_debug("dentry %p %pd\n", dentry, dentry);
 
 	path_get(&path);
 
@@ -58,14 +54,16 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 
 	/* Update the expiry counter if fs is busy */
 	if (!may_umount_tree(path.mnt)) {
-		struct autofs_info *ino = autofs4_dentry_ino(top);
+		struct autofs_info *ino;
+
+		ino = autofs4_dentry_ino(top);
 		ino->last_used = jiffies;
 		goto done;
 	}
 
 	status = 0;
 done:
-	DPRINTK("returning = %d", status);
+	pr_debug("returning = %d\n", status);
 	path_put(&path);
 	return status;
 }
@@ -74,7 +72,7 @@ done:
  * Calculate and dget next entry in the subdirs list under root.
 */
 static struct dentry *get_next_positive_subdir(struct dentry *prev,
-						struct dentry *root)
+					       struct dentry *root)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
@@ -121,7 +119,7 @@ cont:
  * Calculate and dget next entry in top down tree traversal.
 */
 static struct dentry *get_next_positive_dentry(struct dentry *prev,
-						struct dentry *root)
+					       struct dentry *root)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
@@ -187,15 +185,17 @@ again:
 * autofs submounts.
 */
 static int autofs4_direct_busy(struct vfsmount *mnt,
-				struct dentry *top,
-				unsigned long timeout,
-				int do_now)
+			       struct dentry *top,
+			       unsigned long timeout,
+			       int do_now)
 {
-	DPRINTK("top %p %pd", top, top);
+	pr_debug("top %p %pd\n", top, top);
 
 	/* If it's busy update the expiry counters */
 	if (!may_umount_tree(mnt)) {
-		struct autofs_info *ino = autofs4_dentry_ino(top);
+		struct autofs_info *ino;
+
+		ino = autofs4_dentry_ino(top);
 		if (ino)
 			ino->last_used = jiffies;
 		return 1;
@@ -208,7 +208,8 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
 	return 0;
 }
 
-/* Check a directory tree of mount points for busyness
+/*
+ * Check a directory tree of mount points for busyness
 * The tree is not busy iff no mountpoints are busy
 */
 static int autofs4_tree_busy(struct vfsmount *mnt,
@@ -219,7 +220,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 	struct autofs_info *top_ino = autofs4_dentry_ino(top);
 	struct dentry *p;
 
-	DPRINTK("top %p %pd", top, top);
+	pr_debug("top %p %pd\n", top, top);
 
 	/* Negative dentry - give up */
 	if (!simple_positive(top))
@@ -227,7 +228,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 
 	p = NULL;
 	while ((p = get_next_positive_dentry(p, top))) {
-		DPRINTK("dentry %p %pd", p, p);
+		pr_debug("dentry %p %pd\n", p, p);
 
 		/*
 		 * Is someone visiting anywhere in the subtree ?
@@ -273,11 +274,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
 {
 	struct dentry *p;
 
-	DPRINTK("parent %p %pd", parent, parent);
+	pr_debug("parent %p %pd\n", parent, parent);
 
 	p = NULL;
 	while ((p = get_next_positive_dentry(p, parent))) {
-		DPRINTK("dentry %p %pd", p, p);
+		pr_debug("dentry %p %pd\n", p, p);
 
 		if (d_mountpoint(p)) {
 			/* Can we umount this guy */
@@ -362,7 +363,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 	 * offset (autofs-5.0+).
 	 */
 	if (d_mountpoint(dentry)) {
-		DPRINTK("checking mountpoint %p %pd", dentry, dentry);
+		pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
 
 		/* Can we umount this guy */
 		if (autofs4_mount_busy(mnt, dentry))
@@ -375,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 	}
 
 	if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
-		DPRINTK("checking symlink %p %pd", dentry, dentry);
+		pr_debug("checking symlink %p %pd\n", dentry, dentry);
 		/*
 		 * A symlink can't be "busy" in the usual sense so
 		 * just check last used for expire timeout.
@@ -404,6 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 	} else {
 		/* Path walk currently on this dentry? */
 		struct dentry *expired;
+
 		ino_count = atomic_read(&ino->count) + 1;
 		if (d_count(dentry) > ino_count)
 			return NULL;
@@ -471,7 +473,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	return NULL;
 
 found:
-	DPRINTK("returning %p %pd", expired, expired);
+	pr_debug("returning %p %pd\n", expired, expired);
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	smp_mb();
 	ino->flags &= ~AUTOFS_INF_NO_RCU;
@@ -503,12 +505,12 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 	if (ino->flags & AUTOFS_INF_EXPIRING) {
 		spin_unlock(&sbi->fs_lock);
 
-		DPRINTK("waiting for expire %p name=%pd", dentry, dentry);
+		pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
 
 		status = autofs4_wait(sbi, dentry, NFY_NONE);
 		wait_for_completion(&ino->expire_complete);
 
-		DPRINTK("expire done status=%d", status);
+		pr_debug("expire done status=%d\n", status);
 
 		if (d_unhashed(dentry))
 			return -EAGAIN;
@@ -522,21 +524,22 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 
 /* Perform an expiry operation */
 int autofs4_expire_run(struct super_block *sb,
-			struct vfsmount *mnt,
-			struct autofs_sb_info *sbi,
-			struct autofs_packet_expire __user *pkt_p)
+		       struct vfsmount *mnt,
+		       struct autofs_sb_info *sbi,
+		       struct autofs_packet_expire __user *pkt_p)
 {
 	struct autofs_packet_expire pkt;
 	struct autofs_info *ino;
 	struct dentry *dentry;
 	int ret = 0;
 
-	memset(&pkt,0,sizeof pkt);
+	memset(&pkt, 0, sizeof(pkt));
 
 	pkt.hdr.proto_version = sbi->version;
 	pkt.hdr.type = autofs_ptype_expire;
 
-	if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL)
+	dentry = autofs4_expire_indirect(sb, mnt, sbi, 0);
+	if (!dentry)
 		return -EAGAIN;
 
 	pkt.len = dentry->d_name.len;
@@ -544,7 +547,7 @@ int autofs4_expire_run(struct super_block *sb,
 	pkt.name[pkt.len] = '\0';
 	dput(dentry);
 
-	if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
 		ret = -EFAULT;
 
 	spin_lock(&sbi->fs_lock);
@@ -573,7 +576,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 
 	/* This is synchronous because it makes the daemon a
-	   little easier */
+	 * little easier
+	 */
 	ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
 
 	spin_lock(&sbi->fs_lock);
@@ -588,8 +592,10 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	return ret;
 }
 
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
-   more to be done */
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more to be done.
+ */
 int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			struct autofs_sb_info *sbi, int __user *arg)
 {
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index b3db517e89ec..8cf0e63389ae 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include <linux/module.h>
 #include <linux/init.h>
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a3ae0b2aeb5a..61b21051bd5a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -24,7 +20,9 @@
 
 struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
 {
-	struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+	struct autofs_info *ino;
+
+	ino = kzalloc(sizeof(*ino), GFP_KERNEL);
 	if (ino) {
 		INIT_LIST_HEAD(&ino->active);
 		INIT_LIST_HEAD(&ino->expiring);
@@ -62,7 +60,7 @@ void autofs4_kill_sb(struct super_block *sb)
 		put_pid(sbi->oz_pgrp);
 	}
 
-	DPRINTK("shutting down");
+	pr_debug("shutting down\n");
 	kill_litter_super(sb);
 	if (sbi)
 		kfree_rcu(sbi, rcu);
@@ -94,7 +92,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
 		seq_printf(m, ",direct");
 	else
 		seq_printf(m, ",indirect");
-
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	if (sbi->pipe)
+		seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
+	else
+		seq_printf(m, ",pipe_ino=-1");
+#endif
 	return 0;
 }
 
@@ -147,6 +150,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
+
 		if (!*p)
 			continue;
 
@@ -204,9 +208,9 @@
 
 int autofs4_fill_super(struct super_block *s, void *data, int silent)
 {
-	struct inode * root_inode;
-	struct dentry * root;
-	struct file * pipe;
+	struct inode *root_inode;
+	struct dentry *root;
+	struct file *pipe;
 	int pipefd;
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
@@ -217,7 +221,7 @@
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
-	DPRINTK("starting up, sbi = %p",sbi);
+	pr_debug("starting up, sbi = %p\n", sbi);
 
 	s->s_fs_info = sbi;
 	sbi->magic = AUTOFS_SBI_MAGIC;
@@ -266,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
 			  &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
 			  &sbi->max_proto)) {
-		printk("autofs: called with bogus options\n");
+		pr_err("called with bogus options\n");
 		goto fail_dput;
 	}
 
 	if (pgrp_set) {
 		sbi->oz_pgrp = find_get_pid(pgrp);
 		if (!sbi->oz_pgrp) {
-			pr_warn("autofs: could not find process group %d\n",
+			pr_err("could not find process group %d\n",
 				pgrp);
 			goto fail_dput;
 		}
@@ -290,10 +294,10 @@
 	/* Couldn't this be tested earlier? */
 	if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
 	    sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
-		printk("autofs: kernel does not match daemon version "
+		pr_err("kernel does not match daemon version "
 		       "daemon (%d, %d) kernel (%d, %d)\n",
 		       sbi->min_proto, sbi->max_proto,
 		       AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
 		goto fail_dput;
 	}
 
@@ -304,11 +308,11 @@
 	sbi->version = sbi->max_proto;
 	sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
 
-	DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
+	pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
 	pipe = fget(pipefd);
 
 	if (!pipe) {
-		printk("autofs: could not open pipe file descriptor\n");
+		pr_err("could not open pipe file descriptor\n");
 		goto fail_dput;
 	}
 	ret = autofs_prepare_pipe(pipe);
@@ -323,12 +327,12 @@
 	 */
 	s->s_root = root;
 	return 0;
 
 	/*
 	 * Failure ... clean up.
 	 */
fail_fput:
-	printk("autofs: pipe file descriptor does not contain proper ops\n");
+	pr_err("pipe file descriptor does not contain proper ops\n");
 	fput(pipe);
 	/* fall through */
fail_dput:
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c6d7d3dbd52a..7ab923940d18 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include <linux/capability.h>
 #include <linux/errno.h>
@@ -23,16 +19,18 @@
 
 #include "autofs_i.h"
 
-static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
-static int autofs4_dir_unlink(struct inode *,struct dentry *);
-static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
-static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs4_dir_unlink(struct inode *, struct dentry *);
+static int autofs4_dir_rmdir(struct inode *, struct dentry *);
+static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long);
 #ifdef CONFIG_COMPAT
-static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs4_root_compat_ioctl(struct file *,
+				      unsigned int, unsigned long);
 #endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
+static struct dentry *autofs4_lookup(struct inode *,
+				     struct dentry *, unsigned int);
 static struct vfsmount *autofs4_d_automount(struct path *);
 static int autofs4_d_manage(struct dentry *, bool);
 static void autofs4_dentry_release(struct dentry *);
@@ -74,7 +72,9 @@ const struct dentry_operations autofs4_dentry_operations = {
 static void autofs4_add_active(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_info *ino;
+
+	ino = autofs4_dentry_ino(dentry);
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		if (!ino->active_count) {
@@ -84,13 +84,14 @@ static void autofs4_add_active(struct dentry *dentry)
 		ino->active_count++;
 		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 static void autofs4_del_active(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_info *ino;
+
+	ino = autofs4_dentry_ino(dentry);
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		ino->active_count--;
@@ -100,7 +101,6 @@ static void autofs4_del_active(struct dentry *dentry)
 		}
 		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 static int autofs4_dir_open(struct inode *inode, struct file *file)
@@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	struct dentry *dentry = file->f_path.dentry;
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
-	DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry);
+	pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
 
 	if (autofs4_oz_mode(sbi))
 		goto out;
@@ -138,7 +138,7 @@ static void autofs4_dentry_release(struct dentry *de)
 	struct autofs_info *ino = autofs4_dentry_ino(de);
 	struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
 
-	DPRINTK("releasing %p", de);
+	pr_debug("releasing %p\n", de);
 
 	if (!ino)
 		return;
@@ -278,9 +278,9 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
 	if (ino->flags & AUTOFS_INF_PENDING) {
 		if (rcu_walk)
 			return -ECHILD;
-		DPRINTK("waiting for mount name=%pd", dentry);
+		pr_debug("waiting for mount name=%pd\n", dentry);
 		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-		DPRINTK("mount wait done status=%d", status);
+		pr_debug("mount wait done status=%d\n", status);
 	}
 	ino->last_used = jiffies;
 	return status;
@@ -320,7 +320,9 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
 	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
 		struct dentry *parent = dentry->d_parent;
 		struct autofs_info *ino;
-		struct dentry *new = d_lookup(parent, &dentry->d_name);
+		struct dentry *new;
+
+		new = d_lookup(parent, &dentry->d_name);
 		if (!new)
 			return NULL;
 		ino = autofs4_dentry_ino(new);
@@ -338,7 +340,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	int status;
 
-	DPRINTK("dentry=%p %pd", dentry, dentry);
+	pr_debug("dentry=%p %pd\n", dentry, dentry);
 
 	/* The daemon never triggers a mount. */
 	if (autofs4_oz_mode(sbi))
@@ -425,7 +427,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	int status;
 
-	DPRINTK("dentry=%p %pd", dentry, dentry);
+	pr_debug("dentry=%p %pd\n", dentry, dentry);
 
 	/* The daemon never waits. */
 	if (autofs4_oz_mode(sbi)) {
@@ -455,6 +457,7 @@
 		 * a mount-trap.
 		 */
 		struct inode *inode;
+
 		if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
 			return 0;
 		if (d_mountpoint(dentry))
@@ -494,13 +497,14 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 }
 
 /* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+static struct dentry *autofs4_lookup(struct inode *dir,
+				     struct dentry *dentry, unsigned int flags)
 {
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
 	struct dentry *active;
 
-	DPRINTK("name = %pd", dentry);
+	pr_debug("name = %pd\n", dentry);
 
 	/* File name too long to exist */
 	if (dentry->d_name.len > NAME_MAX)
@@ -508,14 +512,14 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 
 	sbi = autofs4_sbi(dir->i_sb);
 
-	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
-		current->pid, task_pgrp_nr(current), sbi->catatonic,
-		autofs4_oz_mode(sbi));
+	pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
+		 current->pid, task_pgrp_nr(current), sbi->catatonic,
+		 autofs4_oz_mode(sbi));
 
 	active = autofs4_lookup_active(dentry);
-	if (active) {
+	if (active)
 		return active;
-	} else {
+	else {
 		/*
 		 * A dentry that is not within the root can never trigger a
 		 * mount operation, unless the directory already exists, so we
@@ -526,7 +530,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 			return ERR_PTR(-ENOENT);
 
 		/* Mark entries in the root as mount triggers */
-		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
+		if (IS_ROOT(dentry->d_parent) &&
+		    autofs_type_indirect(sbi->type))
 			__managed_dentry_set_managed(dentry);
 
 		ino = autofs4_new_ino(sbi);
@@ -537,8 +542,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 		ino->dentry = dentry;
 
 		autofs4_add_active(dentry);
-
-		d_instantiate(dentry, NULL);
 	}
 	return NULL;
 }
@@ -554,7 +557,7 @@ static int autofs4_dir_symlink(struct inode *dir,
554 size_t size = strlen(symname); 557 size_t size = strlen(symname);
555 char *cp; 558 char *cp;
556 559
557 DPRINTK("%s <- %pd", symname, dentry); 560 pr_debug("%s <- %pd\n", symname, dentry);
558 561
559 if (!autofs4_oz_mode(sbi)) 562 if (!autofs4_oz_mode(sbi))
560 return -EACCES; 563 return -EACCES;
@@ -613,7 +616,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
613 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 616 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
614 struct autofs_info *ino = autofs4_dentry_ino(dentry); 617 struct autofs_info *ino = autofs4_dentry_ino(dentry);
615 struct autofs_info *p_ino; 618 struct autofs_info *p_ino;
616 619
617 /* This allows root to remove symlinks */ 620 /* This allows root to remove symlinks */
618 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) 621 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
619 return -EPERM; 622 return -EPERM;
@@ -664,7 +667,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry)
664 if (IS_ROOT(parent->d_parent)) 667 if (IS_ROOT(parent->d_parent))
665 return; 668 return;
666 managed_dentry_clear_managed(parent); 669 managed_dentry_clear_managed(parent);
667 return;
668} 670}
669 671
670static void autofs_clear_leaf_automount_flags(struct dentry *dentry) 672static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
@@ -687,7 +689,6 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
687 if (d_child->next == &parent->d_subdirs && 689 if (d_child->next == &parent->d_subdirs &&
688 d_child->prev == &parent->d_subdirs) 690 d_child->prev == &parent->d_subdirs)
689 managed_dentry_set_managed(parent); 691 managed_dentry_set_managed(parent);
690 return;
691} 692}
692 693
693static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) 694static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
@@ -695,8 +696,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
695 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 696 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
696 struct autofs_info *ino = autofs4_dentry_ino(dentry); 697 struct autofs_info *ino = autofs4_dentry_ino(dentry);
697 struct autofs_info *p_ino; 698 struct autofs_info *p_ino;
698 699
699 DPRINTK("dentry %p, removing %pd", dentry, dentry); 700 pr_debug("dentry %p, removing %pd\n", dentry, dentry);
700 701
701 if (!autofs4_oz_mode(sbi)) 702 if (!autofs4_oz_mode(sbi))
702 return -EACCES; 703 return -EACCES;
@@ -728,7 +729,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
728 return 0; 729 return 0;
729} 730}
730 731
731static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 732static int autofs4_dir_mkdir(struct inode *dir,
733 struct dentry *dentry, umode_t mode)
732{ 734{
733 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 735 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
734 struct autofs_info *ino = autofs4_dentry_ino(dentry); 736 struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -738,7 +740,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
738 if (!autofs4_oz_mode(sbi)) 740 if (!autofs4_oz_mode(sbi))
739 return -EACCES; 741 return -EACCES;
740 742
741 DPRINTK("dentry %p, creating %pd", dentry, dentry); 743 pr_debug("dentry %p, creating %pd\n", dentry, dentry);
742 744
743 BUG_ON(!ino); 745 BUG_ON(!ino);
744 746
@@ -768,14 +770,18 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
768/* Get/set timeout ioctl() operation */ 770/* Get/set timeout ioctl() operation */
769#ifdef CONFIG_COMPAT 771#ifdef CONFIG_COMPAT
770static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, 772static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
771 compat_ulong_t __user *p) 773 compat_ulong_t __user *p)
772{ 774{
773 int rv;
774 unsigned long ntimeout; 775 unsigned long ntimeout;
776 int rv;
777
778 rv = get_user(ntimeout, p);
779 if (rv)
780 goto error;
775 781
776 if ((rv = get_user(ntimeout, p)) || 782 rv = put_user(sbi->exp_timeout/HZ, p);
777 (rv = put_user(sbi->exp_timeout/HZ, p))) 783 if (rv)
778 return rv; 784 goto error;
779 785
780 if (ntimeout > UINT_MAX/HZ) 786 if (ntimeout > UINT_MAX/HZ)
781 sbi->exp_timeout = 0; 787 sbi->exp_timeout = 0;
@@ -783,18 +789,24 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
783 sbi->exp_timeout = ntimeout * HZ; 789 sbi->exp_timeout = ntimeout * HZ;
784 790
785 return 0; 791 return 0;
792error:
793 return rv;
786} 794}
787#endif 795#endif
788 796
789static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, 797static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
790 unsigned long __user *p) 798 unsigned long __user *p)
791{ 799{
792 int rv;
793 unsigned long ntimeout; 800 unsigned long ntimeout;
801 int rv;
802
803 rv = get_user(ntimeout, p);
804 if (rv)
805 goto error;
794 806
795 if ((rv = get_user(ntimeout, p)) || 807 rv = put_user(sbi->exp_timeout/HZ, p);
796 (rv = put_user(sbi->exp_timeout/HZ, p))) 808 if (rv)
797 return rv; 809 goto error;
798 810
799 if (ntimeout > ULONG_MAX/HZ) 811 if (ntimeout > ULONG_MAX/HZ)
800 sbi->exp_timeout = 0; 812 sbi->exp_timeout = 0;
@@ -802,16 +814,20 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
802 sbi->exp_timeout = ntimeout * HZ; 814 sbi->exp_timeout = ntimeout * HZ;
803 815
804 return 0; 816 return 0;
817error:
818 return rv;
805} 819}
806 820
807/* Return protocol version */ 821/* Return protocol version */
808static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p) 822static inline int autofs4_get_protover(struct autofs_sb_info *sbi,
823 int __user *p)
809{ 824{
810 return put_user(sbi->version, p); 825 return put_user(sbi->version, p);
811} 826}
812 827
813/* Return protocol sub version */ 828/* Return protocol sub version */
814static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p) 829static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
830 int __user *p)
815{ 831{
816 return put_user(sbi->sub_version, p); 832 return put_user(sbi->sub_version, p);
817} 833}
@@ -826,7 +842,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
826 if (may_umount(mnt)) 842 if (may_umount(mnt))
827 status = 1; 843 status = 1;
828 844
829 DPRINTK("returning %d", status); 845 pr_debug("returning %d\n", status);
830 846
831 status = put_user(status, p); 847 status = put_user(status, p);
832 848
@@ -834,9 +850,9 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
834} 850}
835 851
836/* Identify autofs4_dentries - this is so we can tell if there's 852/* Identify autofs4_dentries - this is so we can tell if there's
837 an extra dentry refcount or not. We only hold a refcount on the 853 * an extra dentry refcount or not. We only hold a refcount on the
838 dentry if its non-negative (ie, d_inode != NULL) 854 * dentry if its non-negative (ie, d_inode != NULL)
839*/ 855 */
840int is_autofs4_dentry(struct dentry *dentry) 856int is_autofs4_dentry(struct dentry *dentry)
841{ 857{
842 return dentry && d_really_is_positive(dentry) && 858 return dentry && d_really_is_positive(dentry) &&
@@ -854,21 +870,21 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
854 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); 870 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
855 void __user *p = (void __user *)arg; 871 void __user *p = (void __user *)arg;
856 872
857 DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", 873 pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",
858 cmd,arg,sbi,task_pgrp_nr(current)); 874 cmd, arg, sbi, task_pgrp_nr(current));
859 875
860 if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || 876 if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
861 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) 877 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
862 return -ENOTTY; 878 return -ENOTTY;
863 879
864 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) 880 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
865 return -EPERM; 881 return -EPERM;
866 882
867 switch(cmd) { 883 switch (cmd) {
868 case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ 884 case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
869 return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0); 885 return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
870 case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ 886 case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
871 return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT); 887 return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
872 case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ 888 case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
873 autofs4_catatonic_mode(sbi); 889 autofs4_catatonic_mode(sbi);
874 return 0; 890 return 0;
@@ -888,13 +904,15 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
888 904
889 /* return a single thing to expire */ 905 /* return a single thing to expire */
890 case AUTOFS_IOC_EXPIRE: 906 case AUTOFS_IOC_EXPIRE:
891 return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p); 907 return autofs4_expire_run(inode->i_sb,
908 filp->f_path.mnt, sbi, p);
892 /* same as above, but can send multiple expires through pipe */ 909 /* same as above, but can send multiple expires through pipe */
893 case AUTOFS_IOC_EXPIRE_MULTI: 910 case AUTOFS_IOC_EXPIRE_MULTI:
894 return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p); 911 return autofs4_expire_multi(inode->i_sb,
912 filp->f_path.mnt, sbi, p);
895 913
896 default: 914 default:
897 return -ENOSYS; 915 return -EINVAL;
898 } 916 }
899} 917}
900 918
@@ -902,12 +920,13 @@ static long autofs4_root_ioctl(struct file *filp,
902 unsigned int cmd, unsigned long arg) 920 unsigned int cmd, unsigned long arg)
903{ 921{
904 struct inode *inode = file_inode(filp); 922 struct inode *inode = file_inode(filp);
923
905 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 924 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
906} 925}
907 926
908#ifdef CONFIG_COMPAT 927#ifdef CONFIG_COMPAT
909static long autofs4_root_compat_ioctl(struct file *filp, 928static long autofs4_root_compat_ioctl(struct file *filp,
910 unsigned int cmd, unsigned long arg) 929 unsigned int cmd, unsigned long arg)
911{ 930{
912 struct inode *inode = file_inode(filp); 931 struct inode *inode = file_inode(filp);
913 int ret; 932 int ret;
@@ -916,7 +935,7 @@ static long autofs4_root_compat_ioctl(struct file *filp,
916 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 935 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
917 else 936 else
918 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, 937 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
919 (unsigned long)compat_ptr(arg)); 938 (unsigned long) compat_ptr(arg));
920 939
921 return ret; 940 return ret;
922} 941}
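
Two patterns repeat throughout the root.c hunks above. DPRINTK() gives way to plain pr_debug(), hence the now-explicit "\n" that the old macro used to append itself, and the chained (rv = get_user(...)) || (rv = put_user(...)) conditions are unrolled into one user copy per statement behind a shared error label. A minimal sketch of the unrolled form, mirroring the timeout helpers (the function name is hypothetical):

static inline int example_get_set_timeout(struct autofs_sb_info *sbi,
					  unsigned long __user *p)
{
	unsigned long ntimeout;
	int rv;

	rv = get_user(ntimeout, p);		/* read the new timeout */
	if (rv)
		goto error;

	rv = put_user(sbi->exp_timeout / HZ, p);	/* hand back the old one */
	if (rv)
		goto error;

	if (ntimeout > ULONG_MAX / HZ)
		sbi->exp_timeout = 0;		/* would overflow: disable expiry */
	else
		sbi->exp_timeout = ntimeout * HZ;

	return 0;
error:
	return rv;
}
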
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 84e037d1d129..99aab00dc217 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -1,14 +1,10 @@
1/* -*- c -*- --------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
3 * linux/fs/autofs/symlink.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 * 3 *
7 * This file is part of the Linux kernel and is made available under 4 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your 5 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference. 6 * option, any later version, incorporated herein by reference.
10 * 7 */
11 * ------------------------------------------------------------------------- */
12 8
13#include "autofs_i.h" 9#include "autofs_i.h"
14 10
@@ -18,6 +14,7 @@ static const char *autofs4_get_link(struct dentry *dentry,
18{ 14{
19 struct autofs_sb_info *sbi; 15 struct autofs_sb_info *sbi;
20 struct autofs_info *ino; 16 struct autofs_info *ino;
17
21 if (!dentry) 18 if (!dentry)
22 return ERR_PTR(-ECHILD); 19 return ERR_PTR(-ECHILD);
23 sbi = autofs4_sbi(dentry->d_sb); 20 sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 35b755e79c2d..0146d911f468 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -1,15 +1,11 @@
1/* -*- c -*- --------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
3 * linux/fs/autofs/waitq.c 3 * Copyright 2001-2006 Ian Kent <raven@themaw.net>
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 * Copyright 2001-2006 Ian Kent <raven@themaw.net>
7 * 4 *
8 * This file is part of the Linux kernel and is made available under 5 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your 6 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 7 * option, any later version, incorporated herein by reference.
11 * 8 */
12 * ------------------------------------------------------------------------- */
13 9
14#include <linux/slab.h> 10#include <linux/slab.h>
15#include <linux/time.h> 11#include <linux/time.h>
@@ -18,7 +14,8 @@
18#include "autofs_i.h" 14#include "autofs_i.h"
19 15
20/* We make this a static variable rather than a part of the superblock; it 16/* We make this a static variable rather than a part of the superblock; it
21 is better if we don't reassign numbers easily even across filesystems */ 17 * is better if we don't reassign numbers easily even across filesystems
18 */
22static autofs_wqt_t autofs4_next_wait_queue = 1; 19static autofs_wqt_t autofs4_next_wait_queue = 1;
23 20
24/* These are the signals we allow interrupting a pending mount */ 21/* These are the signals we allow interrupting a pending mount */
@@ -34,7 +31,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
34 return; 31 return;
35 } 32 }
36 33
37 DPRINTK("entering catatonic mode"); 34 pr_debug("entering catatonic mode\n");
38 35
39 sbi->catatonic = 1; 36 sbi->catatonic = 1;
40 wq = sbi->queues; 37 wq = sbi->queues;
@@ -69,17 +66,19 @@ static int autofs4_write(struct autofs_sb_info *sbi,
69 set_fs(KERNEL_DS); 66 set_fs(KERNEL_DS);
70 67
71 mutex_lock(&sbi->pipe_mutex); 68 mutex_lock(&sbi->pipe_mutex);
72 while (bytes && 69 wr = __vfs_write(file, data, bytes, &file->f_pos);
73 (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) { 70 while (bytes && wr) {
74 data += wr; 71 data += wr;
75 bytes -= wr; 72 bytes -= wr;
73 wr = __vfs_write(file, data, bytes, &file->f_pos);
76 } 74 }
77 mutex_unlock(&sbi->pipe_mutex); 75 mutex_unlock(&sbi->pipe_mutex);
78 76
79 set_fs(fs); 77 set_fs(fs);
80 78
81 /* Keep the currently executing process from receiving a 79 /* Keep the currently executing process from receiving a
82 SIGPIPE unless it was already supposed to get one */ 80 * SIGPIPE unless it was already supposed to get one
81 */
83 if (wr == -EPIPE && !sigpipe) { 82 if (wr == -EPIPE && !sigpipe) {
84 spin_lock_irqsave(&current->sighand->siglock, flags); 83 spin_lock_irqsave(&current->sighand->siglock, flags);
85 sigdelset(&current->pending.signal, SIGPIPE); 84 sigdelset(&current->pending.signal, SIGPIPE);
@@ -89,7 +88,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
89 88
90 return (bytes > 0); 89 return (bytes > 0);
91} 90}
92 91
93static void autofs4_notify_daemon(struct autofs_sb_info *sbi, 92static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
94 struct autofs_wait_queue *wq, 93 struct autofs_wait_queue *wq,
95 int type) 94 int type)
@@ -102,10 +101,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
102 struct file *pipe = NULL; 101 struct file *pipe = NULL;
103 size_t pktsz; 102 size_t pktsz;
104 103
105 DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", 104 pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
106 (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); 105 (unsigned long) wq->wait_queue_token,
106 wq->name.len, wq->name.name, type);
107 107
108 memset(&pkt,0,sizeof pkt); /* For security reasons */ 108 memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
109 109
110 pkt.hdr.proto_version = sbi->version; 110 pkt.hdr.proto_version = sbi->version;
111 pkt.hdr.type = type; 111 pkt.hdr.type = type;
@@ -126,7 +126,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
126 } 126 }
127 case autofs_ptype_expire_multi: 127 case autofs_ptype_expire_multi:
128 { 128 {
129 struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi; 129 struct autofs_packet_expire_multi *ep =
130 &pkt.v4_pkt.expire_multi;
130 131
131 pktsz = sizeof(*ep); 132 pktsz = sizeof(*ep);
132 133
@@ -163,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
163 break; 164 break;
164 } 165 }
165 default: 166 default:
166 printk("autofs4_notify_daemon: bad type %d!\n", type); 167 pr_warn("bad type %d!\n", type);
167 mutex_unlock(&sbi->wq_mutex); 168 mutex_unlock(&sbi->wq_mutex);
168 return; 169 return;
169 } 170 }
@@ -231,7 +232,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
231 if (wq->name.hash == qstr->hash && 232 if (wq->name.hash == qstr->hash &&
232 wq->name.len == qstr->len && 233 wq->name.len == qstr->len &&
233 wq->name.name && 234 wq->name.name &&
234 !memcmp(wq->name.name, qstr->name, qstr->len)) 235 !memcmp(wq->name.name, qstr->name, qstr->len))
235 break; 236 break;
236 } 237 }
237 return wq; 238 return wq;
@@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
248static int validate_request(struct autofs_wait_queue **wait, 249static int validate_request(struct autofs_wait_queue **wait,
249 struct autofs_sb_info *sbi, 250 struct autofs_sb_info *sbi,
250 struct qstr *qstr, 251 struct qstr *qstr,
251 struct dentry*dentry, enum autofs_notify notify) 252 struct dentry *dentry, enum autofs_notify notify)
252{ 253{
253 struct autofs_wait_queue *wq; 254 struct autofs_wait_queue *wq;
254 struct autofs_info *ino; 255 struct autofs_info *ino;
@@ -322,8 +323,10 @@ static int validate_request(struct autofs_wait_queue **wait,
322 * continue on and create a new request. 323 * continue on and create a new request.
323 */ 324 */
324 if (!IS_ROOT(dentry)) { 325 if (!IS_ROOT(dentry)) {
325 if (d_really_is_positive(dentry) && d_unhashed(dentry)) { 326 if (d_unhashed(dentry) &&
327 d_really_is_positive(dentry)) {
326 struct dentry *parent = dentry->d_parent; 328 struct dentry *parent = dentry->d_parent;
329
327 new = d_lookup(parent, &dentry->d_name); 330 new = d_lookup(parent, &dentry->d_name);
328 if (new) 331 if (new)
329 dentry = new; 332 dentry = new;
@@ -340,8 +343,8 @@ static int validate_request(struct autofs_wait_queue **wait,
340 return 1; 343 return 1;
341} 344}
342 345
343int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, 346int autofs4_wait(struct autofs_sb_info *sbi,
344 enum autofs_notify notify) 347 struct dentry *dentry, enum autofs_notify notify)
345{ 348{
346 struct autofs_wait_queue *wq; 349 struct autofs_wait_queue *wq;
347 struct qstr qstr; 350 struct qstr qstr;
@@ -411,7 +414,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
411 414
412 if (!wq) { 415 if (!wq) {
413 /* Create a new wait queue */ 416 /* Create a new wait queue */
414 wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); 417 wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
415 if (!wq) { 418 if (!wq) {
416 kfree(qstr.name); 419 kfree(qstr.name);
417 mutex_unlock(&sbi->wq_mutex); 420 mutex_unlock(&sbi->wq_mutex);
@@ -450,17 +453,19 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
450 autofs_ptype_expire_indirect; 453 autofs_ptype_expire_indirect;
451 } 454 }
452 455
453 DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", 456 pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
454 (unsigned long) wq->wait_queue_token, wq->name.len, 457 (unsigned long) wq->wait_queue_token, wq->name.len,
455 wq->name.name, notify); 458 wq->name.name, notify);
456 459
457 /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */ 460 /*
461 * autofs4_notify_daemon() may block; it will unlock ->wq_mutex
462 */
458 autofs4_notify_daemon(sbi, wq, type); 463 autofs4_notify_daemon(sbi, wq, type);
459 } else { 464 } else {
460 wq->wait_ctr++; 465 wq->wait_ctr++;
461 DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", 466 pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
462 (unsigned long) wq->wait_queue_token, wq->name.len, 467 (unsigned long) wq->wait_queue_token, wq->name.len,
463 wq->name.name, notify); 468 wq->name.name, notify);
464 mutex_unlock(&sbi->wq_mutex); 469 mutex_unlock(&sbi->wq_mutex);
465 kfree(qstr.name); 470 kfree(qstr.name);
466 } 471 }
@@ -471,12 +476,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
471 */ 476 */
472 if (wq->name.name) { 477 if (wq->name.name) {
473 /* Block all but "shutdown" signals while waiting */ 478 /* Block all but "shutdown" signals while waiting */
474 sigset_t oldset; 479 unsigned long shutdown_sigs_mask;
475 unsigned long irqflags; 480 unsigned long irqflags;
481 sigset_t oldset;
476 482
477 spin_lock_irqsave(&current->sighand->siglock, irqflags); 483 spin_lock_irqsave(&current->sighand->siglock, irqflags);
478 oldset = current->blocked; 484 oldset = current->blocked;
479 siginitsetinv(&current->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]); 485 shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0];
486 siginitsetinv(&current->blocked, shutdown_sigs_mask);
480 recalc_sigpending(); 487 recalc_sigpending();
481 spin_unlock_irqrestore(&current->sighand->siglock, irqflags); 488 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
482 489
@@ -487,7 +494,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
487 recalc_sigpending(); 494 recalc_sigpending();
488 spin_unlock_irqrestore(&current->sighand->siglock, irqflags); 495 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
489 } else { 496 } else {
490 DPRINTK("skipped sleeping"); 497 pr_debug("skipped sleeping\n");
491 } 498 }
492 499
493 status = wq->status; 500 status = wq->status;
@@ -562,4 +569,3 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
562 569
563 return 0; 570 return 0;
564} 571}
565
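
The pipe-write hunk hoists __vfs_write() out of the loop condition. One detail worth flagging: the old condition required wr > 0, while the rewritten loop only tests wr for nonzero, so a negative error return would re-enter the loop. Below is a sketch that keeps the positive-return guard; it assumes the caller has already switched to KERNEL_DS, as autofs4_write() does, and the helper name is hypothetical:

static int example_write_all(struct file *file, const char *data, size_t bytes)
{
	ssize_t wr;

	while (bytes) {
		wr = __vfs_write(file, data, bytes, &file->f_pos);
		if (wr <= 0)		/* error or no progress: stop */
			break;
		data += wr;		/* advance past what was written */
		bytes -= wr;
	}
	return bytes > 0;		/* nonzero means a partial write, as before */
}
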
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 826b164a4b5b..3172c4e2f502 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -575,7 +575,11 @@ static const struct super_operations bdev_sops = {
575static struct dentry *bd_mount(struct file_system_type *fs_type, 575static struct dentry *bd_mount(struct file_system_type *fs_type,
576 int flags, const char *dev_name, void *data) 576 int flags, const char *dev_name, void *data)
577{ 577{
578 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); 578 struct dentry *dent;
579 dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
580 if (dent)
581 dent->d_sb->s_iflags |= SB_I_CGROUPWB;
582 return dent;
579} 583}
580 584
581static struct file_system_type bd_type = { 585static struct file_system_type bd_type = {
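
bd_mount() no longer returns mount_pseudo() directly so it can first tag the new superblock with SB_I_CGROUPWB, which marks the bdev superblock as supporting cgroup-aware writeback. The same shape for a hypothetical pseudo filesystem; example_sops, EXAMPLEFS_MAGIC and the non-NULL guard simply follow the hunk above, not a documented contract:

static struct dentry *example_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name,
				    void *data)
{
	struct dentry *dent;

	dent = mount_pseudo(fs_type, "example:", &example_sops, NULL,
			    EXAMPLEFS_MAGIC);
	if (dent)	/* guard as in the hunk above */
		dent->d_sb->s_iflags |= SB_I_CGROUPWB;
	return dent;
}
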
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f6dac40f87ff..80e8472d618b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -148,8 +148,7 @@ int __init btrfs_prelim_ref_init(void)
148 148
149void btrfs_prelim_ref_exit(void) 149void btrfs_prelim_ref_exit(void)
150{ 150{
151 if (btrfs_prelim_ref_cache) 151 kmem_cache_destroy(btrfs_prelim_ref_cache);
152 kmem_cache_destroy(btrfs_prelim_ref_cache);
153} 152}
154 153
155/* 154/*
@@ -566,17 +565,14 @@ static void __merge_refs(struct list_head *head, int mode)
566 struct __prelim_ref *pos2 = pos1, *tmp; 565 struct __prelim_ref *pos2 = pos1, *tmp;
567 566
568 list_for_each_entry_safe_continue(pos2, tmp, head, list) { 567 list_for_each_entry_safe_continue(pos2, tmp, head, list) {
569 struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; 568 struct __prelim_ref *ref1 = pos1, *ref2 = pos2;
570 struct extent_inode_elem *eie; 569 struct extent_inode_elem *eie;
571 570
572 if (!ref_for_same_block(ref1, ref2)) 571 if (!ref_for_same_block(ref1, ref2))
573 continue; 572 continue;
574 if (mode == 1) { 573 if (mode == 1) {
575 if (!ref1->parent && ref2->parent) { 574 if (!ref1->parent && ref2->parent)
576 xchg = ref1; 575 swap(ref1, ref2);
577 ref1 = ref2;
578 ref2 = xchg;
579 }
580 } else { 576 } else {
581 if (ref1->parent != ref2->parent) 577 if (ref1->parent != ref2->parent)
582 continue; 578 continue;
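
__merge_refs() drops the hand-rolled three-assignment exchange through the xchg temporary in favor of the generic swap() macro from <linux/kernel.h>, which expands to exactly that exchange with a type-matched temporary. In isolation (the helper is hypothetical):

#include <linux/kernel.h>	/* swap() */

static void keep_parented_first(struct __prelim_ref **a,
				struct __prelim_ref **b)
{
	/* order two refs the way the mode == 1 branch wants them */
	if (!(*a)->parent && (*b)->parent)
		swap(*a, *b);	/* swaps the two pointers in place */
}
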
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 861d472564c1..e34a71b3e225 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -95,6 +95,7 @@
95#include <linux/genhd.h> 95#include <linux/genhd.h>
96#include <linux/blkdev.h> 96#include <linux/blkdev.h>
97#include <linux/vmalloc.h> 97#include <linux/vmalloc.h>
98#include <linux/string.h>
98#include "ctree.h" 99#include "ctree.h"
99#include "disk-io.h" 100#include "disk-io.h"
100#include "hash.h" 101#include "hash.h"
@@ -105,6 +106,7 @@
105#include "locking.h" 106#include "locking.h"
106#include "check-integrity.h" 107#include "check-integrity.h"
107#include "rcu-string.h" 108#include "rcu-string.h"
109#include "compression.h"
108 110
109#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 111#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
110#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 112#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
@@ -176,7 +178,7 @@ struct btrfsic_block {
176 * Elements of this type are allocated dynamically and required because 178 * Elements of this type are allocated dynamically and required because
177 * each block object can refer to and can be ref from multiple blocks. 179 * each block object can refer to and can be ref from multiple blocks.
178 * The key to lookup them in the hashtable is the dev_bytenr of 180 * The key to lookup them in the hashtable is the dev_bytenr of
179 * the block ref to plus the one from the block refered from. 181 * the block ref to plus the one from the block referred from.
180 * The fact that they are searchable via a hashtable and that a 182 * The fact that they are searchable via a hashtable and that a
181 * ref_cnt is maintained is not required for the btrfs integrity 183 * ref_cnt is maintained is not required for the btrfs integrity
182 * check algorithm itself, it is only used to make the output more 184 * check algorithm itself, it is only used to make the output more
@@ -3076,7 +3078,7 @@ int btrfsic_mount(struct btrfs_root *root,
3076 3078
3077 list_for_each_entry(device, dev_head, dev_list) { 3079 list_for_each_entry(device, dev_head, dev_list) {
3078 struct btrfsic_dev_state *ds; 3080 struct btrfsic_dev_state *ds;
3079 char *p; 3081 const char *p;
3080 3082
3081 if (!device->bdev || !device->name) 3083 if (!device->bdev || !device->name)
3082 continue; 3084 continue;
@@ -3092,11 +3094,7 @@ int btrfsic_mount(struct btrfs_root *root,
3092 ds->state = state; 3094 ds->state = state;
3093 bdevname(ds->bdev, ds->name); 3095 bdevname(ds->bdev, ds->name);
3094 ds->name[BDEVNAME_SIZE - 1] = '\0'; 3096 ds->name[BDEVNAME_SIZE - 1] = '\0';
3095 for (p = ds->name; *p != '\0'; p++); 3097 p = kbasename(ds->name);
3096 while (p > ds->name && *p != '/')
3097 p--;
3098 if (*p == '/')
3099 p++;
3100 strlcpy(ds->name, p, sizeof(ds->name)); 3098 strlcpy(ds->name, p, sizeof(ds->name));
3101 btrfsic_dev_state_hashtable_add(ds, 3099 btrfsic_dev_state_hashtable_add(ds,
3102 &btrfsic_dev_state_hashtable); 3100 &btrfsic_dev_state_hashtable);
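
The open-coded basename scan (walk to the terminating NUL, walk back to the last '/', step past it) collapses into kbasename() from <linux/string.h>, which is why the first check-integrity.c hunk adds that include and why p becomes const char *: kbasename() returns a const pointer into its argument. For example:

#include <linux/string.h>

static void kbasename_example(void)
{
	const char *p;

	p = kbasename("/dev/mapper/vg-root");	/* -> "vg-root" */
	p = kbasename("sda1");			/* no '/': the whole string */
	(void)p;
}
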
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 13a4dc0436c9..f49d8b8c0f00 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -48,6 +48,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
48void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, 48void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
49 unsigned long pg_index, 49 unsigned long pg_index,
50 unsigned long pg_offset); 50 unsigned long pg_offset);
51
52enum btrfs_compression_type {
53 BTRFS_COMPRESS_NONE = 0,
54 BTRFS_COMPRESS_ZLIB = 1,
55 BTRFS_COMPRESS_LZO = 2,
56 BTRFS_COMPRESS_TYPES = 2,
57 BTRFS_COMPRESS_LAST = 3,
58};
59
51struct btrfs_compress_op { 60struct btrfs_compress_op {
52 struct list_head *(*alloc_workspace)(void); 61 struct list_head *(*alloc_workspace)(void);
53 62
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 769e0ff1b4ce..77592931ab4f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -311,7 +311,7 @@ struct tree_mod_root {
311 311
312struct tree_mod_elem { 312struct tree_mod_elem {
313 struct rb_node node; 313 struct rb_node node;
314 u64 index; /* shifted logical */ 314 u64 logical;
315 u64 seq; 315 u64 seq;
316 enum mod_log_op op; 316 enum mod_log_op op;
317 317
@@ -435,11 +435,11 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
435 435
436/* 436/*
437 * key order of the log: 437 * key order of the log:
438 * index -> sequence 438 * node/leaf start address -> sequence
439 * 439 *
440 * the index is the shifted logical of the *new* root node for root replace 440 * The 'start address' is the logical address of the *new* root node
441 * operations, or the shifted logical of the affected block for all other 441 * for root replace operations, or the logical address of the affected
442 * operations. 442 * block for all other operations.
443 * 443 *
444 * Note: must be called with write lock (tree_mod_log_write_lock). 444 * Note: must be called with write lock (tree_mod_log_write_lock).
445 */ 445 */
@@ -460,9 +460,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
460 while (*new) { 460 while (*new) {
461 cur = container_of(*new, struct tree_mod_elem, node); 461 cur = container_of(*new, struct tree_mod_elem, node);
462 parent = *new; 462 parent = *new;
463 if (cur->index < tm->index) 463 if (cur->logical < tm->logical)
464 new = &((*new)->rb_left); 464 new = &((*new)->rb_left);
465 else if (cur->index > tm->index) 465 else if (cur->logical > tm->logical)
466 new = &((*new)->rb_right); 466 new = &((*new)->rb_right);
467 else if (cur->seq < tm->seq) 467 else if (cur->seq < tm->seq)
468 new = &((*new)->rb_left); 468 new = &((*new)->rb_left);
@@ -523,7 +523,7 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
523 if (!tm) 523 if (!tm)
524 return NULL; 524 return NULL;
525 525
526 tm->index = eb->start >> PAGE_CACHE_SHIFT; 526 tm->logical = eb->start;
527 if (op != MOD_LOG_KEY_ADD) { 527 if (op != MOD_LOG_KEY_ADD) {
528 btrfs_node_key(eb, &tm->key, slot); 528 btrfs_node_key(eb, &tm->key, slot);
529 tm->blockptr = btrfs_node_blockptr(eb, slot); 529 tm->blockptr = btrfs_node_blockptr(eb, slot);
@@ -588,7 +588,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
588 goto free_tms; 588 goto free_tms;
589 } 589 }
590 590
591 tm->index = eb->start >> PAGE_CACHE_SHIFT; 591 tm->logical = eb->start;
592 tm->slot = src_slot; 592 tm->slot = src_slot;
593 tm->move.dst_slot = dst_slot; 593 tm->move.dst_slot = dst_slot;
594 tm->move.nr_items = nr_items; 594 tm->move.nr_items = nr_items;
@@ -699,7 +699,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
699 goto free_tms; 699 goto free_tms;
700 } 700 }
701 701
702 tm->index = new_root->start >> PAGE_CACHE_SHIFT; 702 tm->logical = new_root->start;
703 tm->old_root.logical = old_root->start; 703 tm->old_root.logical = old_root->start;
704 tm->old_root.level = btrfs_header_level(old_root); 704 tm->old_root.level = btrfs_header_level(old_root);
705 tm->generation = btrfs_header_generation(old_root); 705 tm->generation = btrfs_header_generation(old_root);
@@ -739,16 +739,15 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
739 struct rb_node *node; 739 struct rb_node *node;
740 struct tree_mod_elem *cur = NULL; 740 struct tree_mod_elem *cur = NULL;
741 struct tree_mod_elem *found = NULL; 741 struct tree_mod_elem *found = NULL;
742 u64 index = start >> PAGE_CACHE_SHIFT;
743 742
744 tree_mod_log_read_lock(fs_info); 743 tree_mod_log_read_lock(fs_info);
745 tm_root = &fs_info->tree_mod_log; 744 tm_root = &fs_info->tree_mod_log;
746 node = tm_root->rb_node; 745 node = tm_root->rb_node;
747 while (node) { 746 while (node) {
748 cur = container_of(node, struct tree_mod_elem, node); 747 cur = container_of(node, struct tree_mod_elem, node);
749 if (cur->index < index) { 748 if (cur->logical < start) {
750 node = node->rb_left; 749 node = node->rb_left;
751 } else if (cur->index > index) { 750 } else if (cur->logical > start) {
752 node = node->rb_right; 751 node = node->rb_right;
753 } else if (cur->seq < min_seq) { 752 } else if (cur->seq < min_seq) {
754 node = node->rb_left; 753 node = node->rb_left;
@@ -1230,9 +1229,10 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1230 return NULL; 1229 return NULL;
1231 1230
1232 /* 1231 /*
1233 * the very last operation that's logged for a root is the replacement 1232 * the very last operation that's logged for a root is the
1234 * operation (if it is replaced at all). this has the index of the *new* 1233 * replacement operation (if it is replaced at all). this has
1235 * root, making it the very first operation that's logged for this root. 1234 * the logical address of the *new* root, making it the very
1235 * first operation that's logged for this root.
1236 */ 1236 */
1237 while (1) { 1237 while (1) {
1238 tm = tree_mod_log_search_oldest(fs_info, root_logical, 1238 tm = tree_mod_log_search_oldest(fs_info, root_logical,
@@ -1336,7 +1336,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1336 if (!next) 1336 if (!next)
1337 break; 1337 break;
1338 tm = container_of(next, struct tree_mod_elem, node); 1338 tm = container_of(next, struct tree_mod_elem, node);
1339 if (tm->index != first_tm->index) 1339 if (tm->logical != first_tm->logical)
1340 break; 1340 break;
1341 } 1341 }
1342 tree_mod_log_read_unlock(fs_info); 1342 tree_mod_log_read_unlock(fs_info);
@@ -5361,7 +5361,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5361 goto out; 5361 goto out;
5362 } 5362 }
5363 5363
5364 tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS); 5364 tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL);
5365 if (!tmp_buf) { 5365 if (!tmp_buf) {
5366 ret = -ENOMEM; 5366 ret = -ENOMEM;
5367 goto out; 5367 goto out;
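
The tree-mod log used to key its rb-tree on eb->start >> PAGE_CACHE_SHIFT, a page-granular index; the ctree.c hunks store the full byte-granular logical address instead, dropping the hidden assumption that one page maps to one block. The ordering is lexicographic on (logical, seq). A cut-down sketch of the slot search; struct mod_elem and find_slot() are illustrations, not the kernel's names:

#include <linux/rbtree.h>

struct mod_elem {
	struct rb_node node;
	u64 logical;	/* byte address of the affected block */
	u64 seq;	/* tree-mod sequence number */
};

/* returns the link to attach a new node to, or NULL on a duplicate key */
static struct rb_node **find_slot(struct rb_root *root, struct mod_elem *tm,
				  struct rb_node **parent)
{
	struct rb_node **new = &root->rb_node;
	struct mod_elem *cur;

	*parent = NULL;
	while (*new) {
		cur = rb_entry(*new, struct mod_elem, node);
		*parent = *new;
		if (cur->logical < tm->logical)
			new = &(*new)->rb_left;
		else if (cur->logical > tm->logical)
			new = &(*new)->rb_right;
		else if (cur->seq < tm->seq)
			new = &(*new)->rb_left;
		else if (cur->seq > tm->seq)
			new = &(*new)->rb_right;
		else
			return NULL;	/* same (logical, seq) already logged */
	}
	return new;
}
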
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index bfe4a337fb4d..84a6a5b3384a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -100,6 +100,9 @@ struct btrfs_ordered_sum;
100/* tracks free space in block groups. */ 100/* tracks free space in block groups. */
101#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL 101#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
102 102
103/* device stats in the device tree */
104#define BTRFS_DEV_STATS_OBJECTID 0ULL
105
103/* for storing balance parameters in the root tree */ 106/* for storing balance parameters in the root tree */
104#define BTRFS_BALANCE_OBJECTID -4ULL 107#define BTRFS_BALANCE_OBJECTID -4ULL
105 108
@@ -715,14 +718,6 @@ struct btrfs_timespec {
715 __le32 nsec; 718 __le32 nsec;
716} __attribute__ ((__packed__)); 719} __attribute__ ((__packed__));
717 720
718enum btrfs_compression_type {
719 BTRFS_COMPRESS_NONE = 0,
720 BTRFS_COMPRESS_ZLIB = 1,
721 BTRFS_COMPRESS_LZO = 2,
722 BTRFS_COMPRESS_TYPES = 2,
723 BTRFS_COMPRESS_LAST = 3,
724};
725
726struct btrfs_inode_item { 721struct btrfs_inode_item {
727 /* nfs style generation number */ 722 /* nfs style generation number */
728 __le64 generation; 723 __le64 generation;
@@ -793,7 +788,7 @@ struct btrfs_root_item {
793 788
794 /* 789 /*
795 * This generation number is used to test if the new fields are valid 790 * This generation number is used to test if the new fields are valid
796 * and up to date while reading the root item. Everytime the root item 791 * and up to date while reading the root item. Every time the root item
797 * is written out, the "generation" field is copied into this field. If 792 * is written out, the "generation" field is copied into this field. If
798 * anyone ever mounted the fs with an older kernel, we will have 793 * anyone ever mounted the fs with an older kernel, we will have
799 * mismatching generation values here and thus must invalidate the 794 * mismatching generation values here and thus must invalidate the
@@ -1002,8 +997,10 @@ struct btrfs_dev_replace {
1002 pid_t lock_owner; 997 pid_t lock_owner;
1003 atomic_t nesting_level; 998 atomic_t nesting_level;
1004 struct mutex lock_finishing_cancel_unmount; 999 struct mutex lock_finishing_cancel_unmount;
1005 struct mutex lock_management_lock; 1000 rwlock_t lock;
1006 struct mutex lock; 1001 atomic_t read_locks;
1002 atomic_t blocking_readers;
1003 wait_queue_head_t read_lock_wq;
1007 1004
1008 struct btrfs_scrub_progress scrub_progress; 1005 struct btrfs_scrub_progress scrub_progress;
1009}; 1006};
@@ -1222,10 +1219,10 @@ struct btrfs_space_info {
1222 * we've called update_block_group and dropped the bytes_used counter 1219 * we've called update_block_group and dropped the bytes_used counter
1223 * and increased the bytes_pinned counter. However this means that 1220 * and increased the bytes_pinned counter. However this means that
1224 * bytes_pinned does not reflect the bytes that will be pinned once the 1221 * bytes_pinned does not reflect the bytes that will be pinned once the
1225 * delayed refs are flushed, so this counter is inc'ed everytime we call 1222 * delayed refs are flushed, so this counter is inc'ed every time we
1226 * btrfs_free_extent so it is a realtime count of what will be freed 1223 * call btrfs_free_extent so it is a realtime count of what will be
1227 * once the transaction is committed. It will be zero'ed everytime the 1224 * freed once the transaction is committed. It will be zero'ed every
1228 * transaction commits. 1225 * time the transaction commits.
1229 */ 1226 */
1230 struct percpu_counter total_bytes_pinned; 1227 struct percpu_counter total_bytes_pinned;
1231 1228
@@ -1822,6 +1819,9 @@ struct btrfs_fs_info {
1822 spinlock_t reada_lock; 1819 spinlock_t reada_lock;
1823 struct radix_tree_root reada_tree; 1820 struct radix_tree_root reada_tree;
1824 1821
1822 /* readahead works cnt */
1823 atomic_t reada_works_cnt;
1824
1825 /* Extent buffer radix tree */ 1825 /* Extent buffer radix tree */
1826 spinlock_t buffer_lock; 1826 spinlock_t buffer_lock;
1827 struct radix_tree_root buffer_radix; 1827 struct radix_tree_root buffer_radix;
@@ -2185,13 +2185,43 @@ struct btrfs_ioctl_defrag_range_args {
2185 */ 2185 */
2186#define BTRFS_QGROUP_RELATION_KEY 246 2186#define BTRFS_QGROUP_RELATION_KEY 246
2187 2187
2188/*
2189 * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
2190 */
2188#define BTRFS_BALANCE_ITEM_KEY 248 2191#define BTRFS_BALANCE_ITEM_KEY 248
2189 2192
2190/* 2193/*
2191 * Persistantly stores the io stats in the device tree. 2194 * The key type for tree items that are stored persistently, but do not need to
2192 * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). 2195 * exist for extended period of time. The items can exist in any tree.
2196 *
2197 * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
2198 *
2199 * Existing items:
2200 *
2201 * - balance status item
2202 * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
2193 */ 2203 */
2194#define BTRFS_DEV_STATS_KEY 249 2204#define BTRFS_TEMPORARY_ITEM_KEY 248
2205
2206/*
2207 * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
2208 */
2209#define BTRFS_DEV_STATS_KEY 249
2210
2211/*
2212 * The key type for tree items that are stored persistently and usually exist
2213 * for a long period, eg. filesystem lifetime. The item kinds can be status
2214 * information, stats or preference values. The item can exist in any tree.
2215 *
2216 * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
2217 *
2218 * Existing items:
2219 *
2220 * - device statistics, store IO stats in the device tree, one key for all
2221 * stats
2222 * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
2223 */
2224#define BTRFS_PERSISTENT_ITEM_KEY 249
2195 2225
2196/* 2226/*
2197 * Persistantly stores the device replace state in the device tree. 2227 * Persistantly stores the device replace state in the device tree.
@@ -2241,7 +2271,7 @@ struct btrfs_ioctl_defrag_range_args {
2241#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 2271#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
2242#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 2272#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
2243#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 2273#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
2244#define BTRFS_MOUNT_RECOVERY (1 << 18) 2274#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18)
2245#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) 2275#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
2246#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) 2276#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
2247#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) 2277#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
@@ -2250,9 +2280,10 @@ struct btrfs_ioctl_defrag_range_args {
2250#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) 2280#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
2251#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) 2281#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
2252#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) 2282#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
2283#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
2253 2284
2254#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2285#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
2255#define BTRFS_DEFAULT_MAX_INLINE (8192) 2286#define BTRFS_DEFAULT_MAX_INLINE (2048)
2256 2287
2257#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 2288#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
2258#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 2289#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2353,6 +2384,9 @@ struct btrfs_map_token {
2353 unsigned long offset; 2384 unsigned long offset;
2354}; 2385};
2355 2386
2387#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
2388 ((bytes) >> (fs_info)->sb->s_blocksize_bits)
2389
2356static inline void btrfs_init_map_token (struct btrfs_map_token *token) 2390static inline void btrfs_init_map_token (struct btrfs_map_token *token)
2357{ 2391{
2358 token->kaddr = NULL; 2392 token->kaddr = NULL;
@@ -3448,8 +3482,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
3448static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3482static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3449 unsigned num_items) 3483 unsigned num_items)
3450{ 3484{
3451 return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3485 return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
3452 2 * num_items;
3453} 3486}
3454 3487
3455/* 3488/*
@@ -4027,7 +4060,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4027 struct btrfs_root *root, 4060 struct btrfs_root *root,
4028 struct inode *dir, u64 objectid, 4061 struct inode *dir, u64 objectid,
4029 const char *name, int name_len); 4062 const char *name, int name_len);
4030int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, 4063int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4031 int front); 4064 int front);
4032int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4065int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4033 struct btrfs_root *root, 4066 struct btrfs_root *root,
@@ -4089,6 +4122,7 @@ void btrfs_test_inode_set_ops(struct inode *inode);
4089 4122
4090/* ioctl.c */ 4123/* ioctl.c */
4091long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 4124long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
4125int btrfs_ioctl_get_supported_features(void __user *arg);
4092void btrfs_update_iflags(struct inode *inode); 4126void btrfs_update_iflags(struct inode *inode);
4093void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 4127void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
4094int btrfs_is_empty_uuid(u8 *uuid); 4128int btrfs_is_empty_uuid(u8 *uuid);
@@ -4151,7 +4185,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
4151ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 4185ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
4152 4186
4153/* super.c */ 4187/* super.c */
4154int btrfs_parse_options(struct btrfs_root *root, char *options); 4188int btrfs_parse_options(struct btrfs_root *root, char *options,
4189 unsigned long new_flags);
4155int btrfs_sync_fs(struct super_block *sb, int wait); 4190int btrfs_sync_fs(struct super_block *sb, int wait);
4156 4191
4157#ifdef CONFIG_PRINTK 4192#ifdef CONFIG_PRINTK
@@ -4525,8 +4560,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
4525 struct btrfs_key *start, struct btrfs_key *end); 4560 struct btrfs_key *start, struct btrfs_key *end);
4526int btrfs_reada_wait(void *handle); 4561int btrfs_reada_wait(void *handle);
4527void btrfs_reada_detach(void *handle); 4562void btrfs_reada_detach(void *handle);
4528int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 4563int btree_readahead_hook(struct btrfs_fs_info *fs_info,
4529 u64 start, int err); 4564 struct extent_buffer *eb, u64 start, int err);
4530 4565
4531static inline int is_fstree(u64 rootid) 4566static inline int is_fstree(u64 rootid)
4532{ 4567{
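
Two ctree.h changes are worth spelling out: the btrfs_calc_trans_metadata_size() rewrite is pure algebra, not a behavior change, and BTRFS_BYTES_TO_BLKS() is a plain shift by the superblock's block-size bits:

/*
 *   (nodesize + nodesize * (BTRFS_MAX_LEVEL - 1)) * 2 * num_items
 * = nodesize * (1 + (BTRFS_MAX_LEVEL - 1)) * 2 * num_items
 * = nodesize * BTRFS_MAX_LEVEL * 2 * num_items
 *
 * BTRFS_BYTES_TO_BLKS(fs_info, 65536) with a 4KiB block size
 * (s_blocksize_bits == 12) evaluates to 65536 >> 12 == 16.
 */
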
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b57daa895cea..6cef0062f929 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -43,8 +43,7 @@ int __init btrfs_delayed_inode_init(void)
43 43
44void btrfs_delayed_inode_exit(void) 44void btrfs_delayed_inode_exit(void)
45{ 45{
46 if (delayed_node_cache) 46 kmem_cache_destroy(delayed_node_cache);
47 kmem_cache_destroy(delayed_node_cache);
48} 47}
49 48
50static inline void btrfs_init_delayed_node( 49static inline void btrfs_init_delayed_node(
@@ -651,9 +650,14 @@ static int btrfs_delayed_inode_reserve_metadata(
651 goto out; 650 goto out;
652 651
653 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 652 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
654 if (!WARN_ON(ret)) 653 if (!ret)
655 goto out; 654 goto out;
656 655
656 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
657 btrfs_debug(root->fs_info,
658 "block rsv migrate returned %d", ret);
659 WARN_ON(1);
660 }
657 /* 661 /*
658 * Ok this is a problem, let's just steal from the global rsv 662 * Ok this is a problem, let's just steal from the global rsv
659 * since this really shouldn't happen that often. 663 * since this really shouldn't happen that often.
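
The reserve-metadata hunk demotes an unconditional WARN_ON(ret) to a warning gated behind the enospc_debug mount option: ordinary callers just see the error handled, while anyone chasing ENOSPC gets the message plus a stack trace. A hypothetical wrapper restating that shape (note btrfs_test_opt() still takes a root at this point in the tree):

static int example_migrate(struct btrfs_root *root,
			   struct btrfs_block_rsv *src,
			   struct btrfs_block_rsv *dst, u64 num_bytes)
{
	int ret;

	ret = btrfs_block_rsv_migrate(src, dst, num_bytes);
	if (!ret)
		return 0;

	/* noisy diagnostics are now opt-in via -o enospc_debug */
	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
		btrfs_debug(root->fs_info,
			    "block rsv migrate returned %d", ret);
		WARN_ON(1);
	}
	return ret;
}
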
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 914ac13bd92f..430b3689b112 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -929,14 +929,10 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
929 929
930void btrfs_delayed_ref_exit(void) 930void btrfs_delayed_ref_exit(void)
931{ 931{
932 if (btrfs_delayed_ref_head_cachep) 932 kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
933 kmem_cache_destroy(btrfs_delayed_ref_head_cachep); 933 kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
934 if (btrfs_delayed_tree_ref_cachep) 934 kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
935 kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); 935 kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
936 if (btrfs_delayed_data_ref_cachep)
937 kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
938 if (btrfs_delayed_extent_op_cachep)
939 kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
940} 936}
941 937
942int btrfs_delayed_ref_init(void) 938int btrfs_delayed_ref_init(void)
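
The four dropped NULL checks in btrfs_delayed_ref_exit(), like the matching ones in backref.c and delayed-inode.c, rely on kmem_cache_destroy() being a no-op for a NULL pointer. That is what lets a failed init reuse the common exit path without testing each cache, roughly as follows (the item types and names are hypothetical):

#include <linux/init.h>
#include <linux/slab.h>

struct item_a { int x; };
struct item_b { long y; };

static struct kmem_cache *cache_a;
static struct kmem_cache *cache_b;

static void example_exit(void)
{
	kmem_cache_destroy(cache_a);	/* NULL-safe: no guard needed */
	kmem_cache_destroy(cache_b);
}

static int __init example_init(void)
{
	cache_a = KMEM_CACHE(item_a, 0);
	cache_b = KMEM_CACHE(item_b, 0);
	if (!cache_a || !cache_b) {
		example_exit();		/* tears down whatever was created */
		return -ENOMEM;
	}
	return 0;
}
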
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cbb7dbfb3fff..a1d6652e0c47 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -202,13 +202,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
202 struct btrfs_dev_replace_item *ptr; 202 struct btrfs_dev_replace_item *ptr;
203 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 203 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
204 204
205 btrfs_dev_replace_lock(dev_replace); 205 btrfs_dev_replace_lock(dev_replace, 0);
206 if (!dev_replace->is_valid || 206 if (!dev_replace->is_valid ||
207 !dev_replace->item_needs_writeback) { 207 !dev_replace->item_needs_writeback) {
208 btrfs_dev_replace_unlock(dev_replace); 208 btrfs_dev_replace_unlock(dev_replace, 0);
209 return 0; 209 return 0;
210 } 210 }
211 btrfs_dev_replace_unlock(dev_replace); 211 btrfs_dev_replace_unlock(dev_replace, 0);
212 212
213 key.objectid = 0; 213 key.objectid = 0;
214 key.type = BTRFS_DEV_REPLACE_KEY; 214 key.type = BTRFS_DEV_REPLACE_KEY;
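
Every dev-replace call site below passes an explicit intent flag, 0 for read and 1 for write, to btrfs_dev_replace_lock()/unlock(), matching the ctree.h hunk that replaces the two mutexes with an rwlock_t plus read_locks, blocking_readers and read_lock_wq. A deliberately simplified sketch of what the flag selects; the counters and wait queue suggest the real helpers also coordinate blocking readers, which a bare rwlock_t alone cannot express:

static void example_dev_replace_lock(struct btrfs_dev_replace *dr, int rw)
{
	if (rw == 1)
		write_lock(&dr->lock);	/* exclusive: state will change */
	else
		read_lock(&dr->lock);	/* shared: just sampling state */
}

static void example_dev_replace_unlock(struct btrfs_dev_replace *dr, int rw)
{
	if (rw == 1)
		write_unlock(&dr->lock);
	else
		read_unlock(&dr->lock);
}
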
@@ -264,7 +264,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
264 ptr = btrfs_item_ptr(eb, path->slots[0], 264 ptr = btrfs_item_ptr(eb, path->slots[0],
265 struct btrfs_dev_replace_item); 265 struct btrfs_dev_replace_item);
266 266
267 btrfs_dev_replace_lock(dev_replace); 267 btrfs_dev_replace_lock(dev_replace, 1);
268 if (dev_replace->srcdev) 268 if (dev_replace->srcdev)
269 btrfs_set_dev_replace_src_devid(eb, ptr, 269 btrfs_set_dev_replace_src_devid(eb, ptr,
270 dev_replace->srcdev->devid); 270 dev_replace->srcdev->devid);
@@ -287,7 +287,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
287 btrfs_set_dev_replace_cursor_right(eb, ptr, 287 btrfs_set_dev_replace_cursor_right(eb, ptr,
288 dev_replace->cursor_right); 288 dev_replace->cursor_right);
289 dev_replace->item_needs_writeback = 0; 289 dev_replace->item_needs_writeback = 0;
290 btrfs_dev_replace_unlock(dev_replace); 290 btrfs_dev_replace_unlock(dev_replace, 1);
291 291
292 btrfs_mark_buffer_dirty(eb); 292 btrfs_mark_buffer_dirty(eb);
293 293
@@ -356,7 +356,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
356 return PTR_ERR(trans); 356 return PTR_ERR(trans);
357 } 357 }
358 358
359 btrfs_dev_replace_lock(dev_replace); 359 btrfs_dev_replace_lock(dev_replace, 1);
360 switch (dev_replace->replace_state) { 360 switch (dev_replace->replace_state) {
361 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 361 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
362 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 362 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
395 dev_replace->is_valid = 1; 395 dev_replace->is_valid = 1;
396 dev_replace->item_needs_writeback = 1; 396 dev_replace->item_needs_writeback = 1;
397 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 397 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
398 btrfs_dev_replace_unlock(dev_replace); 398 btrfs_dev_replace_unlock(dev_replace, 1);
399 399
400 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); 400 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
401 if (ret) 401 if (ret)
@@ -407,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
407 trans = btrfs_start_transaction(root, 0); 407 trans = btrfs_start_transaction(root, 0);
408 if (IS_ERR(trans)) { 408 if (IS_ERR(trans)) {
409 ret = PTR_ERR(trans); 409 ret = PTR_ERR(trans);
410 btrfs_dev_replace_lock(dev_replace); 410 btrfs_dev_replace_lock(dev_replace, 1);
411 goto leave; 411 goto leave;
412 } 412 }
413 413
@@ -433,7 +433,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
433leave: 433leave:
434 dev_replace->srcdev = NULL; 434 dev_replace->srcdev = NULL;
435 dev_replace->tgtdev = NULL; 435 dev_replace->tgtdev = NULL;
436 btrfs_dev_replace_unlock(dev_replace); 436 btrfs_dev_replace_unlock(dev_replace, 1);
437 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 437 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
438 return ret; 438 return ret;
439} 439}
@@ -471,18 +471,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
471 /* don't allow cancel or unmount to disturb the finishing procedure */ 471 /* don't allow cancel or unmount to disturb the finishing procedure */
472 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 472 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
473 473
474 btrfs_dev_replace_lock(dev_replace); 474 btrfs_dev_replace_lock(dev_replace, 0);
475 /* was the operation canceled, or is it finished? */ 475 /* was the operation canceled, or is it finished? */
476 if (dev_replace->replace_state != 476 if (dev_replace->replace_state !=
477 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { 477 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
478 btrfs_dev_replace_unlock(dev_replace); 478 btrfs_dev_replace_unlock(dev_replace, 0);
479 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 479 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
480 return 0; 480 return 0;
481 } 481 }
482 482
483 tgt_device = dev_replace->tgtdev; 483 tgt_device = dev_replace->tgtdev;
484 src_device = dev_replace->srcdev; 484 src_device = dev_replace->srcdev;
485 btrfs_dev_replace_unlock(dev_replace); 485 btrfs_dev_replace_unlock(dev_replace, 0);
486 486
487 /* 487 /*
488 * flush all outstanding I/O and inode extent mappings before the 488 * flush all outstanding I/O and inode extent mappings before the
@@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
507 /* keep away write_all_supers() during the finishing procedure */ 507 /* keep away write_all_supers() during the finishing procedure */
508 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 508 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
509 mutex_lock(&root->fs_info->chunk_mutex); 509 mutex_lock(&root->fs_info->chunk_mutex);
510 btrfs_dev_replace_lock(dev_replace); 510 btrfs_dev_replace_lock(dev_replace, 1);
511 dev_replace->replace_state = 511 dev_replace->replace_state =
512 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 512 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
513 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; 513 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -528,7 +528,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
528 rcu_str_deref(src_device->name), 528 rcu_str_deref(src_device->name),
529 src_device->devid, 529 src_device->devid,
530 rcu_str_deref(tgt_device->name), scrub_ret); 530 rcu_str_deref(tgt_device->name), scrub_ret);
531 btrfs_dev_replace_unlock(dev_replace); 531 btrfs_dev_replace_unlock(dev_replace, 1);
532 mutex_unlock(&root->fs_info->chunk_mutex); 532 mutex_unlock(&root->fs_info->chunk_mutex);
533 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 533 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
534 mutex_unlock(&uuid_mutex); 534 mutex_unlock(&uuid_mutex);
@@ -565,7 +565,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
565 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 565 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
566 fs_info->fs_devices->rw_devices++; 566 fs_info->fs_devices->rw_devices++;
567 567
568 btrfs_dev_replace_unlock(dev_replace); 568 btrfs_dev_replace_unlock(dev_replace, 1);
569 569
570 btrfs_rm_dev_replace_blocked(fs_info); 570 btrfs_rm_dev_replace_blocked(fs_info);
571 571
@@ -649,7 +649,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
649 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 649 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
650 struct btrfs_device *srcdev; 650 struct btrfs_device *srcdev;
651 651
652 btrfs_dev_replace_lock(dev_replace); 652 btrfs_dev_replace_lock(dev_replace, 0);
653 /* even if !dev_replace_is_valid, the values are good enough for 653 /* even if !dev_replace_is_valid, the values are good enough for
654 * the replace_status ioctl */ 654 * the replace_status ioctl */
655 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 655 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -675,7 +675,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
675 div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); 675 div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
676 break; 676 break;
677 } 677 }
678 btrfs_dev_replace_unlock(dev_replace); 678 btrfs_dev_replace_unlock(dev_replace, 0);
679} 679}
680 680
681int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, 681int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
@@ -698,13 +698,13 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
698 return -EROFS; 698 return -EROFS;
699 699
700 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 700 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
701 btrfs_dev_replace_lock(dev_replace); 701 btrfs_dev_replace_lock(dev_replace, 1);
702 switch (dev_replace->replace_state) { 702 switch (dev_replace->replace_state) {
703 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 703 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
704 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 704 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
705 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 705 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
706 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 706 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
707 btrfs_dev_replace_unlock(dev_replace); 707 btrfs_dev_replace_unlock(dev_replace, 1);
708 goto leave; 708 goto leave;
709 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 709 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
710 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 710 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
@@ -717,7 +717,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
717 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; 717 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
718 dev_replace->time_stopped = get_seconds(); 718 dev_replace->time_stopped = get_seconds();
719 dev_replace->item_needs_writeback = 1; 719 dev_replace->item_needs_writeback = 1;
720 btrfs_dev_replace_unlock(dev_replace); 720 btrfs_dev_replace_unlock(dev_replace, 1);
721 btrfs_scrub_cancel(fs_info); 721 btrfs_scrub_cancel(fs_info);
722 722
723 trans = btrfs_start_transaction(root, 0); 723 trans = btrfs_start_transaction(root, 0);
@@ -740,7 +740,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
740 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 740 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
741 741
742 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 742 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
743 btrfs_dev_replace_lock(dev_replace); 743 btrfs_dev_replace_lock(dev_replace, 1);
744 switch (dev_replace->replace_state) { 744 switch (dev_replace->replace_state) {
745 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 745 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
746 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 746 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -756,7 +756,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
756 break; 756 break;
757 } 757 }
758 758
759 btrfs_dev_replace_unlock(dev_replace); 759 btrfs_dev_replace_unlock(dev_replace, 1);
760 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 760 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
761} 761}
762 762
@@ -766,12 +766,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
766 struct task_struct *task; 766 struct task_struct *task;
767 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 767 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
768 768
769 btrfs_dev_replace_lock(dev_replace); 769 btrfs_dev_replace_lock(dev_replace, 1);
770 switch (dev_replace->replace_state) { 770 switch (dev_replace->replace_state) {
771 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 771 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
772 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 772 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
773 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 773 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
774 btrfs_dev_replace_unlock(dev_replace); 774 btrfs_dev_replace_unlock(dev_replace, 1);
775 return 0; 775 return 0;
776 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 776 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
777 break; 777 break;
@@ -784,10 +784,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
784 btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); 784 btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
785 btrfs_info(fs_info, 785 btrfs_info(fs_info,
786 "you may cancel the operation after 'mount -o degraded'"); 786 "you may cancel the operation after 'mount -o degraded'");
787 btrfs_dev_replace_unlock(dev_replace); 787 btrfs_dev_replace_unlock(dev_replace, 1);
788 return 0; 788 return 0;
789 } 789 }
790 btrfs_dev_replace_unlock(dev_replace); 790 btrfs_dev_replace_unlock(dev_replace, 1);
791 791
792 WARN_ON(atomic_xchg( 792 WARN_ON(atomic_xchg(
793 &fs_info->mutually_exclusive_operation_running, 1)); 793 &fs_info->mutually_exclusive_operation_running, 1));
@@ -802,7 +802,7 @@ static int btrfs_dev_replace_kthread(void *data)
802 struct btrfs_ioctl_dev_replace_args *status_args; 802 struct btrfs_ioctl_dev_replace_args *status_args;
803 u64 progress; 803 u64 progress;
804 804
805 status_args = kzalloc(sizeof(*status_args), GFP_NOFS); 805 status_args = kzalloc(sizeof(*status_args), GFP_KERNEL);
806 if (status_args) { 806 if (status_args) {
807 btrfs_dev_replace_status(fs_info, status_args); 807 btrfs_dev_replace_status(fs_info, status_args);
808 progress = status_args->status.progress_1000; 808 progress = status_args->status.progress_1000;
@@ -858,55 +858,65 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
858 * not called and the the filesystem is remounted 858 * not called and the the filesystem is remounted
859 * in degraded state. This does not stop the 859 * in degraded state. This does not stop the
860 * dev_replace procedure. It needs to be canceled 860 * dev_replace procedure. It needs to be canceled
861 * manually if the cancelation is wanted. 861 * manually if the cancellation is wanted.
862 */ 862 */
863 break; 863 break;
864 } 864 }
865 return 1; 865 return 1;
866} 866}
867 867
868void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) 868void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
869{ 869{
870 /* the beginning is just an optimization for the typical case */ 870 if (rw == 1) {
871 if (atomic_read(&dev_replace->nesting_level) == 0) { 871 /* write */
872acquire_lock: 872again:
873 /* this is not a nested case where the same thread 873 wait_event(dev_replace->read_lock_wq,
874 * is trying to acqurire the same lock twice */ 874 atomic_read(&dev_replace->blocking_readers) == 0);
875 mutex_lock(&dev_replace->lock); 875 write_lock(&dev_replace->lock);
876 mutex_lock(&dev_replace->lock_management_lock); 876 if (atomic_read(&dev_replace->blocking_readers)) {
877 dev_replace->lock_owner = current->pid; 877 write_unlock(&dev_replace->lock);
878 atomic_inc(&dev_replace->nesting_level); 878 goto again;
879 mutex_unlock(&dev_replace->lock_management_lock); 879 }
880 return; 880 } else {
881 read_lock(&dev_replace->lock);
882 atomic_inc(&dev_replace->read_locks);
881 } 883 }
884}
882 885
883 mutex_lock(&dev_replace->lock_management_lock); 886void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
884 if (atomic_read(&dev_replace->nesting_level) > 0 && 887{
885 dev_replace->lock_owner == current->pid) { 888 if (rw == 1) {
886 WARN_ON(!mutex_is_locked(&dev_replace->lock)); 889 /* write */
887 atomic_inc(&dev_replace->nesting_level); 890 ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
888 mutex_unlock(&dev_replace->lock_management_lock); 891 write_unlock(&dev_replace->lock);
889 return; 892 } else {
893 ASSERT(atomic_read(&dev_replace->read_locks) > 0);
894 atomic_dec(&dev_replace->read_locks);
895 read_unlock(&dev_replace->lock);
890 } 896 }
897}
891 898
892 mutex_unlock(&dev_replace->lock_management_lock); 899/* inc blocking cnt and release read lock */
893 goto acquire_lock; 900void btrfs_dev_replace_set_lock_blocking(
901 struct btrfs_dev_replace *dev_replace)
902{
903 /* only set blocking for read lock */
904 ASSERT(atomic_read(&dev_replace->read_locks) > 0);
905 atomic_inc(&dev_replace->blocking_readers);
906 read_unlock(&dev_replace->lock);
894} 907}
895 908
896void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) 909/* acquire read lock and dec blocking cnt */
910void btrfs_dev_replace_clear_lock_blocking(
911 struct btrfs_dev_replace *dev_replace)
897{ 912{
898 WARN_ON(!mutex_is_locked(&dev_replace->lock)); 913 /* only set blocking for read lock */
899 mutex_lock(&dev_replace->lock_management_lock); 914 ASSERT(atomic_read(&dev_replace->read_locks) > 0);
900 WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); 915 ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
901 WARN_ON(dev_replace->lock_owner != current->pid); 916 read_lock(&dev_replace->lock);
902 atomic_dec(&dev_replace->nesting_level); 917 if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
903 if (atomic_read(&dev_replace->nesting_level) == 0) { 918 waitqueue_active(&dev_replace->read_lock_wq))
904 dev_replace->lock_owner = 0; 919 wake_up(&dev_replace->read_lock_wq);
905 mutex_unlock(&dev_replace->lock_management_lock);
906 mutex_unlock(&dev_replace->lock);
907 } else {
908 mutex_unlock(&dev_replace->lock_management_lock);
909 }
910} 920}
911 921
912void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) 922void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
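
The hunks above replace dev-replace's hand-rolled recursive mutex (lock, lock_management_lock, nesting_level, lock_owner) with a read/write protocol: btrfs_dev_replace_lock(dev_replace, 1) takes the write side, btrfs_dev_replace_lock(dev_replace, 0) the read side, and btrfs_dev_replace_set_lock_blocking() lets a reader drop the spinning rwlock while staying accounted in blocking_readers; writers wait on read_lock_wq until that count drains, then re-check it under the write lock. A minimal userspace model of the protocol, with pthread primitives standing in for rwlock_t, atomic_t and the wait queue (names mirror the kernel fields, everything else is illustrative; the read_locks counter, which only feeds the ASSERTs, is omitted):

    #include <pthread.h>
    #include <stdatomic.h>

    struct dev_replace_lock {
            pthread_rwlock_t lock;        /* models rwlock_t */
            atomic_int blocking_readers;  /* readers that dropped the lock */
            pthread_mutex_t wq_mutex;     /* with wq_cond, models read_lock_wq */
            pthread_cond_t wq_cond;
    };

    static void dr_lock(struct dev_replace_lock *d, int rw)
    {
            if (rw == 1) {
                    for (;;) {
                            /* wait_event(read_lock_wq, blocking_readers == 0) */
                            pthread_mutex_lock(&d->wq_mutex);
                            while (atomic_load(&d->blocking_readers))
                                    pthread_cond_wait(&d->wq_cond, &d->wq_mutex);
                            pthread_mutex_unlock(&d->wq_mutex);

                            pthread_rwlock_wrlock(&d->lock);
                            /* a reader may have gone blocking in between */
                            if (!atomic_load(&d->blocking_readers))
                                    return;
                            pthread_rwlock_unlock(&d->lock);  /* goto again */
                    }
            }
            pthread_rwlock_rdlock(&d->lock);
    }

    static void dr_set_blocking(struct dev_replace_lock *d)
    {
            /* reader is about to sleep: stay accounted, drop the lock */
            atomic_fetch_add(&d->blocking_readers, 1);
            pthread_rwlock_unlock(&d->lock);
    }

    static void dr_clear_blocking(struct dev_replace_lock *d)
    {
            pthread_rwlock_rdlock(&d->lock);
            /* last blocking reader wakes any waiting writer */
            if (atomic_fetch_sub(&d->blocking_readers, 1) == 1) {
                    pthread_mutex_lock(&d->wq_mutex);
                    pthread_cond_broadcast(&d->wq_cond);
                    pthread_mutex_unlock(&d->wq_mutex);
            }
    }

The re-check after taking the write lock matters: a reader can go blocking between the writer's wait and its lock acquisition, which is exactly what the goto again in the hunk handles.
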
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 20035cbbf021..29e3ef5f96bd 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -34,8 +34,11 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); 34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); 35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); 36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); 37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); 38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw);
39void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
40void btrfs_dev_replace_clear_lock_blocking(
41 struct btrfs_dev_replace *dev_replace);
39 42
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) 43static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{ 44{
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4545e2e2ad45..4b02591b0301 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,7 @@
50#include "raid56.h" 50#include "raid56.h"
51#include "sysfs.h" 51#include "sysfs.h"
52#include "qgroup.h" 52#include "qgroup.h"
53#include "compression.h"
53 54
54#ifdef CONFIG_X86 55#ifdef CONFIG_X86
55#include <asm/cpufeature.h> 56#include <asm/cpufeature.h>
@@ -110,8 +111,7 @@ int __init btrfs_end_io_wq_init(void)
110 111
111void btrfs_end_io_wq_exit(void) 112void btrfs_end_io_wq_exit(void)
112{ 113{
113 if (btrfs_end_io_wq_cache) 114 kmem_cache_destroy(btrfs_end_io_wq_cache);
114 kmem_cache_destroy(btrfs_end_io_wq_cache);
115} 115}
116 116
117/* 117/*
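
The guard removed here (and the matching ones in extent_io.c and extent_map.c further down) relies on kmem_cache_destroy() being NULL-safe, the same contract the C library gives free(). A userspace illustration of the idiom:

    #include <stdlib.h>

    /* free(NULL) is defined to be a no-op, so the guard is redundant;
     * kmem_cache_destroy() gives kernel cleanup paths the same guarantee. */
    int main(void)
    {
            void *cache = NULL;
            free(cache);    /* fine: nothing to destroy */
            return 0;
    }
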
@@ -612,6 +612,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
612 int found_level; 612 int found_level;
613 struct extent_buffer *eb; 613 struct extent_buffer *eb;
614 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 614 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
615 struct btrfs_fs_info *fs_info = root->fs_info;
615 int ret = 0; 616 int ret = 0;
616 int reads_done; 617 int reads_done;
617 618
@@ -637,21 +638,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
637 638
638 found_start = btrfs_header_bytenr(eb); 639 found_start = btrfs_header_bytenr(eb);
639 if (found_start != eb->start) { 640 if (found_start != eb->start) {
640 btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", 641 btrfs_err_rl(fs_info, "bad tree block start %llu %llu",
641 found_start, eb->start); 642 found_start, eb->start);
642 ret = -EIO; 643 ret = -EIO;
643 goto err; 644 goto err;
644 } 645 }
645 if (check_tree_block_fsid(root->fs_info, eb)) { 646 if (check_tree_block_fsid(fs_info, eb)) {
646 btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", 647 btrfs_err_rl(fs_info, "bad fsid on block %llu",
647 eb->start); 648 eb->start);
648 ret = -EIO; 649 ret = -EIO;
649 goto err; 650 goto err;
650 } 651 }
651 found_level = btrfs_header_level(eb); 652 found_level = btrfs_header_level(eb);
652 if (found_level >= BTRFS_MAX_LEVEL) { 653 if (found_level >= BTRFS_MAX_LEVEL) {
653 btrfs_err(root->fs_info, "bad tree block level %d", 654 btrfs_err(fs_info, "bad tree block level %d",
654 (int)btrfs_header_level(eb)); 655 (int)btrfs_header_level(eb));
655 ret = -EIO; 656 ret = -EIO;
656 goto err; 657 goto err;
657 } 658 }
@@ -659,7 +660,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
659 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), 660 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
660 eb, found_level); 661 eb, found_level);
661 662
662 ret = csum_tree_block(root->fs_info, eb, 1); 663 ret = csum_tree_block(fs_info, eb, 1);
663 if (ret) { 664 if (ret) {
664 ret = -EIO; 665 ret = -EIO;
665 goto err; 666 goto err;
@@ -680,7 +681,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
680 err: 681 err:
681 if (reads_done && 682 if (reads_done &&
682 test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 683 test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
683 btree_readahead_hook(root, eb, eb->start, ret); 684 btree_readahead_hook(fs_info, eb, eb->start, ret);
684 685
685 if (ret) { 686 if (ret) {
686 /* 687 /*
@@ -699,14 +700,13 @@ out:
699static int btree_io_failed_hook(struct page *page, int failed_mirror) 700static int btree_io_failed_hook(struct page *page, int failed_mirror)
700{ 701{
701 struct extent_buffer *eb; 702 struct extent_buffer *eb;
702 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
703 703
704 eb = (struct extent_buffer *)page->private; 704 eb = (struct extent_buffer *)page->private;
705 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 705 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
706 eb->read_mirror = failed_mirror; 706 eb->read_mirror = failed_mirror;
707 atomic_dec(&eb->io_pages); 707 atomic_dec(&eb->io_pages);
708 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 708 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
709 btree_readahead_hook(root, eb, eb->start, -EIO); 709 btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
710 return -EIO; /* we fixed nothing */ 710 return -EIO; /* we fixed nothing */
711} 711}
712 712
@@ -816,7 +816,7 @@ static void run_one_async_done(struct btrfs_work *work)
816 waitqueue_active(&fs_info->async_submit_wait)) 816 waitqueue_active(&fs_info->async_submit_wait))
817 wake_up(&fs_info->async_submit_wait); 817 wake_up(&fs_info->async_submit_wait);
818 818
819 /* If an error occured we just want to clean up the bio and move on */ 819 /* If an error occurred we just want to clean up the bio and move on */
820 if (async->error) { 820 if (async->error) {
821 async->bio->bi_error = async->error; 821 async->bio->bi_error = async->error;
822 bio_endio(async->bio); 822 bio_endio(async->bio);
@@ -931,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
931 if (bio_flags & EXTENT_BIO_TREE_LOG) 931 if (bio_flags & EXTENT_BIO_TREE_LOG)
932 return 0; 932 return 0;
933#ifdef CONFIG_X86 933#ifdef CONFIG_X86
934 if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) 934 if (static_cpu_has(X86_FEATURE_XMM4_2))
935 return 0; 935 return 0;
936#endif 936#endif
937 return 1; 937 return 1;
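
check_async_write() decides whether metadata checksumming is cheap enough to do inline: with SSE4.2, the hardware crc32 instruction makes CRC32C nearly free, so the bio is not punted to a worker thread. static_cpu_has_safe() becomes plain static_cpu_has() because the two were merged in the x86 cpufeature rework; the kernel variant patches the branch at boot rather than testing on every call. A userspace analogue of the capability test, using the gcc/clang builtin (messages are illustrative):

    #include <stdio.h>

    int main(void)
    {
            if (__builtin_cpu_supports("sse4.2"))
                    printf("CRC32c in hardware: checksum inline\n");
            else
                    printf("no SSE4.2: offload checksumming to workers\n");
            return 0;
    }
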
@@ -1296,9 +1296,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
1296 spin_lock_init(&root->root_item_lock); 1296 spin_lock_init(&root->root_item_lock);
1297} 1297}
1298 1298
1299static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) 1299static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
1300 gfp_t flags)
1300{ 1301{
1301 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); 1302 struct btrfs_root *root = kzalloc(sizeof(*root), flags);
1302 if (root) 1303 if (root)
1303 root->fs_info = fs_info; 1304 root->fs_info = fs_info;
1304 return root; 1305 return root;
@@ -1310,7 +1311,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
1310{ 1311{
1311 struct btrfs_root *root; 1312 struct btrfs_root *root;
1312 1313
1313 root = btrfs_alloc_root(NULL); 1314 root = btrfs_alloc_root(NULL, GFP_KERNEL);
1314 if (!root) 1315 if (!root)
1315 return ERR_PTR(-ENOMEM); 1316 return ERR_PTR(-ENOMEM);
1316 __setup_root(4096, 4096, 4096, root, NULL, 1); 1317 __setup_root(4096, 4096, 4096, root, NULL, 1);
@@ -1332,7 +1333,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1332 int ret = 0; 1333 int ret = 0;
1333 uuid_le uuid; 1334 uuid_le uuid;
1334 1335
1335 root = btrfs_alloc_root(fs_info); 1336 root = btrfs_alloc_root(fs_info, GFP_KERNEL);
1336 if (!root) 1337 if (!root)
1337 return ERR_PTR(-ENOMEM); 1338 return ERR_PTR(-ENOMEM);
1338 1339
@@ -1408,7 +1409,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1408 struct btrfs_root *tree_root = fs_info->tree_root; 1409 struct btrfs_root *tree_root = fs_info->tree_root;
1409 struct extent_buffer *leaf; 1410 struct extent_buffer *leaf;
1410 1411
1411 root = btrfs_alloc_root(fs_info); 1412 root = btrfs_alloc_root(fs_info, GFP_NOFS);
1412 if (!root) 1413 if (!root)
1413 return ERR_PTR(-ENOMEM); 1414 return ERR_PTR(-ENOMEM);
1414 1415
@@ -1506,7 +1507,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1506 if (!path) 1507 if (!path)
1507 return ERR_PTR(-ENOMEM); 1508 return ERR_PTR(-ENOMEM);
1508 1509
1509 root = btrfs_alloc_root(fs_info); 1510 root = btrfs_alloc_root(fs_info, GFP_NOFS);
1510 if (!root) { 1511 if (!root) {
1511 ret = -ENOMEM; 1512 ret = -ENOMEM;
1512 goto alloc_fail; 1513 goto alloc_fail;
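
btrfs_alloc_root() now takes the gfp mask from its caller instead of hard-coding GFP_NOFS. The split visible in these hunks, with the rationale inferred from the call sites rather than stated by the diff:

    /*
     *   GFP_KERNEL - btrfs_alloc_dummy_root(), btrfs_create_tree(),
     *                log replay and open_ctree(): mount/test paths where
     *                memory reclaim may safely recurse
     *   GFP_NOFS   - alloc_log_tree(), btrfs_read_tree_root(): can run
     *                with fs locks or a transaction held, so reclaim must
     *                not re-enter the filesystem
     */
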
@@ -2272,9 +2273,11 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2272 fs_info->dev_replace.lock_owner = 0; 2273 fs_info->dev_replace.lock_owner = 0;
2273 atomic_set(&fs_info->dev_replace.nesting_level, 0); 2274 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2274 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); 2275 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2275 mutex_init(&fs_info->dev_replace.lock_management_lock); 2276 rwlock_init(&fs_info->dev_replace.lock);
2276 mutex_init(&fs_info->dev_replace.lock); 2277 atomic_set(&fs_info->dev_replace.read_locks, 0);
2278 atomic_set(&fs_info->dev_replace.blocking_readers, 0);
2277 init_waitqueue_head(&fs_info->replace_wait); 2279 init_waitqueue_head(&fs_info->replace_wait);
2280 init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
2278} 2281}
2279 2282
2280static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) 2283static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
@@ -2385,7 +2388,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2385 return -EIO; 2388 return -EIO;
2386 } 2389 }
2387 2390
2388 log_tree_root = btrfs_alloc_root(fs_info); 2391 log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2389 if (!log_tree_root) 2392 if (!log_tree_root)
2390 return -ENOMEM; 2393 return -ENOMEM;
2391 2394
@@ -2510,8 +2513,8 @@ int open_ctree(struct super_block *sb,
2510 int backup_index = 0; 2513 int backup_index = 0;
2511 int max_active; 2514 int max_active;
2512 2515
2513 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2516 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2514 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2517 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2515 if (!tree_root || !chunk_root) { 2518 if (!tree_root || !chunk_root) {
2516 err = -ENOMEM; 2519 err = -ENOMEM;
2517 goto fail; 2520 goto fail;
@@ -2603,6 +2606,7 @@ int open_ctree(struct super_block *sb,
2603 atomic_set(&fs_info->nr_async_bios, 0); 2606 atomic_set(&fs_info->nr_async_bios, 0);
2604 atomic_set(&fs_info->defrag_running, 0); 2607 atomic_set(&fs_info->defrag_running, 0);
2605 atomic_set(&fs_info->qgroup_op_seq, 0); 2608 atomic_set(&fs_info->qgroup_op_seq, 0);
2609 atomic_set(&fs_info->reada_works_cnt, 0);
2606 atomic64_set(&fs_info->tree_mod_seq, 0); 2610 atomic64_set(&fs_info->tree_mod_seq, 0);
2607 fs_info->sb = sb; 2611 fs_info->sb = sb;
2608 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; 2612 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2622,7 +2626,7 @@ int open_ctree(struct super_block *sb,
2622 INIT_LIST_HEAD(&fs_info->ordered_roots); 2626 INIT_LIST_HEAD(&fs_info->ordered_roots);
2623 spin_lock_init(&fs_info->ordered_root_lock); 2627 spin_lock_init(&fs_info->ordered_root_lock);
2624 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2628 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2625 GFP_NOFS); 2629 GFP_KERNEL);
2626 if (!fs_info->delayed_root) { 2630 if (!fs_info->delayed_root) {
2627 err = -ENOMEM; 2631 err = -ENOMEM;
2628 goto fail_iput; 2632 goto fail_iput;
@@ -2750,7 +2754,7 @@ int open_ctree(struct super_block *sb,
2750 */ 2754 */
2751 fs_info->compress_type = BTRFS_COMPRESS_ZLIB; 2755 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
2752 2756
2753 ret = btrfs_parse_options(tree_root, options); 2757 ret = btrfs_parse_options(tree_root, options, sb->s_flags);
2754 if (ret) { 2758 if (ret) {
2755 err = ret; 2759 err = ret;
2756 goto fail_alloc; 2760 goto fail_alloc;
@@ -3029,8 +3033,9 @@ retry_root_backup:
3029 if (ret) 3033 if (ret)
3030 goto fail_trans_kthread; 3034 goto fail_trans_kthread;
3031 3035
3032 /* do not make disk changes in broken FS */ 3036 /* do not make disk changes in broken FS or nologreplay is given */
3033 if (btrfs_super_log_root(disk_super) != 0) { 3037 if (btrfs_super_log_root(disk_super) != 0 &&
3038 !btrfs_test_opt(tree_root, NOLOGREPLAY)) {
3034 ret = btrfs_replay_log(fs_info, fs_devices); 3039 ret = btrfs_replay_log(fs_info, fs_devices);
3035 if (ret) { 3040 if (ret) {
3036 err = ret; 3041 err = ret;
@@ -3146,6 +3151,12 @@ retry_root_backup:
3146 3151
3147 fs_info->open = 1; 3152 fs_info->open = 1;
3148 3153
3154 /*
3155 * backuproot only affect mount behavior, and if open_ctree succeeded,
3156 * no need to keep the flag
3157 */
3158 btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
3159
3149 return 0; 3160 return 0;
3150 3161
3151fail_qgroup: 3162fail_qgroup:
@@ -3200,7 +3211,7 @@ fail:
3200 return err; 3211 return err;
3201 3212
3202 recovery_tree_root: 3213 recovery_tree_root:
3203 if (!btrfs_test_opt(tree_root, RECOVERY)) 3214 if (!btrfs_test_opt(tree_root, USEBACKUPROOT))
3204 goto fail_tree_roots; 3215 goto fail_tree_roots;
3205 3216
3206 free_root_pointers(fs_info, 0); 3217 free_root_pointers(fs_info, 0);
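
Two mount-option changes surface in open_ctree(): the tree log is only replayed when nologreplay is not set (intended for inspecting a damaged filesystem read-only without modifying it), and the old "recovery" option appears under its clearer name usebackuproot, cleared once the open succeeds since it only influences mount itself. A minimal model of the new control flow (the option names are real, everything else is illustrative):

    #include <stdio.h>

    /* mount options are bits in fs_info->mount_opt; btrfs_test_opt()
     * and btrfs_clear_opt() are bit tests on that word */
    #define OPT_NOLOGREPLAY   (1u << 0)
    #define OPT_USEBACKUPROOT (1u << 1)

    static int open_ctree_model(unsigned int *mount_opt, int have_log)
    {
            if (have_log && !(*mount_opt & OPT_NOLOGREPLAY))
                    printf("replaying tree log\n");

            /* usebackuproot only affects mount; once the open
             * succeeded there is no reason to keep the flag */
            *mount_opt &= ~OPT_USEBACKUPROOT;
            return 0;
    }

    int main(void)
    {
            unsigned int opts = OPT_USEBACKUPROOT;
            return open_ctree_model(&opts, 1);
    }
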
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2287c7c10be..53e12977bfd0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4838,7 +4838,7 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4838 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4838 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4839 4839
4840 /* If we're just plain full then async reclaim just slows us down. */ 4840 /* If we're just plain full then async reclaim just slows us down. */
4841 if (space_info->bytes_used >= thresh) 4841 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4842 return 0; 4842 return 0;
4843 4843
4844 return (used >= thresh && !btrfs_fs_closing(fs_info) && 4844 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
@@ -5373,27 +5373,33 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5373 5373
5374 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5374 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5375 5375
5376 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 5376 if (block_rsv->reserved < block_rsv->size) {
5377 sinfo->bytes_reserved + sinfo->bytes_readonly + 5377 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5378 sinfo->bytes_may_use; 5378 sinfo->bytes_reserved + sinfo->bytes_readonly +
5379 5379 sinfo->bytes_may_use;
5380 if (sinfo->total_bytes > num_bytes) { 5380 if (sinfo->total_bytes > num_bytes) {
5381 num_bytes = sinfo->total_bytes - num_bytes; 5381 num_bytes = sinfo->total_bytes - num_bytes;
5382 block_rsv->reserved += num_bytes; 5382 num_bytes = min(num_bytes,
5383 sinfo->bytes_may_use += num_bytes; 5383 block_rsv->size - block_rsv->reserved);
5384 trace_btrfs_space_reservation(fs_info, "space_info", 5384 block_rsv->reserved += num_bytes;
5385 sinfo->flags, num_bytes, 1); 5385 sinfo->bytes_may_use += num_bytes;
5386 } 5386 trace_btrfs_space_reservation(fs_info, "space_info",
5387 5387 sinfo->flags, num_bytes,
5388 if (block_rsv->reserved >= block_rsv->size) { 5388 1);
5389 }
5390 } else if (block_rsv->reserved > block_rsv->size) {
5389 num_bytes = block_rsv->reserved - block_rsv->size; 5391 num_bytes = block_rsv->reserved - block_rsv->size;
5390 sinfo->bytes_may_use -= num_bytes; 5392 sinfo->bytes_may_use -= num_bytes;
5391 trace_btrfs_space_reservation(fs_info, "space_info", 5393 trace_btrfs_space_reservation(fs_info, "space_info",
5392 sinfo->flags, num_bytes, 0); 5394 sinfo->flags, num_bytes, 0);
5393 block_rsv->reserved = block_rsv->size; 5395 block_rsv->reserved = block_rsv->size;
5394 block_rsv->full = 1;
5395 } 5396 }
5396 5397
5398 if (block_rsv->reserved == block_rsv->size)
5399 block_rsv->full = 1;
5400 else
5401 block_rsv->full = 0;
5402
5397 spin_unlock(&block_rsv->lock); 5403 spin_unlock(&block_rsv->lock);
5398 spin_unlock(&sinfo->lock); 5404 spin_unlock(&sinfo->lock);
5399} 5405}
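
update_global_block_rsv() is reshaped so the reserve is only topped up while under-filled, the top-up is clamped to the remaining gap with min(), and ->full is derived from the final state instead of being set on one branch only. A compilable model of the new logic (field names mirror the kernel structs; the in_use aggregate stands for used + pinned + reserved + readonly + may_use):

    #include <stdio.h>

    typedef unsigned long long u64;

    static u64 min_u64(u64 a, u64 b) { return a < b ? a : b; }

    struct rsv   { u64 size, reserved; int full; };
    struct sinfo { u64 total_bytes, in_use; };

    static void update_rsv(struct rsv *r, struct sinfo *s)
    {
            if (r->reserved < r->size) {
                    if (s->total_bytes > s->in_use) {
                            /* never add more than the gap to ->size */
                            u64 add = min_u64(s->total_bytes - s->in_use,
                                              r->size - r->reserved);
                            r->reserved += add;
                            s->in_use += add;
                    }
            } else if (r->reserved > r->size) {
                    s->in_use -= r->reserved - r->size;
                    r->reserved = r->size;
            }
            r->full = (r->reserved == r->size);
    }

    int main(void)
    {
            struct sinfo s = { .total_bytes = 100, .in_use = 40 };
            struct rsv r = { .size = 50, .reserved = 10 };

            update_rsv(&r, &s);     /* tops up by min(60, 40) = 40 */
            printf("reserved=%llu full=%d\n", r.reserved, r.full);
            return 0;
    }
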
@@ -5752,7 +5758,7 @@ out_fail:
5752 5758
5753 /* 5759 /*
5754 * This is tricky, but first we need to figure out how much we 5760 * This is tricky, but first we need to figure out how much we
5755 * free'd from any free-ers that occured during this 5761 * free'd from any free-ers that occurred during this
5756 * reservation, so we reset ->csum_bytes to the csum_bytes 5762 * reservation, so we reset ->csum_bytes to the csum_bytes
5757 * before we dropped our lock, and then call the free for the 5763 * before we dropped our lock, and then call the free for the
5758 * number of bytes that were freed while we were trying our 5764 * number of bytes that were freed while we were trying our
@@ -7018,7 +7024,7 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7018 struct btrfs_free_cluster *cluster, 7024 struct btrfs_free_cluster *cluster,
7019 int delalloc) 7025 int delalloc)
7020{ 7026{
7021 struct btrfs_block_group_cache *used_bg; 7027 struct btrfs_block_group_cache *used_bg = NULL;
7022 bool locked = false; 7028 bool locked = false;
7023again: 7029again:
7024 spin_lock(&cluster->refill_lock); 7030 spin_lock(&cluster->refill_lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 392592dc7010..76a0c8597d98 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -206,10 +206,8 @@ void extent_io_exit(void)
206 * destroy caches. 206 * destroy caches.
207 */ 207 */
208 rcu_barrier(); 208 rcu_barrier();
209 if (extent_state_cache) 209 kmem_cache_destroy(extent_state_cache);
210 kmem_cache_destroy(extent_state_cache); 210 kmem_cache_destroy(extent_buffer_cache);
211 if (extent_buffer_cache)
212 kmem_cache_destroy(extent_buffer_cache);
213 if (btrfs_bioset) 211 if (btrfs_bioset)
214 bioset_free(btrfs_bioset); 212 bioset_free(btrfs_bioset);
215} 213}
@@ -232,7 +230,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
232 if (!state) 230 if (!state)
233 return state; 231 return state;
234 state->state = 0; 232 state->state = 0;
235 state->private = 0; 233 state->failrec = NULL;
236 RB_CLEAR_NODE(&state->rb_node); 234 RB_CLEAR_NODE(&state->rb_node);
237 btrfs_leak_debug_add(&state->leak_list, &states); 235 btrfs_leak_debug_add(&state->leak_list, &states);
238 atomic_set(&state->refs, 1); 236 atomic_set(&state->refs, 1);
@@ -1844,7 +1842,8 @@ out:
1844 * set the private field for a given byte offset in the tree. If there isn't 1842 * set the private field for a given byte offset in the tree. If there isn't
1845 * an extent_state there already, this does nothing. 1843 * an extent_state there already, this does nothing.
1846 */ 1844 */
1847static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1845static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
1846 struct io_failure_record *failrec)
1848{ 1847{
1849 struct rb_node *node; 1848 struct rb_node *node;
1850 struct extent_state *state; 1849 struct extent_state *state;
@@ -1865,13 +1864,14 @@ static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private
1865 ret = -ENOENT; 1864 ret = -ENOENT;
1866 goto out; 1865 goto out;
1867 } 1866 }
1868 state->private = private; 1867 state->failrec = failrec;
1869out: 1868out:
1870 spin_unlock(&tree->lock); 1869 spin_unlock(&tree->lock);
1871 return ret; 1870 return ret;
1872} 1871}
1873 1872
1874int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1873static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
1874 struct io_failure_record **failrec)
1875{ 1875{
1876 struct rb_node *node; 1876 struct rb_node *node;
1877 struct extent_state *state; 1877 struct extent_state *state;
@@ -1892,7 +1892,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1892 ret = -ENOENT; 1892 ret = -ENOENT;
1893 goto out; 1893 goto out;
1894 } 1894 }
1895 *private = state->private; 1895 *failrec = state->failrec;
1896out: 1896out:
1897 spin_unlock(&tree->lock); 1897 spin_unlock(&tree->lock);
1898 return ret; 1898 return ret;
@@ -1972,7 +1972,7 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
1972 int err = 0; 1972 int err = 0;
1973 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1973 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1974 1974
1975 set_state_private(failure_tree, rec->start, 0); 1975 set_state_failrec(failure_tree, rec->start, NULL);
1976 ret = clear_extent_bits(failure_tree, rec->start, 1976 ret = clear_extent_bits(failure_tree, rec->start,
1977 rec->start + rec->len - 1, 1977 rec->start + rec->len - 1,
1978 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 1978 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
@@ -2089,7 +2089,6 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
2089 unsigned int pg_offset) 2089 unsigned int pg_offset)
2090{ 2090{
2091 u64 private; 2091 u64 private;
2092 u64 private_failure;
2093 struct io_failure_record *failrec; 2092 struct io_failure_record *failrec;
2094 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2093 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2095 struct extent_state *state; 2094 struct extent_state *state;
@@ -2102,12 +2101,11 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
2102 if (!ret) 2101 if (!ret)
2103 return 0; 2102 return 0;
2104 2103
2105 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, 2104 ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start,
2106 &private_failure); 2105 &failrec);
2107 if (ret) 2106 if (ret)
2108 return 0; 2107 return 0;
2109 2108
2110 failrec = (struct io_failure_record *)(unsigned long) private_failure;
2111 BUG_ON(!failrec->this_mirror); 2109 BUG_ON(!failrec->this_mirror);
2112 2110
2113 if (failrec->in_validation) { 2111 if (failrec->in_validation) {
@@ -2167,7 +2165,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2167 2165
2168 next = next_state(state); 2166 next = next_state(state);
2169 2167
2170 failrec = (struct io_failure_record *)(unsigned long)state->private; 2168 failrec = state->failrec;
2171 free_extent_state(state); 2169 free_extent_state(state);
2172 kfree(failrec); 2170 kfree(failrec);
2173 2171
@@ -2177,10 +2175,9 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2177} 2175}
2178 2176
2179int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 2177int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2180 struct io_failure_record **failrec_ret) 2178 struct io_failure_record **failrec_ret)
2181{ 2179{
2182 struct io_failure_record *failrec; 2180 struct io_failure_record *failrec;
2183 u64 private;
2184 struct extent_map *em; 2181 struct extent_map *em;
2185 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2182 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2186 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2183 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -2188,7 +2185,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2188 int ret; 2185 int ret;
2189 u64 logical; 2186 u64 logical;
2190 2187
2191 ret = get_state_private(failure_tree, start, &private); 2188 ret = get_state_failrec(failure_tree, start, &failrec);
2192 if (ret) { 2189 if (ret) {
2193 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2190 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2194 if (!failrec) 2191 if (!failrec)
@@ -2237,8 +2234,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2237 ret = set_extent_bits(failure_tree, start, end, 2234 ret = set_extent_bits(failure_tree, start, end,
2238 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 2235 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2239 if (ret >= 0) 2236 if (ret >= 0)
2240 ret = set_state_private(failure_tree, start, 2237 ret = set_state_failrec(failure_tree, start, failrec);
2241 (u64)(unsigned long)failrec);
2242 /* set the bits in the inode's tree */ 2238 /* set the bits in the inode's tree */
2243 if (ret >= 0) 2239 if (ret >= 0)
2244 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, 2240 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
@@ -2248,7 +2244,6 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2248 return ret; 2244 return ret;
2249 } 2245 }
2250 } else { 2246 } else {
2251 failrec = (struct io_failure_record *)(unsigned long)private;
2252 pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n", 2247 pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
2253 failrec->logical, failrec->start, failrec->len, 2248 failrec->logical, failrec->start, failrec->len,
2254 failrec->in_validation); 2249 failrec->in_validation);
@@ -3177,7 +3172,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
3177 3172
3178 while (1) { 3173 while (1) {
3179 lock_extent(tree, start, end); 3174 lock_extent(tree, start, end);
3180 ordered = btrfs_lookup_ordered_extent(inode, start); 3175 ordered = btrfs_lookup_ordered_range(inode, start,
3176 PAGE_CACHE_SIZE);
3181 if (!ordered) 3177 if (!ordered)
3182 break; 3178 break;
3183 unlock_extent(tree, start, end); 3179 unlock_extent(tree, start, end);
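
The extent_io.c changes retire the per-state "private" u64, which only ever held a casted io_failure_record pointer: the field becomes a typed failrec, set/get_state_private become set/get_state_failrec, and the double casts disappear so the compiler can check what used to be laundered through u64 (the last hunk also switches the readpage path to a range-based ordered-extent lookup, more subpage-blocksize groundwork). A self-contained before/after of the typing change:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct io_failure_record { int this_mirror; };

    /* before: a typeless slot, every user casts in and out */
    struct state_old { uint64_t private; };

    /* after: the slot says what it holds; storing anything else
     * is now a compile error */
    struct state_new { struct io_failure_record *failrec; };

    int main(void)
    {
            struct io_failure_record *rec = calloc(1, sizeof(*rec));
            struct state_old o = { .private = (uint64_t)(uintptr_t)rec };
            struct state_new n = { .failrec = rec };

            /* the old read side needed the reverse cast: */
            struct io_failure_record *r1 =
                    (struct io_failure_record *)(uintptr_t)o.private;
            struct io_failure_record *r2 = n.failrec;  /* new: direct */

            printf("%d %d\n", r1->this_mirror, r2->this_mirror);
            free(rec);
            return 0;
    }
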
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 880d5292e972..5dbf92e68fbd 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -61,6 +61,7 @@
61struct extent_state; 61struct extent_state;
62struct btrfs_root; 62struct btrfs_root;
63struct btrfs_io_bio; 63struct btrfs_io_bio;
64struct io_failure_record;
64 65
65typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 66typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
66 struct bio *bio, int mirror_num, 67 struct bio *bio, int mirror_num,
@@ -111,8 +112,7 @@ struct extent_state {
111 atomic_t refs; 112 atomic_t refs;
112 unsigned state; 113 unsigned state;
113 114
114 /* for use by the FS */ 115 struct io_failure_record *failrec;
115 u64 private;
116 116
117#ifdef CONFIG_BTRFS_DEBUG 117#ifdef CONFIG_BTRFS_DEBUG
118 struct list_head leak_list; 118 struct list_head leak_list;
@@ -342,7 +342,6 @@ int extent_readpages(struct extent_io_tree *tree,
342 get_extent_t get_extent); 342 get_extent_t get_extent);
343int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 343int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
344 __u64 start, __u64 len, get_extent_t *get_extent); 344 __u64 start, __u64 len, get_extent_t *get_extent);
345int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
346void set_page_extent_mapped(struct page *page); 345void set_page_extent_mapped(struct page *page);
347 346
348struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 347struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 84fb56d5c018..318b048eb254 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -4,6 +4,7 @@
4#include <linux/hardirq.h> 4#include <linux/hardirq.h>
5#include "ctree.h" 5#include "ctree.h"
6#include "extent_map.h" 6#include "extent_map.h"
7#include "compression.h"
7 8
8 9
9static struct kmem_cache *extent_map_cache; 10static struct kmem_cache *extent_map_cache;
@@ -20,8 +21,7 @@ int __init extent_map_init(void)
20 21
21void extent_map_exit(void) 22void extent_map_exit(void)
22{ 23{
23 if (extent_map_cache) 24 kmem_cache_destroy(extent_map_cache);
24 kmem_cache_destroy(extent_map_cache);
25} 25}
26 26
27/** 27/**
@@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void)
62 62
63/** 63/**
64 * free_extent_map - drop reference count of an extent_map 64 * free_extent_map - drop reference count of an extent_map
65 * @em: extent map beeing releasead 65 * @em: extent map being releasead
66 * 66 *
67 * Drops the reference out on @em by one and free the structure 67 * Drops the reference out on @em by one and free the structure
68 * if the reference count hits zero. 68 * if the reference count hits zero.
@@ -422,7 +422,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
422/** 422/**
423 * remove_extent_mapping - removes an extent_map from the extent tree 423 * remove_extent_mapping - removes an extent_map from the extent tree
424 * @tree: extent tree to remove from 424 * @tree: extent tree to remove from
425 * @em: extent map beeing removed 425 * @em: extent map being removed
426 * 426 *
427 * Removes @em from @tree. No reference counts are dropped, and no checks 427 * Removes @em from @tree. No reference counts are dropped, and no checks
428 * are done to see if the range is in use 428 * are done to see if the range is in use
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a67e1c828d0f..b5baf5bdc8e1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "volumes.h" 26#include "volumes.h"
27#include "print-tree.h" 27#include "print-tree.h"
28#include "compression.h"
28 29
29#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ 30#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
30 sizeof(struct btrfs_item) * 2) / \ 31 sizeof(struct btrfs_item) * 2) / \
@@ -172,6 +173,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
172 u64 item_start_offset = 0; 173 u64 item_start_offset = 0;
173 u64 item_last_offset = 0; 174 u64 item_last_offset = 0;
174 u64 disk_bytenr; 175 u64 disk_bytenr;
176 u64 page_bytes_left;
175 u32 diff; 177 u32 diff;
176 int nblocks; 178 int nblocks;
177 int bio_index = 0; 179 int bio_index = 0;
@@ -220,6 +222,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
220 disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; 222 disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
221 if (dio) 223 if (dio)
222 offset = logical_offset; 224 offset = logical_offset;
225
226 page_bytes_left = bvec->bv_len;
223 while (bio_index < bio->bi_vcnt) { 227 while (bio_index < bio->bi_vcnt) {
224 if (!dio) 228 if (!dio)
225 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 229 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
@@ -243,7 +247,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
243 if (BTRFS_I(inode)->root->root_key.objectid == 247 if (BTRFS_I(inode)->root->root_key.objectid ==
244 BTRFS_DATA_RELOC_TREE_OBJECTID) { 248 BTRFS_DATA_RELOC_TREE_OBJECTID) {
245 set_extent_bits(io_tree, offset, 249 set_extent_bits(io_tree, offset,
246 offset + bvec->bv_len - 1, 250 offset + root->sectorsize - 1,
247 EXTENT_NODATASUM, GFP_NOFS); 251 EXTENT_NODATASUM, GFP_NOFS);
248 } else { 252 } else {
249 btrfs_info(BTRFS_I(inode)->root->fs_info, 253 btrfs_info(BTRFS_I(inode)->root->fs_info,
@@ -281,13 +285,29 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
281found: 285found:
282 csum += count * csum_size; 286 csum += count * csum_size;
283 nblocks -= count; 287 nblocks -= count;
284 bio_index += count; 288
285 while (count--) { 289 while (count--) {
286 disk_bytenr += bvec->bv_len; 290 disk_bytenr += root->sectorsize;
287 offset += bvec->bv_len; 291 offset += root->sectorsize;
288 bvec++; 292 page_bytes_left -= root->sectorsize;
293 if (!page_bytes_left) {
294 bio_index++;
295 /*
296 * make sure we're still inside the
297 * bio before we update page_bytes_left
298 */
299 if (bio_index >= bio->bi_vcnt) {
300 WARN_ON_ONCE(count);
301 goto done;
302 }
303 bvec++;
304 page_bytes_left = bvec->bv_len;
305 }
306
289 } 307 }
290 } 308 }
309
310done:
291 btrfs_free_path(path); 311 btrfs_free_path(path);
292 return 0; 312 return 0;
293} 313}
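
Instead of assuming one bio_vec equals one checksummed block, the rewritten loop keeps page_bytes_left and advances disk_bytenr/offset in sectorsize steps, only moving to the next bvec once the current page is consumed; this is groundwork for sectorsize smaller than the page size. A compilable model of the iteration (a plain array of lengths stands in for the bio's bvecs):

    #include <stdio.h>

    int main(void)
    {
            const unsigned sectorsize = 4096;
            const unsigned bv_len[] = { 16384, 16384 };   /* two 16K "pages" */
            const int bi_vcnt = 2;

            unsigned long long disk_bytenr = 0;
            int bio_index = 0;
            unsigned page_bytes_left = bv_len[0];

            while (bio_index < bi_vcnt) {
                    printf("csum sector at %llu (page %d)\n",
                           disk_bytenr, bio_index);
                    disk_bytenr += sectorsize;
                    page_bytes_left -= sectorsize;
                    if (!page_bytes_left) {
                            /* stay inside the bio before reloading */
                            if (++bio_index >= bi_vcnt)
                                    break;
                            page_bytes_left = bv_len[bio_index];
                    }
            }
            return 0;
    }
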
@@ -432,6 +452,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
432 struct bio_vec *bvec = bio->bi_io_vec; 452 struct bio_vec *bvec = bio->bi_io_vec;
433 int bio_index = 0; 453 int bio_index = 0;
434 int index; 454 int index;
455 int nr_sectors;
456 int i;
435 unsigned long total_bytes = 0; 457 unsigned long total_bytes = 0;
436 unsigned long this_sum_bytes = 0; 458 unsigned long this_sum_bytes = 0;
437 u64 offset; 459 u64 offset;
@@ -459,41 +481,56 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
459 if (!contig) 481 if (!contig)
460 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 482 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
461 483
462 if (offset >= ordered->file_offset + ordered->len || 484 data = kmap_atomic(bvec->bv_page);
463 offset < ordered->file_offset) {
464 unsigned long bytes_left;
465 sums->len = this_sum_bytes;
466 this_sum_bytes = 0;
467 btrfs_add_ordered_sum(inode, ordered, sums);
468 btrfs_put_ordered_extent(ordered);
469 485
470 bytes_left = bio->bi_iter.bi_size - total_bytes; 486 nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
487 bvec->bv_len + root->sectorsize
488 - 1);
489
490 for (i = 0; i < nr_sectors; i++) {
491 if (offset >= ordered->file_offset + ordered->len ||
492 offset < ordered->file_offset) {
493 unsigned long bytes_left;
494
495 kunmap_atomic(data);
496 sums->len = this_sum_bytes;
497 this_sum_bytes = 0;
498 btrfs_add_ordered_sum(inode, ordered, sums);
499 btrfs_put_ordered_extent(ordered);
500
501 bytes_left = bio->bi_iter.bi_size - total_bytes;
502
503 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
504 GFP_NOFS);
505 BUG_ON(!sums); /* -ENOMEM */
506 sums->len = bytes_left;
507 ordered = btrfs_lookup_ordered_extent(inode,
508 offset);
509 ASSERT(ordered); /* Logic error */
510 sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9)
511 + total_bytes;
512 index = 0;
513
514 data = kmap_atomic(bvec->bv_page);
515 }
471 516
472 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 517 sums->sums[index] = ~(u32)0;
473 GFP_NOFS); 518 sums->sums[index]
474 BUG_ON(!sums); /* -ENOMEM */ 519 = btrfs_csum_data(data + bvec->bv_offset
475 sums->len = bytes_left; 520 + (i * root->sectorsize),
476 ordered = btrfs_lookup_ordered_extent(inode, offset); 521 sums->sums[index],
477 BUG_ON(!ordered); /* Logic error */ 522 root->sectorsize);
478 sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + 523 btrfs_csum_final(sums->sums[index],
479 total_bytes; 524 (char *)(sums->sums + index));
480 index = 0; 525 index++;
526 offset += root->sectorsize;
527 this_sum_bytes += root->sectorsize;
528 total_bytes += root->sectorsize;
481 } 529 }
482 530
483 data = kmap_atomic(bvec->bv_page);
484 sums->sums[index] = ~(u32)0;
485 sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
486 sums->sums[index],
487 bvec->bv_len);
488 kunmap_atomic(data); 531 kunmap_atomic(data);
489 btrfs_csum_final(sums->sums[index],
490 (char *)(sums->sums + index));
491 532
492 bio_index++; 533 bio_index++;
493 index++;
494 total_bytes += bvec->bv_len;
495 this_sum_bytes += bvec->bv_len;
496 offset += bvec->bv_len;
497 bvec++; 534 bvec++;
498 } 535 }
499 this_sum_bytes = 0; 536 this_sum_bytes = 0;
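
Beyond making the checksum loop per-sector, the restructure preserves a kmap_atomic ordering constraint: an atomic mapping disables preemption, so the sleeping kzalloc(..., GFP_NOFS) may only be reached after kunmap_atomic(), and the page is remapped before checksumming resumes. The shape, sketched as comments (illustrative, not compilable):

    /*
     *  data = kmap_atomic(bvec->bv_page);
     *  for (i = 0; i < nr_sectors; i++) {
     *          if (sector falls outside the current ordered extent) {
     *                  kunmap_atomic(data);            <- before sleeping
     *                  sums = kzalloc(..., GFP_NOFS);  <- may sleep
     *                  (look up the next ordered extent)
     *                  data = kmap_atomic(bvec->bv_page);
     *          }
     *          csum one sectorsize chunk at data + bv_offset + i * sectorsize
     *  }
     *  kunmap_atomic(data);
     */
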
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 098bb8f690c9..15a09cb156ce 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
41#include "locking.h" 41#include "locking.h"
42#include "volumes.h" 42#include "volumes.h"
43#include "qgroup.h" 43#include "qgroup.h"
44#include "compression.h"
44 45
45static struct kmem_cache *btrfs_inode_defrag_cachep; 46static struct kmem_cache *btrfs_inode_defrag_cachep;
46/* 47/*
@@ -498,7 +499,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
498 loff_t isize = i_size_read(inode); 499 loff_t isize = i_size_read(inode);
499 500
500 start_pos = pos & ~((u64)root->sectorsize - 1); 501 start_pos = pos & ~((u64)root->sectorsize - 1);
501 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); 502 num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize);
502 503
503 end_of_last_block = start_pos + num_bytes - 1; 504 end_of_last_block = start_pos + num_bytes - 1;
504 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 505 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -1379,16 +1380,19 @@ fail:
1379static noinline int 1380static noinline int
1380lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, 1381lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1381 size_t num_pages, loff_t pos, 1382 size_t num_pages, loff_t pos,
1383 size_t write_bytes,
1382 u64 *lockstart, u64 *lockend, 1384 u64 *lockstart, u64 *lockend,
1383 struct extent_state **cached_state) 1385 struct extent_state **cached_state)
1384{ 1386{
1387 struct btrfs_root *root = BTRFS_I(inode)->root;
1385 u64 start_pos; 1388 u64 start_pos;
1386 u64 last_pos; 1389 u64 last_pos;
1387 int i; 1390 int i;
1388 int ret = 0; 1391 int ret = 0;
1389 1392
1390 start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); 1393 start_pos = round_down(pos, root->sectorsize);
1391 last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; 1394 last_pos = start_pos
1395 + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1;
1392 1396
1393 if (start_pos < inode->i_size) { 1397 if (start_pos < inode->i_size) {
1394 struct btrfs_ordered_extent *ordered; 1398 struct btrfs_ordered_extent *ordered;
@@ -1503,6 +1507,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1503 1507
1504 while (iov_iter_count(i) > 0) { 1508 while (iov_iter_count(i) > 0) {
1505 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1509 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1510 size_t sector_offset;
1506 size_t write_bytes = min(iov_iter_count(i), 1511 size_t write_bytes = min(iov_iter_count(i),
1507 nrptrs * (size_t)PAGE_CACHE_SIZE - 1512 nrptrs * (size_t)PAGE_CACHE_SIZE -
1508 offset); 1513 offset);
@@ -1511,6 +1516,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1511 size_t reserve_bytes; 1516 size_t reserve_bytes;
1512 size_t dirty_pages; 1517 size_t dirty_pages;
1513 size_t copied; 1518 size_t copied;
1519 size_t dirty_sectors;
1520 size_t num_sectors;
1514 1521
1515 WARN_ON(num_pages > nrptrs); 1522 WARN_ON(num_pages > nrptrs);
1516 1523
@@ -1523,29 +1530,29 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1523 break; 1530 break;
1524 } 1531 }
1525 1532
1526 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1533 sector_offset = pos & (root->sectorsize - 1);
1534 reserve_bytes = round_up(write_bytes + sector_offset,
1535 root->sectorsize);
1527 1536
1528 if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | 1537 if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1529 BTRFS_INODE_PREALLOC)) { 1538 BTRFS_INODE_PREALLOC)) &&
1530 ret = check_can_nocow(inode, pos, &write_bytes); 1539 check_can_nocow(inode, pos, &write_bytes) > 0) {
1531 if (ret < 0) 1540 /*
1532 break; 1541 * For nodata cow case, no need to reserve
1533 if (ret > 0) { 1542 * data space.
1534 /* 1543 */
1535 * For nodata cow case, no need to reserve 1544 only_release_metadata = true;
1536 * data space. 1545 /*
1537 */ 1546 * our prealloc extent may be smaller than
1538 only_release_metadata = true; 1547 * write_bytes, so scale down.
1539 /* 1548 */
1540 * our prealloc extent may be smaller than 1549 num_pages = DIV_ROUND_UP(write_bytes + offset,
1541 * write_bytes, so scale down. 1550 PAGE_CACHE_SIZE);
1542 */ 1551 reserve_bytes = round_up(write_bytes + sector_offset,
1543 num_pages = DIV_ROUND_UP(write_bytes + offset, 1552 root->sectorsize);
1544 PAGE_CACHE_SIZE); 1553 goto reserve_metadata;
1545 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1546 goto reserve_metadata;
1547 }
1548 } 1554 }
1555
1549 ret = btrfs_check_data_free_space(inode, pos, write_bytes); 1556 ret = btrfs_check_data_free_space(inode, pos, write_bytes);
1550 if (ret < 0) 1557 if (ret < 0)
1551 break; 1558 break;
@@ -1576,8 +1583,8 @@ again:
1576 break; 1583 break;
1577 1584
1578 ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, 1585 ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
1579 pos, &lockstart, &lockend, 1586 pos, write_bytes, &lockstart,
1580 &cached_state); 1587 &lockend, &cached_state);
1581 if (ret < 0) { 1588 if (ret < 0) {
1582 if (ret == -EAGAIN) 1589 if (ret == -EAGAIN)
1583 goto again; 1590 goto again;
@@ -1612,9 +1619,16 @@ again:
1612 * we still have an outstanding extent for the chunk we actually 1619 * we still have an outstanding extent for the chunk we actually
1613 * managed to copy. 1620 * managed to copy.
1614 */ 1621 */
1615 if (num_pages > dirty_pages) { 1622 num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
1616 release_bytes = (num_pages - dirty_pages) << 1623 reserve_bytes);
1617 PAGE_CACHE_SHIFT; 1624 dirty_sectors = round_up(copied + sector_offset,
1625 root->sectorsize);
1626 dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
1627 dirty_sectors);
1628
1629 if (num_sectors > dirty_sectors) {
1630 release_bytes = (write_bytes - copied)
1631 & ~((u64)root->sectorsize - 1);
1618 if (copied > 0) { 1632 if (copied > 0) {
1619 spin_lock(&BTRFS_I(inode)->lock); 1633 spin_lock(&BTRFS_I(inode)->lock);
1620 BTRFS_I(inode)->outstanding_extents++; 1634 BTRFS_I(inode)->outstanding_extents++;
@@ -1633,7 +1647,8 @@ again:
1633 } 1647 }
1634 } 1648 }
1635 1649
1636 release_bytes = dirty_pages << PAGE_CACHE_SHIFT; 1650 release_bytes = round_up(copied + sector_offset,
1651 root->sectorsize);
1637 1652
1638 if (copied > 0) 1653 if (copied > 0)
1639 ret = btrfs_dirty_pages(root, inode, pages, 1654 ret = btrfs_dirty_pages(root, inode, pages,
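
With reservations now made in sectorsize units, the short-copy cleanup compares sectors rather than pages: num_sectors covers what was reserved, dirty_sectors covers what the copy actually touched, and the difference is released rounded down to a sector boundary, matching the new release_bytes expression. The arithmetic on illustrative numbers:

    #include <stdio.h>

    typedef unsigned long long u64;

    int main(void)
    {
            u64 sectorsize = 4096;
            u64 write_bytes = 20000, sector_offset = 904, copied = 5000;

            u64 reserve = (write_bytes + sector_offset + sectorsize - 1)
                            & ~(sectorsize - 1);        /* round_up */
            u64 num_sectors = reserve / sectorsize;     /* 6 sectors */
            u64 dirty = (copied + sector_offset + sectorsize - 1)
                            & ~(sectorsize - 1);
            u64 dirty_sectors = dirty / sectorsize;     /* 2 sectors */

            if (num_sectors > dirty_sectors) {
                    /* round *down*: partial sectors stay reserved */
                    u64 release = (write_bytes - copied) & ~(sectorsize - 1);
                    printf("release %llu of %llu reserved bytes\n",
                           release, reserve);
            }
            return 0;
    }
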
@@ -1654,8 +1669,7 @@ again:
1654 1669
1655 if (only_release_metadata && copied > 0) { 1670 if (only_release_metadata && copied > 0) {
1656 lockstart = round_down(pos, root->sectorsize); 1671 lockstart = round_down(pos, root->sectorsize);
1657 lockend = lockstart + 1672 lockend = round_up(pos + copied, root->sectorsize) - 1;
1658 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1659 1673
1660 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 1674 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1661 lockend, EXTENT_NORESERVE, NULL, 1675 lockend, EXTENT_NORESERVE, NULL,
@@ -1761,6 +1775,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1761 ssize_t err; 1775 ssize_t err;
1762 loff_t pos; 1776 loff_t pos;
1763 size_t count; 1777 size_t count;
1778 loff_t oldsize;
1779 int clean_page = 0;
1764 1780
1765 inode_lock(inode); 1781 inode_lock(inode);
1766 err = generic_write_checks(iocb, from); 1782 err = generic_write_checks(iocb, from);
@@ -1799,14 +1815,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1799 pos = iocb->ki_pos; 1815 pos = iocb->ki_pos;
1800 count = iov_iter_count(from); 1816 count = iov_iter_count(from);
1801 start_pos = round_down(pos, root->sectorsize); 1817 start_pos = round_down(pos, root->sectorsize);
1802 if (start_pos > i_size_read(inode)) { 1818 oldsize = i_size_read(inode);
1819 if (start_pos > oldsize) {
1803 /* Expand hole size to cover write data, preventing empty gap */ 1820 /* Expand hole size to cover write data, preventing empty gap */
1804 end_pos = round_up(pos + count, root->sectorsize); 1821 end_pos = round_up(pos + count, root->sectorsize);
1805 err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); 1822 err = btrfs_cont_expand(inode, oldsize, end_pos);
1806 if (err) { 1823 if (err) {
1807 inode_unlock(inode); 1824 inode_unlock(inode);
1808 goto out; 1825 goto out;
1809 } 1826 }
1827 if (start_pos > round_up(oldsize, root->sectorsize))
1828 clean_page = 1;
1810 } 1829 }
1811 1830
1812 if (sync) 1831 if (sync)
@@ -1818,6 +1837,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1818 num_written = __btrfs_buffered_write(file, from, pos); 1837 num_written = __btrfs_buffered_write(file, from, pos);
1819 if (num_written > 0) 1838 if (num_written > 0)
1820 iocb->ki_pos = pos + num_written; 1839 iocb->ki_pos = pos + num_written;
1840 if (clean_page)
1841 pagecache_isize_extended(inode, oldsize,
1842 i_size_read(inode));
1821 } 1843 }
1822 1844
1823 inode_unlock(inode); 1845 inode_unlock(inode);
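The new clean_page flag arms a pagecache_isize_extended() call for writes that start beyond the block-rounded old EOF, where a page straddling the old size could otherwise keep stale state. A sketch of just that predicate (the sectorsize constant is an assumption; the kernel reads root->sectorsize):

    #include <stdbool.h>
    #include <stdint.h>

    #define SECTORSIZE 4096ULL   /* assumption; kernel uses root->sectorsize */

    static uint64_t round_down_u64(uint64_t x, uint64_t a) { return x & ~(a - 1); }
    static uint64_t round_up_u64(uint64_t x, uint64_t a)
    {
            return (x + a - 1) & ~(a - 1);
    }

    /*
     * True when the write begins past the block-rounded old EOF: the
     * expanding write leaves a gap, so the page straddling the old size
     * must be revalidated once i_size has grown.
     */
    static bool needs_isize_extend(uint64_t pos, uint64_t oldsize)
    {
            return round_down_u64(pos, SECTORSIZE) >
                   round_up_u64(oldsize, SECTORSIZE);
    }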
@@ -1825,7 +1847,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1825 /* 1847 /*
1826 * We also have to set last_sub_trans to the current log transid, 1848 * We also have to set last_sub_trans to the current log transid,
1827 * otherwise subsequent syncs to a file that's been synced in this 1849 * otherwise subsequent syncs to a file that's been synced in this
1828 * transaction will appear to have already occured. 1850 * transaction will appear to have already occurred.
1829 */ 1851 */
1830 spin_lock(&BTRFS_I(inode)->lock); 1852 spin_lock(&BTRFS_I(inode)->lock);
1831 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1853 BTRFS_I(inode)->last_sub_trans = root->log_transid;
@@ -1996,10 +2018,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1996 */ 2018 */
1997 smp_mb(); 2019 smp_mb();
1998 if (btrfs_inode_in_log(inode, root->fs_info->generation) || 2020 if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1999 (BTRFS_I(inode)->last_trans <= 2021 (full_sync && BTRFS_I(inode)->last_trans <=
2000 root->fs_info->last_trans_committed && 2022 root->fs_info->last_trans_committed) ||
2001 (full_sync || 2023 (!btrfs_have_ordered_extents_in_range(inode, start, len) &&
2002 !btrfs_have_ordered_extents_in_range(inode, start, len)))) { 2024 BTRFS_I(inode)->last_trans
2025 <= root->fs_info->last_trans_committed)) {
2003 /* 2026 /*
2004 * We've had everything committed since the last time we were 2027 * We've had everything committed since the last time we were
2005 * modified so clear this flag in case it was set for whatever 2028 * modified so clear this flag in case it was set for whatever
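The restructured condition above flattens the old nested AND/OR into three independent ways to skip the log. A boolean sketch with illustrative names, not kernel symbols:

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Sketch of the reworked fast path: skip logging when the inode is
     * already in the current log, or when nothing changed since the last
     * committed transaction - for a full sync unconditionally, for a
     * ranged sync only if no ordered extents overlap the range.
     */
    static bool can_skip_fsync(bool inode_in_log, bool full_sync,
                               bool ordered_in_range,
                               uint64_t last_trans, uint64_t last_committed)
    {
            if (inode_in_log)
                    return true;
            if (full_sync && last_trans <= last_committed)
                    return true;
            return !ordered_in_range && last_trans <= last_committed;
    }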
@@ -2293,10 +2316,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2293 int ret = 0; 2316 int ret = 0;
2294 int err = 0; 2317 int err = 0;
2295 unsigned int rsv_count; 2318 unsigned int rsv_count;
2296 bool same_page; 2319 bool same_block;
2297 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2320 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2298 u64 ino_size; 2321 u64 ino_size;
2299 bool truncated_page = false; 2322 bool truncated_block = false;
2300 bool updated_inode = false; 2323 bool updated_inode = false;
2301 2324
2302 ret = btrfs_wait_ordered_range(inode, offset, len); 2325 ret = btrfs_wait_ordered_range(inode, offset, len);
@@ -2304,7 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2304 return ret; 2327 return ret;
2305 2328
2306 inode_lock(inode); 2329 inode_lock(inode);
2307 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); 2330 ino_size = round_up(inode->i_size, root->sectorsize);
2308 ret = find_first_non_hole(inode, &offset, &len); 2331 ret = find_first_non_hole(inode, &offset, &len);
2309 if (ret < 0) 2332 if (ret < 0)
2310 goto out_only_mutex; 2333 goto out_only_mutex;
@@ -2317,31 +2340,30 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2317 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); 2340 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2318 lockend = round_down(offset + len, 2341 lockend = round_down(offset + len,
2319 BTRFS_I(inode)->root->sectorsize) - 1; 2342 BTRFS_I(inode)->root->sectorsize) - 1;
2320 same_page = ((offset >> PAGE_CACHE_SHIFT) == 2343 same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset))
2321 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2344 == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1));
2322
2323 /* 2345 /*
2324 * We needn't truncate any page which is beyond the end of the file 2346 * We needn't truncate any block which is beyond the end of the file
2325 * because we are sure there is no data there. 2347 * because we are sure there is no data there.
2326 */ 2348 */
2327 /* 2349 /*
2328 * Only do this if we are in the same page and we aren't doing the 2350 * Only do this if we are in the same block and we aren't doing the
2329 * entire page. 2351 * entire block.
2330 */ 2352 */
2331 if (same_page && len < PAGE_CACHE_SIZE) { 2353 if (same_block && len < root->sectorsize) {
2332 if (offset < ino_size) { 2354 if (offset < ino_size) {
2333 truncated_page = true; 2355 truncated_block = true;
2334 ret = btrfs_truncate_page(inode, offset, len, 0); 2356 ret = btrfs_truncate_block(inode, offset, len, 0);
2335 } else { 2357 } else {
2336 ret = 0; 2358 ret = 0;
2337 } 2359 }
2338 goto out_only_mutex; 2360 goto out_only_mutex;
2339 } 2361 }
2340 2362
2341 /* zero back part of the first page */ 2363 /* zero back part of the first block */
2342 if (offset < ino_size) { 2364 if (offset < ino_size) {
2343 truncated_page = true; 2365 truncated_block = true;
2344 ret = btrfs_truncate_page(inode, offset, 0, 0); 2366 ret = btrfs_truncate_block(inode, offset, 0, 0);
2345 if (ret) { 2367 if (ret) {
2346 inode_unlock(inode); 2368 inode_unlock(inode);
2347 return ret; 2369 return ret;
@@ -2376,9 +2398,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2376 if (!ret) { 2398 if (!ret) {
2377 /* zero the front end of the last page */ 2399 /* zero the front end of the last page */
2378 if (tail_start + tail_len < ino_size) { 2400 if (tail_start + tail_len < ino_size) {
2379 truncated_page = true; 2401 truncated_block = true;
2380 ret = btrfs_truncate_page(inode, 2402 ret = btrfs_truncate_block(inode,
2381 tail_start + tail_len, 0, 1); 2403 tail_start + tail_len,
2404 0, 1);
2382 if (ret) 2405 if (ret)
2383 goto out_only_mutex; 2406 goto out_only_mutex;
2384 } 2407 }
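same_block above asks whether the punched range starts and ends inside one filesystem block; only then does the whole punch reduce to partial-block zeroing via btrfs_truncate_block(). A sketch of the predicate with an assumed 4K block:

    #include <stdbool.h>
    #include <stdint.h>

    #define SECTORSIZE 4096ULL   /* assumption; kernel uses root->sectorsize */

    /* Do the first and last byte of the hole land in the same block?
     * Example: offset 5000, len 100 stays inside block 1, so the punch
     * degenerates to zeroing part of that block in place. */
    static bool same_block(uint64_t offset, uint64_t len)
    {
            return (offset / SECTORSIZE) == ((offset + len - 1) / SECTORSIZE);
    }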
@@ -2544,7 +2567,7 @@ out_trans:
2544 goto out_free; 2567 goto out_free;
2545 2568
2546 inode_inc_iversion(inode); 2569 inode_inc_iversion(inode);
2547 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2570 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
2548 2571
2549 trans->block_rsv = &root->fs_info->trans_block_rsv; 2572 trans->block_rsv = &root->fs_info->trans_block_rsv;
2550 ret = btrfs_update_inode(trans, root, inode); 2573 ret = btrfs_update_inode(trans, root, inode);
@@ -2558,7 +2581,7 @@ out:
2558 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2581 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2559 &cached_state, GFP_NOFS); 2582 &cached_state, GFP_NOFS);
2560out_only_mutex: 2583out_only_mutex:
2561 if (!updated_inode && truncated_page && !ret && !err) { 2584 if (!updated_inode && truncated_block && !ret && !err) {
2562 /* 2585 /*
2563 * If we only end up zeroing part of a page, we still need to 2586 * If we only end up zeroing part of a page, we still need to
2564 * update the inode item, so that all the time fields are 2587 * update the inode item, so that all the time fields are
@@ -2611,7 +2634,7 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2611 return 0; 2634 return 0;
2612 } 2635 }
2613insert: 2636insert:
2614 range = kmalloc(sizeof(*range), GFP_NOFS); 2637 range = kmalloc(sizeof(*range), GFP_KERNEL);
2615 if (!range) 2638 if (!range)
2616 return -ENOMEM; 2639 return -ENOMEM;
2617 range->start = start; 2640 range->start = start;
@@ -2678,10 +2701,10 @@ static long btrfs_fallocate(struct file *file, int mode,
2678 } else if (offset + len > inode->i_size) { 2701 } else if (offset + len > inode->i_size) {
2679 /* 2702 /*
2680 * If we are fallocating from the end of the file onward we 2703 * If we are fallocating from the end of the file onward we
2681 * need to zero out the end of the page if i_size lands in the 2704 * need to zero out the end of the block if i_size lands in the
2682 * middle of a page. 2705 * middle of a block.
2683 */ 2706 */
2684 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); 2707 ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
2685 if (ret) 2708 if (ret)
2686 goto out; 2709 goto out;
2687 } 2710 }
@@ -2712,7 +2735,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2712 btrfs_put_ordered_extent(ordered); 2735 btrfs_put_ordered_extent(ordered);
2713 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 2736 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2714 alloc_start, locked_end, 2737 alloc_start, locked_end,
2715 &cached_state, GFP_NOFS); 2738 &cached_state, GFP_KERNEL);
2716 /* 2739 /*
2717 * we can't wait on the range with the transaction 2740 * we can't wait on the range with the transaction
2718 * running or with the extent lock held 2741 * running or with the extent lock held
@@ -2794,7 +2817,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2794 if (IS_ERR(trans)) { 2817 if (IS_ERR(trans)) {
2795 ret = PTR_ERR(trans); 2818 ret = PTR_ERR(trans);
2796 } else { 2819 } else {
2797 inode->i_ctime = CURRENT_TIME; 2820 inode->i_ctime = current_fs_time(inode->i_sb);
2798 i_size_write(inode, actual_end); 2821 i_size_write(inode, actual_end);
2799 btrfs_ordered_update_i_size(inode, actual_end, NULL); 2822 btrfs_ordered_update_i_size(inode, actual_end, NULL);
2800 ret = btrfs_update_inode(trans, root, inode); 2823 ret = btrfs_update_inode(trans, root, inode);
@@ -2806,7 +2829,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2806 } 2829 }
2807out_unlock: 2830out_unlock:
2808 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 2831 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2809 &cached_state, GFP_NOFS); 2832 &cached_state, GFP_KERNEL);
2810out: 2833out:
2811 /* 2834 /*
2812 * As we waited the extent range, the data_rsv_map must be empty 2835 * As we waited the extent range, the data_rsv_map must be empty
@@ -2939,8 +2962,7 @@ const struct file_operations btrfs_file_operations = {
2939 2962
2940void btrfs_auto_defrag_exit(void) 2963void btrfs_auto_defrag_exit(void)
2941{ 2964{
2942 if (btrfs_inode_defrag_cachep) 2965 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2943 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2944} 2966}
2945 2967
2946int btrfs_auto_defrag_init(void) 2968int btrfs_auto_defrag_init(void)
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index e50316c4af15..1f0ec19b23f6 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -556,6 +556,9 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
556 mutex_lock(&root->objectid_mutex); 556 mutex_lock(&root->objectid_mutex);
557 557
558 if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { 558 if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
559 btrfs_warn(root->fs_info,
560 "the objectid of root %llu reaches its highest value",
561 root->root_key.objectid);
559 ret = -ENOSPC; 562 ret = -ENOSPC;
560 goto out; 563 goto out;
561 } 564 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d96f5cf38a2d..41a5688ffdfe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -263,7 +263,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
263 data_len = compressed_size; 263 data_len = compressed_size;
264 264
265 if (start > 0 || 265 if (start > 0 ||
266 actual_end > PAGE_CACHE_SIZE || 266 actual_end > root->sectorsize ||
267 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || 267 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
268 (!compressed_size && 268 (!compressed_size &&
269 (actual_end & (root->sectorsize - 1)) == 0) || 269 (actual_end & (root->sectorsize - 1)) == 0) ||
@@ -2002,7 +2002,8 @@ again:
2002 if (PagePrivate2(page)) 2002 if (PagePrivate2(page))
2003 goto out; 2003 goto out;
2004 2004
2005 ordered = btrfs_lookup_ordered_extent(inode, page_start); 2005 ordered = btrfs_lookup_ordered_range(inode, page_start,
2006 PAGE_CACHE_SIZE);
2006 if (ordered) { 2007 if (ordered) {
2007 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 2008 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2008 page_end, &cached_state, GFP_NOFS); 2009 page_end, &cached_state, GFP_NOFS);
@@ -4013,7 +4014,8 @@ err:
4013 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4014 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4014 inode_inc_iversion(inode); 4015 inode_inc_iversion(inode);
4015 inode_inc_iversion(dir); 4016 inode_inc_iversion(dir);
4016 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 4017 inode->i_ctime = dir->i_mtime =
4018 dir->i_ctime = current_fs_time(inode->i_sb);
4017 ret = btrfs_update_inode(trans, root, dir); 4019 ret = btrfs_update_inode(trans, root, dir);
4018out: 4020out:
4019 return ret; 4021 return ret;
@@ -4156,7 +4158,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4156 4158
4157 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4159 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4158 inode_inc_iversion(dir); 4160 inode_inc_iversion(dir);
4159 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 4161 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
4160 ret = btrfs_update_inode_fallback(trans, root, dir); 4162 ret = btrfs_update_inode_fallback(trans, root, dir);
4161 if (ret) 4163 if (ret)
4162 btrfs_abort_transaction(trans, root, ret); 4164 btrfs_abort_transaction(trans, root, ret);
@@ -4211,11 +4213,20 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
4211{ 4213{
4212 int ret; 4214 int ret;
4213 4215
4216 /*
4217 * This is only used to apply pressure to the enospc system, we don't
4218 * intend to use this reservation at all.
4219 */
4214 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); 4220 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
4221 bytes_deleted *= root->nodesize;
4215 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, 4222 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
4216 bytes_deleted, BTRFS_RESERVE_NO_FLUSH); 4223 bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4217 if (!ret) 4224 if (!ret) {
4225 trace_btrfs_space_reservation(root->fs_info, "transaction",
4226 trans->transid,
4227 bytes_deleted, 1);
4218 trans->bytes_reserved += bytes_deleted; 4228 trans->bytes_reserved += bytes_deleted;
4229 }
4219 return ret; 4230 return ret;
4220 4231
4221} 4232}
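The added multiply converts a leaf estimate into bytes: btrfs_csum_bytes_to_leaves() guesses how many leaves the checksum deletions may touch, and each leaf costs one node-sized block of metadata reservation. A rough model of that arithmetic (both constants are illustrative, not btrfs's actual item geometry):

    #include <stdint.h>

    #define NODESIZE       16384ULL   /* a common btrfs default */
    #define CSUMS_PER_LEAF 2048ULL    /* illustrative capacity, not exact */

    /* Bytes to reserve so the csum deletions for bytes_deleted of data
     * cannot abort the transaction; one node-sized block per leaf. */
    static uint64_t truncate_reserve(uint64_t bytes_deleted)
    {
            uint64_t csums  = bytes_deleted / 4096;   /* one csum per sector */
            uint64_t leaves = (csums + CSUMS_PER_LEAF - 1) / CSUMS_PER_LEAF;

            return leaves * NODESIZE;   /* the hunk's "*= root->nodesize" */
    }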
@@ -4248,7 +4259,8 @@ static int truncate_inline_extent(struct inode *inode,
4248 * read the extent item from disk (data not in the page cache). 4259 * read the extent item from disk (data not in the page cache).
4249 */ 4260 */
4250 btrfs_release_path(path); 4261 btrfs_release_path(path);
4251 return btrfs_truncate_page(inode, offset, page_end - offset, 0); 4262 return btrfs_truncate_block(inode, offset, page_end - offset,
4263 0);
4252 } 4264 }
4253 4265
4254 btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4266 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
@@ -4601,17 +4613,17 @@ error:
4601} 4613}
4602 4614
4603/* 4615/*
4604 * btrfs_truncate_page - read, zero a chunk and write a page 4616 * btrfs_truncate_block - read, zero a chunk and write a block
4605 * @inode - inode that we're zeroing 4617 * @inode - inode that we're zeroing
4606 * @from - the offset to start zeroing 4618 * @from - the offset to start zeroing
4607 * @len - the length to zero, 0 to zero the entire range respective to the 4619 * @len - the length to zero, 0 to zero the entire range respective to the
4608 * offset 4620 * offset
4609 * @front - zero up to the offset instead of from the offset on 4621 * @front - zero up to the offset instead of from the offset on
4610 * 4622 *
4611 * This will find the page for the "from" offset and cow the page and zero the 4623 * This will find the block for the "from" offset and cow the block and zero the
4612 * part we want to zero. This is used with truncate and hole punching. 4624 * part we want to zero. This is used with truncate and hole punching.
4613 */ 4625 */
4614int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, 4626int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4615 int front) 4627 int front)
4616{ 4628{
4617 struct address_space *mapping = inode->i_mapping; 4629 struct address_space *mapping = inode->i_mapping;
@@ -4622,18 +4634,19 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4622 char *kaddr; 4634 char *kaddr;
4623 u32 blocksize = root->sectorsize; 4635 u32 blocksize = root->sectorsize;
4624 pgoff_t index = from >> PAGE_CACHE_SHIFT; 4636 pgoff_t index = from >> PAGE_CACHE_SHIFT;
4625 unsigned offset = from & (PAGE_CACHE_SIZE-1); 4637 unsigned offset = from & (blocksize - 1);
4626 struct page *page; 4638 struct page *page;
4627 gfp_t mask = btrfs_alloc_write_mask(mapping); 4639 gfp_t mask = btrfs_alloc_write_mask(mapping);
4628 int ret = 0; 4640 int ret = 0;
4629 u64 page_start; 4641 u64 block_start;
4630 u64 page_end; 4642 u64 block_end;
4631 4643
4632 if ((offset & (blocksize - 1)) == 0 && 4644 if ((offset & (blocksize - 1)) == 0 &&
4633 (!len || ((len & (blocksize - 1)) == 0))) 4645 (!len || ((len & (blocksize - 1)) == 0)))
4634 goto out; 4646 goto out;
4647
4635 ret = btrfs_delalloc_reserve_space(inode, 4648 ret = btrfs_delalloc_reserve_space(inode,
4636 round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); 4649 round_down(from, blocksize), blocksize);
4637 if (ret) 4650 if (ret)
4638 goto out; 4651 goto out;
4639 4652
@@ -4641,14 +4654,14 @@ again:
4641 page = find_or_create_page(mapping, index, mask); 4654 page = find_or_create_page(mapping, index, mask);
4642 if (!page) { 4655 if (!page) {
4643 btrfs_delalloc_release_space(inode, 4656 btrfs_delalloc_release_space(inode,
4644 round_down(from, PAGE_CACHE_SIZE), 4657 round_down(from, blocksize),
4645 PAGE_CACHE_SIZE); 4658 blocksize);
4646 ret = -ENOMEM; 4659 ret = -ENOMEM;
4647 goto out; 4660 goto out;
4648 } 4661 }
4649 4662
4650 page_start = page_offset(page); 4663 block_start = round_down(from, blocksize);
4651 page_end = page_start + PAGE_CACHE_SIZE - 1; 4664 block_end = block_start + blocksize - 1;
4652 4665
4653 if (!PageUptodate(page)) { 4666 if (!PageUptodate(page)) {
4654 ret = btrfs_readpage(NULL, page); 4667 ret = btrfs_readpage(NULL, page);
@@ -4665,12 +4678,12 @@ again:
4665 } 4678 }
4666 wait_on_page_writeback(page); 4679 wait_on_page_writeback(page);
4667 4680
4668 lock_extent_bits(io_tree, page_start, page_end, &cached_state); 4681 lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4669 set_page_extent_mapped(page); 4682 set_page_extent_mapped(page);
4670 4683
4671 ordered = btrfs_lookup_ordered_extent(inode, page_start); 4684 ordered = btrfs_lookup_ordered_extent(inode, block_start);
4672 if (ordered) { 4685 if (ordered) {
4673 unlock_extent_cached(io_tree, page_start, page_end, 4686 unlock_extent_cached(io_tree, block_start, block_end,
4674 &cached_state, GFP_NOFS); 4687 &cached_state, GFP_NOFS);
4675 unlock_page(page); 4688 unlock_page(page);
4676 page_cache_release(page); 4689 page_cache_release(page);
@@ -4679,39 +4692,41 @@ again:
4679 goto again; 4692 goto again;
4680 } 4693 }
4681 4694
4682 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 4695 clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
4683 EXTENT_DIRTY | EXTENT_DELALLOC | 4696 EXTENT_DIRTY | EXTENT_DELALLOC |
4684 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4697 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4685 0, 0, &cached_state, GFP_NOFS); 4698 0, 0, &cached_state, GFP_NOFS);
4686 4699
4687 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 4700 ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
4688 &cached_state); 4701 &cached_state);
4689 if (ret) { 4702 if (ret) {
4690 unlock_extent_cached(io_tree, page_start, page_end, 4703 unlock_extent_cached(io_tree, block_start, block_end,
4691 &cached_state, GFP_NOFS); 4704 &cached_state, GFP_NOFS);
4692 goto out_unlock; 4705 goto out_unlock;
4693 } 4706 }
4694 4707
4695 if (offset != PAGE_CACHE_SIZE) { 4708 if (offset != blocksize) {
4696 if (!len) 4709 if (!len)
4697 len = PAGE_CACHE_SIZE - offset; 4710 len = blocksize - offset;
4698 kaddr = kmap(page); 4711 kaddr = kmap(page);
4699 if (front) 4712 if (front)
4700 memset(kaddr, 0, offset); 4713 memset(kaddr + (block_start - page_offset(page)),
4714 0, offset);
4701 else 4715 else
4702 memset(kaddr + offset, 0, len); 4716 memset(kaddr + (block_start - page_offset(page)) + offset,
4717 0, len);
4703 flush_dcache_page(page); 4718 flush_dcache_page(page);
4704 kunmap(page); 4719 kunmap(page);
4705 } 4720 }
4706 ClearPageChecked(page); 4721 ClearPageChecked(page);
4707 set_page_dirty(page); 4722 set_page_dirty(page);
4708 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 4723 unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
4709 GFP_NOFS); 4724 GFP_NOFS);
4710 4725
4711out_unlock: 4726out_unlock:
4712 if (ret) 4727 if (ret)
4713 btrfs_delalloc_release_space(inode, page_start, 4728 btrfs_delalloc_release_space(inode, block_start,
4714 PAGE_CACHE_SIZE); 4729 blocksize);
4715 unlock_page(page); 4730 unlock_page(page);
4716 page_cache_release(page); 4731 page_cache_release(page);
4717out: 4732out:
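With sub-page blocks the memset above can no longer assume the block begins at the page base; it must land on the block's offset within the mapped page. A sketch of the offset math, assuming a 64K page over 4K blocks, both powers of two:

    #include <stdint.h>
    #include <string.h>

    #define PAGE_SIZE_ 65536ULL   /* assumed 64K page (e.g. ppc64) */
    #define BLOCKSIZE  4096ULL

    static void zero_partial_block(uint8_t *kaddr, uint64_t from, int front,
                                   uint64_t len)
    {
            uint64_t page_off    = from & ~(PAGE_SIZE_ - 1);  /* page_offset() */
            uint64_t block_start = from & ~(BLOCKSIZE - 1);
            uint64_t offset      = from & (BLOCKSIZE - 1);
            uint8_t *blk         = kaddr + (block_start - page_off);

            if (!len)
                    len = BLOCKSIZE - offset;
            if (front)
                    memset(blk, 0, offset);        /* zero up to 'from'   */
            else
                    memset(blk + offset, 0, len);  /* zero from 'from' on */
    }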
@@ -4782,11 +4797,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4782 int err = 0; 4797 int err = 0;
4783 4798
4784 /* 4799 /*
4785 * If our size started in the middle of a page we need to zero out the 4800 * If our size started in the middle of a block we need to zero out the
4786 * rest of the page before we expand the i_size, otherwise we could 4801 * rest of the block before we expand the i_size, otherwise we could
4787 * expose stale data. 4802 * expose stale data.
4788 */ 4803 */
4789 err = btrfs_truncate_page(inode, oldsize, 0, 0); 4804 err = btrfs_truncate_block(inode, oldsize, 0, 0);
4790 if (err) 4805 if (err)
4791 return err; 4806 return err;
4792 4807
@@ -4895,7 +4910,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4895 } 4910 }
4896 4911
4897 if (newsize > oldsize) { 4912 if (newsize > oldsize) {
4898 truncate_pagecache(inode, newsize);
4899 /* 4913 /*
4900 * Don't do an expanding truncate while snapshoting is ongoing. 4914 * Don't do an expanding truncate while snapshoting is ongoing.
4901 * This is to ensure the snapshot captures a fully consistent 4915 * This is to ensure the snapshot captures a fully consistent
@@ -4918,6 +4932,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4918 4932
4919 i_size_write(inode, newsize); 4933 i_size_write(inode, newsize);
4920 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 4934 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4935 pagecache_isize_extended(inode, oldsize, newsize);
4921 ret = btrfs_update_inode(trans, root, inode); 4936 ret = btrfs_update_inode(trans, root, inode);
4922 btrfs_end_write_no_snapshoting(root); 4937 btrfs_end_write_no_snapshoting(root);
4923 btrfs_end_transaction(trans, root); 4938 btrfs_end_transaction(trans, root);
@@ -5588,7 +5603,7 @@ static struct inode *new_simple_dir(struct super_block *s,
5588 inode->i_op = &btrfs_dir_ro_inode_operations; 5603 inode->i_op = &btrfs_dir_ro_inode_operations;
5589 inode->i_fop = &simple_dir_operations; 5604 inode->i_fop = &simple_dir_operations;
5590 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5605 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5591 inode->i_mtime = CURRENT_TIME; 5606 inode->i_mtime = current_fs_time(inode->i_sb);
5592 inode->i_atime = inode->i_mtime; 5607 inode->i_atime = inode->i_mtime;
5593 inode->i_ctime = inode->i_mtime; 5608 inode->i_ctime = inode->i_mtime;
5594 BTRFS_I(inode)->i_otime = inode->i_mtime; 5609 BTRFS_I(inode)->i_otime = inode->i_mtime;
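The CURRENT_TIME conversions in this commit matter because current_fs_time(sb) truncates the wall clock to the superblock's declared timestamp granularity, keeping in-core and on-disk times identical. A sketch modelled on the kernel's timespec_trunc(), which that helper applies:

    #include <time.h>

    /* Truncate a wall-clock timestamp to a filesystem's granularity in
     * nanoseconds: 1 keeps nanoseconds, 1000000000 keeps whole seconds. */
    static struct timespec fs_time_trunc(struct timespec ts, long gran)
    {
            if (gran == 1000000000L)
                    ts.tv_nsec = 0;
            else if (gran > 1)
                    ts.tv_nsec -= ts.tv_nsec % gran;
            return ts;
    }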
@@ -5790,7 +5805,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5790 if (name_len <= sizeof(tmp_name)) { 5805 if (name_len <= sizeof(tmp_name)) {
5791 name_ptr = tmp_name; 5806 name_ptr = tmp_name;
5792 } else { 5807 } else {
5793 name_ptr = kmalloc(name_len, GFP_NOFS); 5808 name_ptr = kmalloc(name_len, GFP_KERNEL);
5794 if (!name_ptr) { 5809 if (!name_ptr) {
5795 ret = -ENOMEM; 5810 ret = -ENOMEM;
5796 goto err; 5811 goto err;
@@ -6172,7 +6187,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6172 inode_init_owner(inode, dir, mode); 6187 inode_init_owner(inode, dir, mode);
6173 inode_set_bytes(inode, 0); 6188 inode_set_bytes(inode, 0);
6174 6189
6175 inode->i_mtime = CURRENT_TIME; 6190 inode->i_mtime = current_fs_time(inode->i_sb);
6176 inode->i_atime = inode->i_mtime; 6191 inode->i_atime = inode->i_mtime;
6177 inode->i_ctime = inode->i_mtime; 6192 inode->i_ctime = inode->i_mtime;
6178 BTRFS_I(inode)->i_otime = inode->i_mtime; 6193 BTRFS_I(inode)->i_otime = inode->i_mtime;
@@ -6285,7 +6300,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
6285 btrfs_i_size_write(parent_inode, parent_inode->i_size + 6300 btrfs_i_size_write(parent_inode, parent_inode->i_size +
6286 name_len * 2); 6301 name_len * 2);
6287 inode_inc_iversion(parent_inode); 6302 inode_inc_iversion(parent_inode);
6288 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 6303 parent_inode->i_mtime = parent_inode->i_ctime =
6304 current_fs_time(parent_inode->i_sb);
6289 ret = btrfs_update_inode(trans, root, parent_inode); 6305 ret = btrfs_update_inode(trans, root, parent_inode);
6290 if (ret) 6306 if (ret)
6291 btrfs_abort_transaction(trans, root, ret); 6307 btrfs_abort_transaction(trans, root, ret);
@@ -6503,7 +6519,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6503 BTRFS_I(inode)->dir_index = 0ULL; 6519 BTRFS_I(inode)->dir_index = 0ULL;
6504 inc_nlink(inode); 6520 inc_nlink(inode);
6505 inode_inc_iversion(inode); 6521 inode_inc_iversion(inode);
6506 inode->i_ctime = CURRENT_TIME; 6522 inode->i_ctime = current_fs_time(inode->i_sb);
6507 ihold(inode); 6523 ihold(inode);
6508 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6524 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6509 6525
@@ -7414,7 +7430,26 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7414 cached_state, GFP_NOFS); 7430 cached_state, GFP_NOFS);
7415 7431
7416 if (ordered) { 7432 if (ordered) {
7417 btrfs_start_ordered_extent(inode, ordered, 1); 7433 /*
7434 * If we are doing a DIO read and the ordered extent we
7435 * found is for a buffered write, we can not wait for it
7436 * to complete and retry, because if we do so we can
7437 * deadlock with concurrent buffered writes on page
7438 * locks. This happens only if our DIO read covers more
7439 * than one extent map, if at this point has already
7440 * created an ordered extent for a previous extent map
7441 * and locked its range in the inode's io tree, and a
7442 * concurrent write against that previous extent map's
7443 * range and this range started (we unlock the ranges
7444 * in the io tree only when the bios complete and
7445 * buffered writes always lock pages before attempting
7446 * to lock range in the io tree).
7447 */
7448 if (writing ||
7449 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7450 btrfs_start_ordered_extent(inode, ordered, 1);
7451 else
7452 ret = -ENOTBLK;
7418 btrfs_put_ordered_extent(ordered); 7453 btrfs_put_ordered_extent(ordered);
7419 } else { 7454 } else {
7420 /* 7455 /*
@@ -7431,9 +7466,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7431 * that page. 7466 * that page.
7432 */ 7467 */
7433 ret = -ENOTBLK; 7468 ret = -ENOTBLK;
7434 break;
7435 } 7469 }
7436 7470
7471 if (ret)
7472 break;
7473
7437 cond_resched(); 7474 cond_resched();
7438 } 7475 }
7439 7476
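The long comment above explains why a DIO read must not wait on ordered extents created by buffered writers. The decision itself reduces to a small predicate (names are illustrative):

    #include <stdbool.h>

    /*
     * Wait on the ordered extent only when that cannot deadlock: either
     * we are the writer ourselves, or the extent was created by direct
     * I/O (BTRFS_ORDERED_DIRECT) and so holds no page locks we may need.
     * Otherwise return -ENOTBLK and fall back to buffered reading.
     */
    static bool can_wait_on_ordered(bool writing, bool ordered_is_direct)
    {
            return writing || ordered_is_direct;
    }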
@@ -7764,9 +7801,9 @@ static int btrfs_check_dio_repairable(struct inode *inode,
7764} 7801}
7765 7802
7766static int dio_read_error(struct inode *inode, struct bio *failed_bio, 7803static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7767 struct page *page, u64 start, u64 end, 7804 struct page *page, unsigned int pgoff,
7768 int failed_mirror, bio_end_io_t *repair_endio, 7805 u64 start, u64 end, int failed_mirror,
7769 void *repair_arg) 7806 bio_end_io_t *repair_endio, void *repair_arg)
7770{ 7807{
7771 struct io_failure_record *failrec; 7808 struct io_failure_record *failrec;
7772 struct bio *bio; 7809 struct bio *bio;
@@ -7787,7 +7824,9 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7787 return -EIO; 7824 return -EIO;
7788 } 7825 }
7789 7826
7790 if (failed_bio->bi_vcnt > 1) 7827 if ((failed_bio->bi_vcnt > 1)
7828 || (failed_bio->bi_io_vec->bv_len
7829 > BTRFS_I(inode)->root->sectorsize))
7791 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 7830 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
7792 else 7831 else
7793 read_mode = READ_SYNC; 7832 read_mode = READ_SYNC;
@@ -7795,7 +7834,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7795 isector = start - btrfs_io_bio(failed_bio)->logical; 7834 isector = start - btrfs_io_bio(failed_bio)->logical;
7796 isector >>= inode->i_sb->s_blocksize_bits; 7835 isector >>= inode->i_sb->s_blocksize_bits;
7797 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 7836 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7798 0, isector, repair_endio, repair_arg); 7837 pgoff, isector, repair_endio, repair_arg);
7799 if (!bio) { 7838 if (!bio) {
7800 free_io_failure(inode, failrec); 7839 free_io_failure(inode, failrec);
7801 return -EIO; 7840 return -EIO;
@@ -7825,12 +7864,17 @@ struct btrfs_retry_complete {
7825static void btrfs_retry_endio_nocsum(struct bio *bio) 7864static void btrfs_retry_endio_nocsum(struct bio *bio)
7826{ 7865{
7827 struct btrfs_retry_complete *done = bio->bi_private; 7866 struct btrfs_retry_complete *done = bio->bi_private;
7867 struct inode *inode;
7828 struct bio_vec *bvec; 7868 struct bio_vec *bvec;
7829 int i; 7869 int i;
7830 7870
7831 if (bio->bi_error) 7871 if (bio->bi_error)
7832 goto end; 7872 goto end;
7833 7873
7874 ASSERT(bio->bi_vcnt == 1);
7875 inode = bio->bi_io_vec->bv_page->mapping->host;
7876 ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
7877
7834 done->uptodate = 1; 7878 done->uptodate = 1;
7835 bio_for_each_segment_all(bvec, bio, i) 7879 bio_for_each_segment_all(bvec, bio, i)
7836 clean_io_failure(done->inode, done->start, bvec->bv_page, 0); 7880 clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
@@ -7842,25 +7886,35 @@ end:
7842static int __btrfs_correct_data_nocsum(struct inode *inode, 7886static int __btrfs_correct_data_nocsum(struct inode *inode,
7843 struct btrfs_io_bio *io_bio) 7887 struct btrfs_io_bio *io_bio)
7844{ 7888{
7889 struct btrfs_fs_info *fs_info;
7845 struct bio_vec *bvec; 7890 struct bio_vec *bvec;
7846 struct btrfs_retry_complete done; 7891 struct btrfs_retry_complete done;
7847 u64 start; 7892 u64 start;
7893 unsigned int pgoff;
7894 u32 sectorsize;
7895 int nr_sectors;
7848 int i; 7896 int i;
7849 int ret; 7897 int ret;
7850 7898
7899 fs_info = BTRFS_I(inode)->root->fs_info;
7900 sectorsize = BTRFS_I(inode)->root->sectorsize;
7901
7851 start = io_bio->logical; 7902 start = io_bio->logical;
7852 done.inode = inode; 7903 done.inode = inode;
7853 7904
7854 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 7905 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7855try_again: 7906 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
7907 pgoff = bvec->bv_offset;
7908
7909next_block_or_try_again:
7856 done.uptodate = 0; 7910 done.uptodate = 0;
7857 done.start = start; 7911 done.start = start;
7858 init_completion(&done.done); 7912 init_completion(&done.done);
7859 7913
7860 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, 7914 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
7861 start + bvec->bv_len - 1, 7915 pgoff, start, start + sectorsize - 1,
7862 io_bio->mirror_num, 7916 io_bio->mirror_num,
7863 btrfs_retry_endio_nocsum, &done); 7917 btrfs_retry_endio_nocsum, &done);
7864 if (ret) 7918 if (ret)
7865 return ret; 7919 return ret;
7866 7920
@@ -7868,10 +7922,15 @@ try_again:
7868 7922
7869 if (!done.uptodate) { 7923 if (!done.uptodate) {
7870 /* We might have another mirror, so try again */ 7924 /* We might have another mirror, so try again */
7871 goto try_again; 7925 goto next_block_or_try_again;
7872 } 7926 }
7873 7927
7874 start += bvec->bv_len; 7928 start += sectorsize;
7929
7930 if (nr_sectors--) {
7931 pgoff += sectorsize;
7932 goto next_block_or_try_again;
7933 }
7875 } 7934 }
7876 7935
7877 return 0; 7936 return 0;
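The rewritten loop retries reads one sector at a time instead of one bio_vec at a time, so a single bad 4K block inside a larger page can be repaired from another mirror on its own. A skeleton of the per-sector walk, normalized from the hunk's goto form (mirror retries and completion waits elided):

    #include <stdint.h>

    #define SECTORSIZE 4096u   /* assumption: root->sectorsize */

    struct bvec { uint32_t bv_len; uint32_t bv_offset; };

    /* Visit every sector of one bio_vec, as the repair loop now does;
     * each visit would issue dio_read_error() against another mirror. */
    static void for_each_sector(const struct bvec *bvec, uint64_t *start,
                                void (*repair)(uint64_t start, uint32_t pgoff))
    {
            uint32_t nr_sectors = bvec->bv_len / SECTORSIZE;
            uint32_t pgoff = bvec->bv_offset;

            while (nr_sectors--) {
                    repair(*start, pgoff);
                    *start += SECTORSIZE;
                    pgoff += SECTORSIZE;
            }
    }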
@@ -7881,7 +7940,9 @@ static void btrfs_retry_endio(struct bio *bio)
7881{ 7940{
7882 struct btrfs_retry_complete *done = bio->bi_private; 7941 struct btrfs_retry_complete *done = bio->bi_private;
7883 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 7942 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7943 struct inode *inode;
7884 struct bio_vec *bvec; 7944 struct bio_vec *bvec;
7945 u64 start;
7885 int uptodate; 7946 int uptodate;
7886 int ret; 7947 int ret;
7887 int i; 7948 int i;
@@ -7890,13 +7951,20 @@ static void btrfs_retry_endio(struct bio *bio)
7890 goto end; 7951 goto end;
7891 7952
7892 uptodate = 1; 7953 uptodate = 1;
7954
7955 start = done->start;
7956
7957 ASSERT(bio->bi_vcnt == 1);
7958 inode = bio->bi_io_vec->bv_page->mapping->host;
7959 ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
7960
7893 bio_for_each_segment_all(bvec, bio, i) { 7961 bio_for_each_segment_all(bvec, bio, i) {
7894 ret = __readpage_endio_check(done->inode, io_bio, i, 7962 ret = __readpage_endio_check(done->inode, io_bio, i,
7895 bvec->bv_page, 0, 7963 bvec->bv_page, bvec->bv_offset,
7896 done->start, bvec->bv_len); 7964 done->start, bvec->bv_len);
7897 if (!ret) 7965 if (!ret)
7898 clean_io_failure(done->inode, done->start, 7966 clean_io_failure(done->inode, done->start,
7899 bvec->bv_page, 0); 7967 bvec->bv_page, bvec->bv_offset);
7900 else 7968 else
7901 uptodate = 0; 7969 uptodate = 0;
7902 } 7970 }
@@ -7910,20 +7978,34 @@ end:
7910static int __btrfs_subio_endio_read(struct inode *inode, 7978static int __btrfs_subio_endio_read(struct inode *inode,
7911 struct btrfs_io_bio *io_bio, int err) 7979 struct btrfs_io_bio *io_bio, int err)
7912{ 7980{
7981 struct btrfs_fs_info *fs_info;
7913 struct bio_vec *bvec; 7982 struct bio_vec *bvec;
7914 struct btrfs_retry_complete done; 7983 struct btrfs_retry_complete done;
7915 u64 start; 7984 u64 start;
7916 u64 offset = 0; 7985 u64 offset = 0;
7986 u32 sectorsize;
7987 int nr_sectors;
7988 unsigned int pgoff;
7989 int csum_pos;
7917 int i; 7990 int i;
7918 int ret; 7991 int ret;
7919 7992
7993 fs_info = BTRFS_I(inode)->root->fs_info;
7994 sectorsize = BTRFS_I(inode)->root->sectorsize;
7995
7920 err = 0; 7996 err = 0;
7921 start = io_bio->logical; 7997 start = io_bio->logical;
7922 done.inode = inode; 7998 done.inode = inode;
7923 7999
7924 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 8000 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7925 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, 8001 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
7926 0, start, bvec->bv_len); 8002
8003 pgoff = bvec->bv_offset;
8004next_block:
8005 csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8006 ret = __readpage_endio_check(inode, io_bio, csum_pos,
8007 bvec->bv_page, pgoff, start,
8008 sectorsize);
7927 if (likely(!ret)) 8009 if (likely(!ret))
7928 goto next; 8010 goto next;
7929try_again: 8011try_again:
@@ -7931,10 +8013,10 @@ try_again:
7931 done.start = start; 8013 done.start = start;
7932 init_completion(&done.done); 8014 init_completion(&done.done);
7933 8015
7934 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, 8016 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
7935 start + bvec->bv_len - 1, 8017 pgoff, start, start + sectorsize - 1,
7936 io_bio->mirror_num, 8018 io_bio->mirror_num,
7937 btrfs_retry_endio, &done); 8019 btrfs_retry_endio, &done);
7938 if (ret) { 8020 if (ret) {
7939 err = ret; 8021 err = ret;
7940 goto next; 8022 goto next;
@@ -7947,8 +8029,15 @@ try_again:
7947 goto try_again; 8029 goto try_again;
7948 } 8030 }
7949next: 8031next:
7950 offset += bvec->bv_len; 8032 offset += sectorsize;
7951 start += bvec->bv_len; 8033 start += sectorsize;
8034
8035 ASSERT(nr_sectors);
8036
8037 if (--nr_sectors) {
8038 pgoff += sectorsize;
8039 goto next_block;
8040 }
7952 } 8041 }
7953 8042
7954 return err; 8043 return err;
@@ -8202,9 +8291,11 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8202 u64 file_offset = dip->logical_offset; 8291 u64 file_offset = dip->logical_offset;
8203 u64 submit_len = 0; 8292 u64 submit_len = 0;
8204 u64 map_length; 8293 u64 map_length;
8205 int nr_pages = 0; 8294 u32 blocksize = root->sectorsize;
8206 int ret;
8207 int async_submit = 0; 8295 int async_submit = 0;
8296 int nr_sectors;
8297 int ret;
8298 int i;
8208 8299
8209 map_length = orig_bio->bi_iter.bi_size; 8300 map_length = orig_bio->bi_iter.bi_size;
8210 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 8301 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
@@ -8234,9 +8325,12 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8234 atomic_inc(&dip->pending_bios); 8325 atomic_inc(&dip->pending_bios);
8235 8326
8236 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 8327 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
8237 if (map_length < submit_len + bvec->bv_len || 8328 nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len);
8238 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 8329 i = 0;
8239 bvec->bv_offset) < bvec->bv_len) { 8330next_block:
8331 if (unlikely(map_length < submit_len + blocksize ||
8332 bio_add_page(bio, bvec->bv_page, blocksize,
8333 bvec->bv_offset + (i * blocksize)) < blocksize)) {
8240 /* 8334 /*
8241 * inc the count before we submit the bio so 8335 * inc the count before we submit the bio so
8242 * we know the end IO handler won't happen before 8336 * we know the end IO handler won't happen before
@@ -8257,7 +8351,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8257 file_offset += submit_len; 8351 file_offset += submit_len;
8258 8352
8259 submit_len = 0; 8353 submit_len = 0;
8260 nr_pages = 0;
8261 8354
8262 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 8355 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
8263 start_sector, GFP_NOFS); 8356 start_sector, GFP_NOFS);
@@ -8275,9 +8368,14 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8275 bio_put(bio); 8368 bio_put(bio);
8276 goto out_err; 8369 goto out_err;
8277 } 8370 }
8371
8372 goto next_block;
8278 } else { 8373 } else {
8279 submit_len += bvec->bv_len; 8374 submit_len += blocksize;
8280 nr_pages++; 8375 if (--nr_sectors) {
8376 i++;
8377 goto next_block;
8378 }
8281 bvec++; 8379 bvec++;
8282 } 8380 }
8283 } 8381 }
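btrfs_submit_direct_hook now feeds the bio in blocksize chunks and splits it when the mapped stripe runs out mid-page. A compressed model of the per-block decision over one bio_vec (bio plumbing replaced by plain arithmetic):

    #include <stdint.h>

    #define BLOCKSIZE 4096ULL   /* assumption: root->sectorsize */

    /* How many whole blocks of this bio_vec fit before the mapped stripe
     * (map_length) runs out; the caller submits the bio there and starts
     * a new one for the rest, as the next_block loop above does. */
    static int blocks_that_fit(uint64_t map_length, uint64_t submit_len,
                               uint32_t bv_len)
    {
            int nr_sectors = (int)(bv_len / BLOCKSIZE);
            int fit = 0;

            while (fit < nr_sectors &&
                   map_length >= submit_len + (uint64_t)(fit + 1) * BLOCKSIZE)
                    fit++;
            return fit;
    }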
@@ -8642,6 +8740,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8642 struct extent_state *cached_state = NULL; 8740 struct extent_state *cached_state = NULL;
8643 u64 page_start = page_offset(page); 8741 u64 page_start = page_offset(page);
8644 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 8742 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
8743 u64 start;
8744 u64 end;
8645 int inode_evicting = inode->i_state & I_FREEING; 8745 int inode_evicting = inode->i_state & I_FREEING;
8646 8746
8647 /* 8747 /*
@@ -8661,14 +8761,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8661 8761
8662 if (!inode_evicting) 8762 if (!inode_evicting)
8663 lock_extent_bits(tree, page_start, page_end, &cached_state); 8763 lock_extent_bits(tree, page_start, page_end, &cached_state);
8664 ordered = btrfs_lookup_ordered_extent(inode, page_start); 8764again:
8765 start = page_start;
8766 ordered = btrfs_lookup_ordered_range(inode, start,
8767 page_end - start + 1);
8665 if (ordered) { 8768 if (ordered) {
8769 end = min(page_end, ordered->file_offset + ordered->len - 1);
8666 /* 8770 /*
8667 * IO on this page will never be started, so we need 8771 * IO on this page will never be started, so we need
8668 * to account for any ordered extents now 8772 * to account for any ordered extents now
8669 */ 8773 */
8670 if (!inode_evicting) 8774 if (!inode_evicting)
8671 clear_extent_bit(tree, page_start, page_end, 8775 clear_extent_bit(tree, start, end,
8672 EXTENT_DIRTY | EXTENT_DELALLOC | 8776 EXTENT_DIRTY | EXTENT_DELALLOC |
8673 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8777 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8674 EXTENT_DEFRAG, 1, 0, &cached_state, 8778 EXTENT_DEFRAG, 1, 0, &cached_state,
@@ -8685,22 +8789,26 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8685 8789
8686 spin_lock_irq(&tree->lock); 8790 spin_lock_irq(&tree->lock);
8687 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8791 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8688 new_len = page_start - ordered->file_offset; 8792 new_len = start - ordered->file_offset;
8689 if (new_len < ordered->truncated_len) 8793 if (new_len < ordered->truncated_len)
8690 ordered->truncated_len = new_len; 8794 ordered->truncated_len = new_len;
8691 spin_unlock_irq(&tree->lock); 8795 spin_unlock_irq(&tree->lock);
8692 8796
8693 if (btrfs_dec_test_ordered_pending(inode, &ordered, 8797 if (btrfs_dec_test_ordered_pending(inode, &ordered,
8694 page_start, 8798 start,
8695 PAGE_CACHE_SIZE, 1)) 8799 end - start + 1, 1))
8696 btrfs_finish_ordered_io(ordered); 8800 btrfs_finish_ordered_io(ordered);
8697 } 8801 }
8698 btrfs_put_ordered_extent(ordered); 8802 btrfs_put_ordered_extent(ordered);
8699 if (!inode_evicting) { 8803 if (!inode_evicting) {
8700 cached_state = NULL; 8804 cached_state = NULL;
8701 lock_extent_bits(tree, page_start, page_end, 8805 lock_extent_bits(tree, start, end,
8702 &cached_state); 8806 &cached_state);
8703 } 8807 }
8808
8809 start = end + 1;
8810 if (start < page_end)
8811 goto again;
8704 } 8812 }
8705 8813
8706 /* 8814 /*
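Because several ordered extents may now cover one page, invalidatepage walks them in file order, clipping each to the page. A sketch of one step of that walk:

    #include <stdint.h>

    struct ordered { uint64_t off, len; };

    static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

    /* Clip one ordered extent to the page and report where the "again:"
     * loop resumes; lookups repeat until the cursor passes page_end. */
    static uint64_t clean_one_ordered(uint64_t start, uint64_t page_end,
                                      const struct ordered *o)
    {
            uint64_t end = min_u64(page_end, o->off + o->len - 1);

            /* ...clear extent bits, finish ordered IO on [start, end]... */
            return end + 1;
    }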
@@ -8761,15 +8869,28 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
8761 loff_t size; 8869 loff_t size;
8762 int ret; 8870 int ret;
8763 int reserved = 0; 8871 int reserved = 0;
8872 u64 reserved_space;
8764 u64 page_start; 8873 u64 page_start;
8765 u64 page_end; 8874 u64 page_end;
8875 u64 end;
8876
8877 reserved_space = PAGE_CACHE_SIZE;
8766 8878
8767 sb_start_pagefault(inode->i_sb); 8879 sb_start_pagefault(inode->i_sb);
8768 page_start = page_offset(page); 8880 page_start = page_offset(page);
8769 page_end = page_start + PAGE_CACHE_SIZE - 1; 8881 page_end = page_start + PAGE_CACHE_SIZE - 1;
8882 end = page_end;
8770 8883
8884 /*
8885 * Reserving delalloc space after obtaining the page lock can lead to
8886 * deadlock. For example, if a dirty page is locked by this function
8887 * and the call to btrfs_delalloc_reserve_space() ends up triggering
8888 * dirty page write out, then the btrfs_writepage() function could
8889 * end up waiting indefinitely to get a lock on the page currently
8890 * being processed by btrfs_page_mkwrite() function.
8891 */
8771 ret = btrfs_delalloc_reserve_space(inode, page_start, 8892 ret = btrfs_delalloc_reserve_space(inode, page_start,
8772 PAGE_CACHE_SIZE); 8893 reserved_space);
8773 if (!ret) { 8894 if (!ret) {
8774 ret = file_update_time(vma->vm_file); 8895 ret = file_update_time(vma->vm_file);
8775 reserved = 1; 8896 reserved = 1;
@@ -8803,7 +8924,7 @@ again:
8803 * we can't set the delalloc bits if there are pending ordered 8924 * we can't set the delalloc bits if there are pending ordered
8804 * extents. Drop our locks and wait for them to finish 8925 * extents. Drop our locks and wait for them to finish
8805 */ 8926 */
8806 ordered = btrfs_lookup_ordered_extent(inode, page_start); 8927 ordered = btrfs_lookup_ordered_range(inode, page_start, page_end);
8807 if (ordered) { 8928 if (ordered) {
8808 unlock_extent_cached(io_tree, page_start, page_end, 8929 unlock_extent_cached(io_tree, page_start, page_end,
8809 &cached_state, GFP_NOFS); 8930 &cached_state, GFP_NOFS);
@@ -8813,6 +8934,18 @@ again:
8813 goto again; 8934 goto again;
8814 } 8935 }
8815 8936
8937 if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) {
8938 reserved_space = round_up(size - page_start, root->sectorsize);
8939 if (reserved_space < PAGE_CACHE_SIZE) {
8940 end = page_start + reserved_space - 1;
8941 spin_lock(&BTRFS_I(inode)->lock);
8942 BTRFS_I(inode)->outstanding_extents++;
8943 spin_unlock(&BTRFS_I(inode)->lock);
8944 btrfs_delalloc_release_space(inode, page_start,
8945 PAGE_CACHE_SIZE - reserved_space);
8946 }
8947 }
8948
8816 /* 8949 /*
8817 * XXX - page_mkwrite gets called every time the page is dirtied, even 8950 * XXX - page_mkwrite gets called every time the page is dirtied, even
8818 * if it was already dirty, so for space accounting reasons we need to 8951 * if it was already dirty, so for space accounting reasons we need to
@@ -8820,12 +8953,12 @@ again:
8820 * is probably a better way to do this, but for now keep consistent with 8953 * is probably a better way to do this, but for now keep consistent with
8821 * prepare_pages in the normal write path. 8954 * prepare_pages in the normal write path.
8822 */ 8955 */
8823 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 8956 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8824 EXTENT_DIRTY | EXTENT_DELALLOC | 8957 EXTENT_DIRTY | EXTENT_DELALLOC |
8825 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 8958 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
8826 0, 0, &cached_state, GFP_NOFS); 8959 0, 0, &cached_state, GFP_NOFS);
8827 8960
8828 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 8961 ret = btrfs_set_extent_delalloc(inode, page_start, end,
8829 &cached_state); 8962 &cached_state);
8830 if (ret) { 8963 if (ret) {
8831 unlock_extent_cached(io_tree, page_start, page_end, 8964 unlock_extent_cached(io_tree, page_start, page_end,
@@ -8864,7 +8997,7 @@ out_unlock:
8864 } 8997 }
8865 unlock_page(page); 8998 unlock_page(page);
8866out: 8999out:
8867 btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); 9000 btrfs_delalloc_release_space(inode, page_start, reserved_space);
8868out_noreserve: 9001out_noreserve:
8869 sb_end_pagefault(inode->i_sb); 9002 sb_end_pagefault(inode->i_sb);
8870 return ret; 9003 return ret;
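The reserved_space trimming earlier in this function (and the release on the out path just above) only matters on the file's last page, where blocks past the rounded-up i_size need no delalloc reservation. A sketch with an assumed 64K page over 4K blocks:

    #include <stdint.h>

    #define PAGE_SIZE_ 65536ULL   /* assumed 64K page */
    #define BLOCKSIZE  4096ULL

    static uint64_t round_up_u64(uint64_t x, uint64_t a)
    {
            return (x + a - 1) & ~(a - 1);
    }

    /* On the file's last page only the blocks up to the rounded-up EOF
     * need delalloc reservation; the page tail past EOF is released. */
    static uint64_t mkwrite_reserved_space(uint64_t size, uint64_t page_start)
    {
            uint64_t reserved = round_up_u64(size - page_start, BLOCKSIZE);

            return reserved < PAGE_SIZE_ ? reserved : PAGE_SIZE_;
    }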
@@ -9190,16 +9323,11 @@ void btrfs_destroy_cachep(void)
9190 * destroy cache. 9323 * destroy cache.
9191 */ 9324 */
9192 rcu_barrier(); 9325 rcu_barrier();
9193 if (btrfs_inode_cachep) 9326 kmem_cache_destroy(btrfs_inode_cachep);
9194 kmem_cache_destroy(btrfs_inode_cachep); 9327 kmem_cache_destroy(btrfs_trans_handle_cachep);
9195 if (btrfs_trans_handle_cachep) 9328 kmem_cache_destroy(btrfs_transaction_cachep);
9196 kmem_cache_destroy(btrfs_trans_handle_cachep); 9329 kmem_cache_destroy(btrfs_path_cachep);
9197 if (btrfs_transaction_cachep) 9330 kmem_cache_destroy(btrfs_free_space_cachep);
9198 kmem_cache_destroy(btrfs_transaction_cachep);
9199 if (btrfs_path_cachep)
9200 kmem_cache_destroy(btrfs_path_cachep);
9201 if (btrfs_free_space_cachep)
9202 kmem_cache_destroy(btrfs_free_space_cachep);
9203} 9331}
9204 9332
9205int btrfs_init_cachep(void) 9333int btrfs_init_cachep(void)
@@ -9250,7 +9378,6 @@ static int btrfs_getattr(struct vfsmount *mnt,
9250 9378
9251 generic_fillattr(inode, stat); 9379 generic_fillattr(inode, stat);
9252 stat->dev = BTRFS_I(inode)->root->anon_dev; 9380 stat->dev = BTRFS_I(inode)->root->anon_dev;
9253 stat->blksize = PAGE_CACHE_SIZE;
9254 9381
9255 spin_lock(&BTRFS_I(inode)->lock); 9382 spin_lock(&BTRFS_I(inode)->lock);
9256 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; 9383 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
@@ -9268,7 +9395,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9268 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9395 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9269 struct inode *new_inode = d_inode(new_dentry); 9396 struct inode *new_inode = d_inode(new_dentry);
9270 struct inode *old_inode = d_inode(old_dentry); 9397 struct inode *old_inode = d_inode(old_dentry);
9271 struct timespec ctime = CURRENT_TIME;
9272 u64 index = 0; 9398 u64 index = 0;
9273 u64 root_objectid; 9399 u64 root_objectid;
9274 int ret; 9400 int ret;
@@ -9365,9 +9491,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9365 inode_inc_iversion(old_dir); 9491 inode_inc_iversion(old_dir);
9366 inode_inc_iversion(new_dir); 9492 inode_inc_iversion(new_dir);
9367 inode_inc_iversion(old_inode); 9493 inode_inc_iversion(old_inode);
9368 old_dir->i_ctime = old_dir->i_mtime = ctime; 9494 old_dir->i_ctime = old_dir->i_mtime =
9369 new_dir->i_ctime = new_dir->i_mtime = ctime; 9495 new_dir->i_ctime = new_dir->i_mtime =
9370 old_inode->i_ctime = ctime; 9496 old_inode->i_ctime = current_fs_time(old_dir->i_sb);
9371 9497
9372 if (old_dentry->d_parent != new_dentry->d_parent) 9498 if (old_dentry->d_parent != new_dentry->d_parent)
9373 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 9499 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
@@ -9392,7 +9518,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9392 9518
9393 if (new_inode) { 9519 if (new_inode) {
9394 inode_inc_iversion(new_inode); 9520 inode_inc_iversion(new_inode);
9395 new_inode->i_ctime = CURRENT_TIME; 9521 new_inode->i_ctime = current_fs_time(new_inode->i_sb);
9396 if (unlikely(btrfs_ino(new_inode) == 9522 if (unlikely(btrfs_ino(new_inode) ==
9397 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 9523 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9398 root_objectid = BTRFS_I(new_inode)->location.objectid; 9524 root_objectid = BTRFS_I(new_inode)->location.objectid;
@@ -9870,7 +9996,7 @@ next:
9870 *alloc_hint = ins.objectid + ins.offset; 9996 *alloc_hint = ins.objectid + ins.offset;
9871 9997
9872 inode_inc_iversion(inode); 9998 inode_inc_iversion(inode);
9873 inode->i_ctime = CURRENT_TIME; 9999 inode->i_ctime = current_fs_time(inode->i_sb);
9874 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 10000 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
9875 if (!(mode & FALLOC_FL_KEEP_SIZE) && 10001 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9876 (actual_len > inode->i_size) && 10002 (actual_len > inode->i_size) &&
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 48aee9846329..053e677839fe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,8 @@
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61#include "qgroup.h" 61#include "qgroup.h"
62#include "tree-log.h"
63#include "compression.h"
62 64
63#ifdef CONFIG_64BIT 65#ifdef CONFIG_64BIT
64/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI 66/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -347,7 +349,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
347 349
348 btrfs_update_iflags(inode); 350 btrfs_update_iflags(inode);
349 inode_inc_iversion(inode); 351 inode_inc_iversion(inode);
350 inode->i_ctime = CURRENT_TIME; 352 inode->i_ctime = current_fs_time(inode->i_sb);
351 ret = btrfs_update_inode(trans, root, inode); 353 ret = btrfs_update_inode(trans, root, inode);
352 354
353 btrfs_end_transaction(trans, root); 355 btrfs_end_transaction(trans, root);
@@ -443,7 +445,7 @@ static noinline int create_subvol(struct inode *dir,
443 struct btrfs_root *root = BTRFS_I(dir)->root; 445 struct btrfs_root *root = BTRFS_I(dir)->root;
444 struct btrfs_root *new_root; 446 struct btrfs_root *new_root;
445 struct btrfs_block_rsv block_rsv; 447 struct btrfs_block_rsv block_rsv;
446 struct timespec cur_time = CURRENT_TIME; 448 struct timespec cur_time = current_fs_time(dir->i_sb);
447 struct inode *inode; 449 struct inode *inode;
448 int ret; 450 int ret;
449 int err; 451 int err;
@@ -844,10 +846,6 @@ static noinline int btrfs_mksubvol(struct path *parent,
844 if (IS_ERR(dentry)) 846 if (IS_ERR(dentry))
845 goto out_unlock; 847 goto out_unlock;
846 848
847 error = -EEXIST;
848 if (d_really_is_positive(dentry))
849 goto out_dput;
850
851 error = btrfs_may_create(dir, dentry); 849 error = btrfs_may_create(dir, dentry);
852 if (error) 850 if (error)
853 goto out_dput; 851 goto out_dput;
@@ -2097,8 +2095,6 @@ static noinline int search_ioctl(struct inode *inode,
2097 key.offset = (u64)-1; 2095 key.offset = (u64)-1;
2098 root = btrfs_read_fs_root_no_name(info, &key); 2096 root = btrfs_read_fs_root_no_name(info, &key);
2099 if (IS_ERR(root)) { 2097 if (IS_ERR(root)) {
2100 btrfs_err(info, "could not find root %llu",
2101 sk->tree_id);
2102 btrfs_free_path(path); 2098 btrfs_free_path(path);
2103 return -ENOENT; 2099 return -ENOENT;
2104 } 2100 }
@@ -2476,6 +2472,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2476 trans->block_rsv = &block_rsv; 2472 trans->block_rsv = &block_rsv;
2477 trans->bytes_reserved = block_rsv.size; 2473 trans->bytes_reserved = block_rsv.size;
2478 2474
2475 btrfs_record_snapshot_destroy(trans, dir);
2476
2479 ret = btrfs_unlink_subvol(trans, root, dir, 2477 ret = btrfs_unlink_subvol(trans, root, dir,
2480 dest->root_key.objectid, 2478 dest->root_key.objectid,
2481 dentry->d_name.name, 2479 dentry->d_name.name,
@@ -2960,8 +2958,8 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
2960 * of the array is bounded by len, which is in turn bounded by 2958 * of the array is bounded by len, which is in turn bounded by
2961 * BTRFS_MAX_DEDUPE_LEN. 2959 * BTRFS_MAX_DEDUPE_LEN.
2962 */ 2960 */
2963 src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); 2961 src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
2964 dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); 2962 dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
2965 if (!src_pgarr || !dst_pgarr) { 2963 if (!src_pgarr || !dst_pgarr) {
2966 kfree(src_pgarr); 2964 kfree(src_pgarr);
2967 kfree(dst_pgarr); 2965 kfree(dst_pgarr);
@@ -3068,6 +3066,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
3068 ret = extent_same_check_offsets(src, loff, &len, olen); 3066 ret = extent_same_check_offsets(src, loff, &len, olen);
3069 if (ret) 3067 if (ret)
3070 goto out_unlock; 3068 goto out_unlock;
3069 ret = extent_same_check_offsets(src, dst_loff, &len, olen);
3070 if (ret)
3071 goto out_unlock;
3071 3072
3072 /* 3073 /*
3073 * Single inode case wants the same checks, except we 3074 * Single inode case wants the same checks, except we
@@ -3217,7 +3218,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
3217 3218
3218 inode_inc_iversion(inode); 3219 inode_inc_iversion(inode);
3219 if (!no_time_update) 3220 if (!no_time_update)
3220 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3221 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
3221 /* 3222 /*
3222 * We round up to the block size at eof when determining which 3223 * We round up to the block size at eof when determining which
3223 * extents to clone above, but shouldn't round up the file size. 3224 * extents to clone above, but shouldn't round up the file size.
@@ -3889,8 +3890,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
3889 * Truncate page cache pages so that future reads will see the cloned 3890 * Truncate page cache pages so that future reads will see the cloned
3890 * data immediately and not the previous data. 3891 * data immediately and not the previous data.
3891 */ 3892 */
3892 truncate_inode_pages_range(&inode->i_data, destoff, 3893 truncate_inode_pages_range(&inode->i_data,
3893 PAGE_CACHE_ALIGN(destoff + len) - 1); 3894 round_down(destoff, PAGE_CACHE_SIZE),
3895 round_up(destoff + len, PAGE_CACHE_SIZE) - 1);
3894out_unlock: 3896out_unlock:
3895 if (!same_inode) 3897 if (!same_inode)
3896 btrfs_double_inode_unlock(src, inode); 3898 btrfs_double_inode_unlock(src, inode);
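The rounding fix matters when destoff is not page aligned: truncating the cache from an unaligned offset leaves the head of the first page stale, so the range is widened to whole pages on both ends. A quick check of the arithmetic:

    #include <stdint.h>

    #define PAGE_SIZE_ 4096ULL

    /* Page-aligned [*lo, *hi] whose cache must be dropped so reads see
     * the cloned data; destoff=5000, len=100 yields [4096, 8191], where
     * the old unaligned start of 5000 left bytes 4096..4999 of that page
     * stale in the cache. */
    static void clone_trunc_range(uint64_t destoff, uint64_t len,
                                  uint64_t *lo, uint64_t *hi)
    {
            *lo = destoff & ~(PAGE_SIZE_ - 1);
            *hi = ((destoff + len + PAGE_SIZE_ - 1) & ~(PAGE_SIZE_ - 1)) - 1;
    }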
@@ -5031,7 +5033,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
5031 struct btrfs_root *root = BTRFS_I(inode)->root; 5033 struct btrfs_root *root = BTRFS_I(inode)->root;
5032 struct btrfs_root_item *root_item = &root->root_item; 5034 struct btrfs_root_item *root_item = &root->root_item;
5033 struct btrfs_trans_handle *trans; 5035 struct btrfs_trans_handle *trans;
5034 struct timespec ct = CURRENT_TIME; 5036 struct timespec ct = current_fs_time(inode->i_sb);
5035 int ret = 0; 5037 int ret = 0;
5036 int received_uuid_changed; 5038 int received_uuid_changed;
5037 5039
@@ -5262,8 +5264,7 @@ out_unlock:
5262 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ 5264 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
5263 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } 5265 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
5264 5266
5265static int btrfs_ioctl_get_supported_features(struct file *file, 5267int btrfs_ioctl_get_supported_features(void __user *arg)
5266 void __user *arg)
5267{ 5268{
5268 static const struct btrfs_ioctl_feature_flags features[3] = { 5269 static const struct btrfs_ioctl_feature_flags features[3] = {
5269 INIT_FEATURE_FLAGS(SUPP), 5270 INIT_FEATURE_FLAGS(SUPP),
@@ -5542,7 +5543,7 @@ long btrfs_ioctl(struct file *file, unsigned int
5542 case BTRFS_IOC_SET_FSLABEL: 5543 case BTRFS_IOC_SET_FSLABEL:
5543 return btrfs_ioctl_set_fslabel(file, argp); 5544 return btrfs_ioctl_set_fslabel(file, argp);
5544 case BTRFS_IOC_GET_SUPPORTED_FEATURES: 5545 case BTRFS_IOC_GET_SUPPORTED_FEATURES:
5545 return btrfs_ioctl_get_supported_features(file, argp); 5546 return btrfs_ioctl_get_supported_features(argp);
5546 case BTRFS_IOC_GET_FEATURES: 5547 case BTRFS_IOC_GET_FEATURES:
5547 return btrfs_ioctl_get_features(file, argp); 5548 return btrfs_ioctl_get_features(file, argp);
5548 case BTRFS_IOC_SET_FEATURES: 5549 case BTRFS_IOC_SET_FEATURES:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 8c27292ea9ea..0de7da5a610d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,7 @@
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27#include "disk-io.h" 27#include "disk-io.h"
28#include "compression.h"
28 29
29static struct kmem_cache *btrfs_ordered_extent_cache; 30static struct kmem_cache *btrfs_ordered_extent_cache;
30 31
@@ -1009,7 +1010,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
1009 for (; node; node = rb_prev(node)) { 1010 for (; node; node = rb_prev(node)) {
1010 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 1011 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
1011 1012
1012 /* We treat this entry as if it doesnt exist */ 1013 /* We treat this entry as if it doesn't exist */
1013 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) 1014 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
1014 continue; 1015 continue;
1015 if (test->file_offset + test->len <= disk_i_size) 1016 if (test->file_offset + test->len <= disk_i_size)
@@ -1114,6 +1115,5 @@ int __init ordered_data_init(void)
1114 1115
1115void ordered_data_exit(void) 1116void ordered_data_exit(void)
1116{ 1117{
1117 if (btrfs_ordered_extent_cache) 1118 kmem_cache_destroy(btrfs_ordered_extent_cache);
1118 kmem_cache_destroy(btrfs_ordered_extent_cache);
1119} 1119}
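
The dropped guard relies on kmem_cache_destroy() treating a NULL cache as a no-op, the same contract kfree() offers, so module exit paths can call it unconditionally. The convention, sketched rather than quoted from the slab code:

/*
 * Sketch of the convention, not the slab implementation itself: a NULL
 * cache returns immediately, so callers need no guard on exit paths.
 */
void kmem_cache_destroy(struct kmem_cache *s)
{
        if (!s)
                return;
        /* ... real teardown ... */
}
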
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 647ab12fdf5d..147dc6ca5de1 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -295,8 +295,27 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
295 btrfs_dev_extent_chunk_offset(l, dev_extent), 295 btrfs_dev_extent_chunk_offset(l, dev_extent),
296 btrfs_dev_extent_length(l, dev_extent)); 296 btrfs_dev_extent_length(l, dev_extent));
297 break; 297 break;
298 case BTRFS_DEV_STATS_KEY: 298 case BTRFS_PERSISTENT_ITEM_KEY:
299 printk(KERN_INFO "\t\tdevice stats\n"); 299 printk(KERN_INFO "\t\tpersistent item objectid %llu offset %llu\n",
300 key.objectid, key.offset);
301 switch (key.objectid) {
302 case BTRFS_DEV_STATS_OBJECTID:
303 printk(KERN_INFO "\t\tdevice stats\n");
304 break;
305 default:
306 printk(KERN_INFO "\t\tunknown persistent item\n");
307 }
308 break;
309 case BTRFS_TEMPORARY_ITEM_KEY:
310 printk(KERN_INFO "\t\ttemporary item objectid %llu offset %llu\n",
311 key.objectid, key.offset);
312 switch (key.objectid) {
313 case BTRFS_BALANCE_OBJECTID:
314 printk(KERN_INFO "\t\tbalance status\n");
315 break;
316 default:
317 printk(KERN_INFO "\t\tunknown temporary item\n");
318 }
300 break; 319 break;
301 case BTRFS_DEV_REPLACE_KEY: 320 case BTRFS_DEV_REPLACE_KEY:
302 printk(KERN_INFO "\t\tdev replace\n"); 321 printk(KERN_INFO "\t\tdev replace\n");
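
BTRFS_DEV_STATS_KEY is generalized into BTRFS_PERSISTENT_ITEM_KEY, with BTRFS_TEMPORARY_ITEM_KEY as the transient counterpart: the 8-bit key type now names a family and the objectid selects the concrete payload, so new item kinds stop consuming scarce type values. For orientation, the key is the usual triple:

/*
 * Search keys sort by (objectid, type, offset); dispatching on the
 * objectid within one reserved type keeps the small type space free
 * (declaration as in ctree.h of this era).
 */
struct btrfs_key {
        __u64 objectid;
        __u8 type;
        __u64 offset;
} __attribute__ ((__packed__));
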
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index f9e60231f685..36992128c746 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -22,6 +22,7 @@
22#include "hash.h" 22#include "hash.h"
23#include "transaction.h" 23#include "transaction.h"
24#include "xattr.h" 24#include "xattr.h"
25#include "compression.h"
25 26
26#define BTRFS_PROP_HANDLERS_HT_BITS 8 27#define BTRFS_PROP_HANDLERS_HT_BITS 8
27static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); 28static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 619f92963e27..b892914968c1 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -72,7 +72,7 @@ struct reada_extent {
72 spinlock_t lock; 72 spinlock_t lock;
73 struct reada_zone *zones[BTRFS_MAX_MIRRORS]; 73 struct reada_zone *zones[BTRFS_MAX_MIRRORS];
74 int nzones; 74 int nzones;
75 struct btrfs_device *scheduled_for; 75 int scheduled;
76}; 76};
77 77
78struct reada_zone { 78struct reada_zone {
@@ -101,67 +101,53 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info); 101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102 102
103static int reada_add_block(struct reada_control *rc, u64 logical, 103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation); 104 struct btrfs_key *top, u64 generation);
105 105
106/* recurses */ 106/* recurses */
107/* in case of err, eb might be NULL */ 107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 108static void __readahead_hook(struct btrfs_fs_info *fs_info,
109 u64 start, int err) 109 struct reada_extent *re, struct extent_buffer *eb,
110 u64 start, int err)
110{ 111{
111 int level = 0; 112 int level = 0;
112 int nritems; 113 int nritems;
113 int i; 114 int i;
114 u64 bytenr; 115 u64 bytenr;
115 u64 generation; 116 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list; 117 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121 118
122 if (eb) 119 if (eb)
123 level = btrfs_header_level(eb); 120 level = btrfs_header_level(eb);
124 121
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 re->refcnt++;
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock); 122 spin_lock(&re->lock);
136 /* 123 /*
137 * just take the full list from the extent. afterwards we 124 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore 125 * don't need the lock anymore
139 */ 126 */
140 list_replace_init(&re->extctl, &list); 127 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for; 128 re->scheduled = 0;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock); 129 spin_unlock(&re->lock);
144 130
145 if (err == 0) { 131 /*
146 nritems = level ? btrfs_header_nritems(eb) : 0; 132 * this is the error case, the extent buffer has not been
147 generation = btrfs_header_generation(eb); 133 * read correctly. We won't access anything from it and
148 /* 134 * just cleanup our data structures. Effectively this will
149 * FIXME: currently we just set nritems to 0 if this is a leaf, 135 * cut the branch below this node from read ahead.
150 * effectively ignoring the content. In a next step we could 136 */
151 * trigger more readahead depending from the content, e.g. 137 if (err)
152 * fetch the checksums for the extents in the leaf. 138 goto cleanup;
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
158 * just cleanup our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164 139
140 /*
141 * FIXME: currently we just set nritems to 0 if this is a leaf,
142 * effectively ignoring the content. In a next step we could
143 * trigger more readahead depending from the content, e.g.
144 * fetch the checksums for the extents in the leaf.
145 */
146 if (!level)
147 goto cleanup;
148
149 nritems = btrfs_header_nritems(eb);
150 generation = btrfs_header_generation(eb);
165 for (i = 0; i < nritems; i++) { 151 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec; 152 struct reada_extctl *rec;
167 u64 n_gen; 153 u64 n_gen;
@@ -188,19 +174,20 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
188 */ 174 */
189#ifdef DEBUG 175#ifdef DEBUG
190 if (rec->generation != generation) { 176 if (rec->generation != generation) {
191 btrfs_debug(root->fs_info, 177 btrfs_debug(fs_info,
192 "generation mismatch for (%llu,%d,%llu) %llu != %llu", 178 "generation mismatch for (%llu,%d,%llu) %llu != %llu",
193 key.objectid, key.type, key.offset, 179 key.objectid, key.type, key.offset,
194 rec->generation, generation); 180 rec->generation, generation);
195 } 181 }
196#endif 182#endif
197 if (rec->generation == generation && 183 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && 184 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) 185 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key, 186 reada_add_block(rc, bytenr, &next_key, n_gen);
201 level - 1, n_gen);
202 } 187 }
203 } 188 }
189
190cleanup:
204 /* 191 /*
205 * free extctl records 192 * free extctl records
206 */ 193 */
@@ -222,26 +209,37 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
222 209
223 reada_extent_put(fs_info, re); /* one ref for each entry */ 210 reada_extent_put(fs_info, re); /* one ref for each entry */
224 } 211 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228 212
229 return 0; 213 return;
230} 214}
231 215
232/* 216/*
233 * start is passed separately in case eb is NULL, which may be the case with 217 * start is passed separately in case eb is NULL, which may be the case with
234 * failed I/O 218 * failed I/O
235 */ 219 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 220int btree_readahead_hook(struct btrfs_fs_info *fs_info,
237 u64 start, int err) 221 struct extent_buffer *eb, u64 start, int err)
238{ 222{
239 int ret; 223 int ret = 0;
224 struct reada_extent *re;
240 225
241 ret = __readahead_hook(root, eb, start, err); 226 /* find extent */
227 spin_lock(&fs_info->reada_lock);
228 re = radix_tree_lookup(&fs_info->reada_tree,
229 start >> PAGE_CACHE_SHIFT);
230 if (re)
231 re->refcnt++;
232 spin_unlock(&fs_info->reada_lock);
233 if (!re) {
234 ret = -1;
235 goto start_machine;
236 }
242 237
243 reada_start_machine(root->fs_info); 238 __readahead_hook(fs_info, re, eb, start, err);
239 reada_extent_put(fs_info, re); /* our ref */
244 240
241start_machine:
242 reada_start_machine(fs_info);
245 return ret; 243 return ret;
246} 244}
247 245
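
After this refactor the radix tree lookup, reference pinning, and release happen exactly once, in btree_readahead_hook(); __readahead_hook() receives the pinned reada_extent, and the worker path passes the one it already holds. The lock-only-to-pin pattern, distilled:

/*
 * Pin the object under the lock, work on it unlocked, drop the
 * reference when done; the lookup now happens exactly once.
 */
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree, start >> PAGE_CACHE_SHIFT);
if (re)
        re->refcnt++;           /* pin while we use it */
spin_unlock(&fs_info->reada_lock);

if (re) {
        __readahead_hook(fs_info, re, eb, start, err);
        reada_extent_put(fs_info, re);  /* unpin */
}
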
@@ -260,18 +258,14 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
260 spin_lock(&fs_info->reada_lock); 258 spin_lock(&fs_info->reada_lock);
261 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, 259 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
262 logical >> PAGE_CACHE_SHIFT, 1); 260 logical >> PAGE_CACHE_SHIFT, 1);
263 if (ret == 1) 261 if (ret == 1 && logical >= zone->start && logical <= zone->end) {
264 kref_get(&zone->refcnt); 262 kref_get(&zone->refcnt);
265 spin_unlock(&fs_info->reada_lock);
266
267 if (ret == 1) {
268 if (logical >= zone->start && logical < zone->end)
269 return zone;
270 spin_lock(&fs_info->reada_lock);
271 kref_put(&zone->refcnt, reada_zone_release);
272 spin_unlock(&fs_info->reada_lock); 263 spin_unlock(&fs_info->reada_lock);
264 return zone;
273 } 265 }
274 266
267 spin_unlock(&fs_info->reada_lock);
268
275 cache = btrfs_lookup_block_group(fs_info, logical); 269 cache = btrfs_lookup_block_group(fs_info, logical);
276 if (!cache) 270 if (!cache)
277 return NULL; 271 return NULL;
@@ -280,7 +274,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
280 end = start + cache->key.offset - 1; 274 end = start + cache->key.offset - 1;
281 btrfs_put_block_group(cache); 275 btrfs_put_block_group(cache);
282 276
283 zone = kzalloc(sizeof(*zone), GFP_NOFS); 277 zone = kzalloc(sizeof(*zone), GFP_KERNEL);
284 if (!zone) 278 if (!zone)
285 return NULL; 279 return NULL;
286 280
@@ -307,8 +301,10 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
307 kfree(zone); 301 kfree(zone);
308 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, 302 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
309 logical >> PAGE_CACHE_SHIFT, 1); 303 logical >> PAGE_CACHE_SHIFT, 1);
310 if (ret == 1) 304 if (ret == 1 && logical >= zone->start && logical <= zone->end)
311 kref_get(&zone->refcnt); 305 kref_get(&zone->refcnt);
306 else
307 zone = NULL;
312 } 308 }
313 spin_unlock(&fs_info->reada_lock); 309 spin_unlock(&fs_info->reada_lock);
314 310
@@ -317,7 +313,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
317 313
318static struct reada_extent *reada_find_extent(struct btrfs_root *root, 314static struct reada_extent *reada_find_extent(struct btrfs_root *root,
319 u64 logical, 315 u64 logical,
320 struct btrfs_key *top, int level) 316 struct btrfs_key *top)
321{ 317{
322 int ret; 318 int ret;
323 struct reada_extent *re = NULL; 319 struct reada_extent *re = NULL;
@@ -330,9 +326,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
330 u64 length; 326 u64 length;
331 int real_stripes; 327 int real_stripes;
332 int nzones = 0; 328 int nzones = 0;
333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 329 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing; 330 int dev_replace_is_ongoing;
331 int have_zone = 0;
336 332
337 spin_lock(&fs_info->reada_lock); 333 spin_lock(&fs_info->reada_lock);
338 re = radix_tree_lookup(&fs_info->reada_tree, index); 334 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -343,7 +339,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
343 if (re) 339 if (re)
344 return re; 340 return re;
345 341
346 re = kzalloc(sizeof(*re), GFP_NOFS); 342 re = kzalloc(sizeof(*re), GFP_KERNEL);
347 if (!re) 343 if (!re)
348 return NULL; 344 return NULL;
349 345
@@ -375,11 +371,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
375 struct reada_zone *zone; 371 struct reada_zone *zone;
376 372
377 dev = bbio->stripes[nzones].dev; 373 dev = bbio->stripes[nzones].dev;
374
375 /* cannot read ahead on missing device. */
376 if (!dev->bdev)
377 continue;
378
378 zone = reada_find_zone(fs_info, dev, logical, bbio); 379 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone) 380 if (!zone)
380 break; 381 continue;
381 382
382 re->zones[nzones] = zone; 383 re->zones[re->nzones++] = zone;
383 spin_lock(&zone->lock); 384 spin_lock(&zone->lock);
384 if (!zone->elems) 385 if (!zone->elems)
385 kref_get(&zone->refcnt); 386 kref_get(&zone->refcnt);
@@ -389,14 +390,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
389 kref_put(&zone->refcnt, reada_zone_release); 390 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock); 391 spin_unlock(&fs_info->reada_lock);
391 } 392 }
392 re->nzones = nzones; 393 if (re->nzones == 0) {
393 if (nzones == 0) {
394 /* not a single zone found, error and out */ 394 /* not a single zone found, error and out */
395 goto error; 395 goto error;
396 } 396 }
397 397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */ 398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 btrfs_dev_replace_lock(&fs_info->dev_replace); 399 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
400 spin_lock(&fs_info->reada_lock); 400 spin_lock(&fs_info->reada_lock);
401 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 401 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
402 if (ret == -EEXIST) { 402 if (ret == -EEXIST) {
@@ -404,19 +404,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
404 BUG_ON(!re_exist); 404 BUG_ON(!re_exist);
405 re_exist->refcnt++; 405 re_exist->refcnt++;
406 spin_unlock(&fs_info->reada_lock); 406 spin_unlock(&fs_info->reada_lock);
407 btrfs_dev_replace_unlock(&fs_info->dev_replace); 407 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
408 goto error; 408 goto error;
409 } 409 }
410 if (ret) { 410 if (ret) {
411 spin_unlock(&fs_info->reada_lock); 411 spin_unlock(&fs_info->reada_lock);
412 btrfs_dev_replace_unlock(&fs_info->dev_replace); 412 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
413 goto error; 413 goto error;
414 } 414 }
415 prev_dev = NULL; 415 prev_dev = NULL;
416 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( 416 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
417 &fs_info->dev_replace); 417 &fs_info->dev_replace);
418 for (i = 0; i < nzones; ++i) { 418 for (nzones = 0; nzones < re->nzones; ++nzones) {
419 dev = bbio->stripes[i].dev; 419 dev = re->zones[nzones]->device;
420
420 if (dev == prev_dev) { 421 if (dev == prev_dev) {
421 /* 422 /*
422 * in case of DUP, just add the first zone. As both 423 * in case of DUP, just add the first zone. As both
@@ -427,15 +428,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
427 */ 428 */
428 continue; 429 continue;
429 } 430 }
430 if (!dev->bdev) { 431 if (!dev->bdev)
431 /* 432 continue;
432 * cannot read ahead on missing device, but for RAID5/6, 433
433 * REQ_GET_READ_MIRRORS return 1. So don't skip missing
434 * device for such case.
435 */
436 if (nzones > 1)
437 continue;
438 }
439 if (dev_replace_is_ongoing && 434 if (dev_replace_is_ongoing &&
440 dev == fs_info->dev_replace.tgtdev) { 435 dev == fs_info->dev_replace.tgtdev) {
441 /* 436 /*
@@ -447,8 +442,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
447 prev_dev = dev; 442 prev_dev = dev;
448 ret = radix_tree_insert(&dev->reada_extents, index, re); 443 ret = radix_tree_insert(&dev->reada_extents, index, re);
449 if (ret) { 444 if (ret) {
450 while (--i >= 0) { 445 while (--nzones >= 0) {
451 dev = bbio->stripes[i].dev; 446 dev = re->zones[nzones]->device;
452 BUG_ON(dev == NULL); 447 BUG_ON(dev == NULL);
453 /* ignore whether the entry was inserted */ 448 /* ignore whether the entry was inserted */
454 radix_tree_delete(&dev->reada_extents, index); 449 radix_tree_delete(&dev->reada_extents, index);
@@ -456,21 +451,24 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
456 BUG_ON(fs_info == NULL); 451 BUG_ON(fs_info == NULL);
457 radix_tree_delete(&fs_info->reada_tree, index); 452 radix_tree_delete(&fs_info->reada_tree, index);
458 spin_unlock(&fs_info->reada_lock); 453 spin_unlock(&fs_info->reada_lock);
459 btrfs_dev_replace_unlock(&fs_info->dev_replace); 454 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
460 goto error; 455 goto error;
461 } 456 }
457 have_zone = 1;
462 } 458 }
463 spin_unlock(&fs_info->reada_lock); 459 spin_unlock(&fs_info->reada_lock);
464 btrfs_dev_replace_unlock(&fs_info->dev_replace); 460 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
461
462 if (!have_zone)
463 goto error;
465 464
466 btrfs_put_bbio(bbio); 465 btrfs_put_bbio(bbio);
467 return re; 466 return re;
468 467
469error: 468error:
470 while (nzones) { 469 for (nzones = 0; nzones < re->nzones; ++nzones) {
471 struct reada_zone *zone; 470 struct reada_zone *zone;
472 471
473 --nzones;
474 zone = re->zones[nzones]; 472 zone = re->zones[nzones];
475 kref_get(&zone->refcnt); 473 kref_get(&zone->refcnt);
476 spin_lock(&zone->lock); 474 spin_lock(&zone->lock);
@@ -531,8 +529,6 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
531 kref_put(&zone->refcnt, reada_zone_release); 529 kref_put(&zone->refcnt, reada_zone_release);
532 spin_unlock(&fs_info->reada_lock); 530 spin_unlock(&fs_info->reada_lock);
533 } 531 }
534 if (re->scheduled_for)
535 atomic_dec(&re->scheduled_for->reada_in_flight);
536 532
537 kfree(re); 533 kfree(re);
538} 534}
@@ -556,17 +552,17 @@ static void reada_control_release(struct kref *kref)
556} 552}
557 553
558static int reada_add_block(struct reada_control *rc, u64 logical, 554static int reada_add_block(struct reada_control *rc, u64 logical,
559 struct btrfs_key *top, int level, u64 generation) 555 struct btrfs_key *top, u64 generation)
560{ 556{
561 struct btrfs_root *root = rc->root; 557 struct btrfs_root *root = rc->root;
562 struct reada_extent *re; 558 struct reada_extent *re;
563 struct reada_extctl *rec; 559 struct reada_extctl *rec;
564 560
565 re = reada_find_extent(root, logical, top, level); /* takes one ref */ 561 re = reada_find_extent(root, logical, top); /* takes one ref */
566 if (!re) 562 if (!re)
567 return -1; 563 return -1;
568 564
569 rec = kzalloc(sizeof(*rec), GFP_NOFS); 565 rec = kzalloc(sizeof(*rec), GFP_KERNEL);
570 if (!rec) { 566 if (!rec) {
571 reada_extent_put(root->fs_info, re); 567 reada_extent_put(root->fs_info, re);
572 return -ENOMEM; 568 return -ENOMEM;
@@ -662,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
662 u64 logical; 658 u64 logical;
663 int ret; 659 int ret;
664 int i; 660 int i;
665 int need_kick = 0;
666 661
667 spin_lock(&fs_info->reada_lock); 662 spin_lock(&fs_info->reada_lock);
668 if (dev->reada_curr_zone == NULL) { 663 if (dev->reada_curr_zone == NULL) {
@@ -679,7 +674,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
679 */ 674 */
680 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, 675 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
681 dev->reada_next >> PAGE_CACHE_SHIFT, 1); 676 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
682 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { 677 if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
683 ret = reada_pick_zone(dev); 678 ret = reada_pick_zone(dev);
684 if (!ret) { 679 if (!ret) {
685 spin_unlock(&fs_info->reada_lock); 680 spin_unlock(&fs_info->reada_lock);
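
The comparison flips from >= to > because zone->end is inclusive; reada_find_zone() sets it to start + cache->key.offset - 1, and the two lookups earlier in this file gain the matching logical <= zone->end test. With an inclusive bound, a membership helper would look like:

/*
 * zone->end is the last contained byte, not one past it (per the
 * start + size - 1 initialization), so membership uses <= and
 * "ran past the zone" uses >.
 */
static inline bool zone_contains(const struct reada_zone *zone, u64 logical)
{
        return logical >= zone->start && logical <= zone->end;
}
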
@@ -698,6 +693,15 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
698 693
699 spin_unlock(&fs_info->reada_lock); 694 spin_unlock(&fs_info->reada_lock);
700 695
696 spin_lock(&re->lock);
697 if (re->scheduled || list_empty(&re->extctl)) {
698 spin_unlock(&re->lock);
699 reada_extent_put(fs_info, re);
700 return 0;
701 }
702 re->scheduled = 1;
703 spin_unlock(&re->lock);
704
701 /* 705 /*
702 * find mirror num 706 * find mirror num
703 */ 707 */
@@ -709,29 +713,20 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
709 } 713 }
710 logical = re->logical; 714 logical = re->logical;
711 715
712 spin_lock(&re->lock);
713 if (re->scheduled_for == NULL) {
714 re->scheduled_for = dev;
715 need_kick = 1;
716 }
717 spin_unlock(&re->lock);
718
719 reada_extent_put(fs_info, re);
720
721 if (!need_kick)
722 return 0;
723
724 atomic_inc(&dev->reada_in_flight); 716 atomic_inc(&dev->reada_in_flight);
725 ret = reada_tree_block_flagged(fs_info->extent_root, logical, 717 ret = reada_tree_block_flagged(fs_info->extent_root, logical,
726 mirror_num, &eb); 718 mirror_num, &eb);
727 if (ret) 719 if (ret)
728 __readahead_hook(fs_info->extent_root, NULL, logical, ret); 720 __readahead_hook(fs_info, re, NULL, logical, ret);
729 else if (eb) 721 else if (eb)
730 __readahead_hook(fs_info->extent_root, eb, eb->start, ret); 722 __readahead_hook(fs_info, re, eb, eb->start, ret);
731 723
732 if (eb) 724 if (eb)
733 free_extent_buffer(eb); 725 free_extent_buffer(eb);
734 726
727 atomic_dec(&dev->reada_in_flight);
728 reada_extent_put(fs_info, re);
729
735 return 1; 730 return 1;
736 731
737} 732}
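
Scheduling is now claimed before any I/O is issued: whichever caller flips re->scheduled under re->lock owns the read, and the device's reada_in_flight counter brackets the actual submission instead of leaking when the hook never ran. The claim step on its own:

/*
 * Test-and-set under the extent's spinlock: exactly one caller gets to
 * issue the read; everyone else drops its reference and backs off.
 */
spin_lock(&re->lock);
if (re->scheduled || list_empty(&re->extctl)) {
        spin_unlock(&re->lock);
        reada_extent_put(fs_info, re);
        return 0;
}
re->scheduled = 1;
spin_unlock(&re->lock);
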
@@ -752,6 +747,8 @@ static void reada_start_machine_worker(struct btrfs_work *work)
752 set_task_ioprio(current, BTRFS_IOPRIO_READA); 747 set_task_ioprio(current, BTRFS_IOPRIO_READA);
753 __reada_start_machine(fs_info); 748 __reada_start_machine(fs_info);
754 set_task_ioprio(current, old_ioprio); 749 set_task_ioprio(current, old_ioprio);
750
751 atomic_dec(&fs_info->reada_works_cnt);
755} 752}
756 753
757static void __reada_start_machine(struct btrfs_fs_info *fs_info) 754static void __reada_start_machine(struct btrfs_fs_info *fs_info)
@@ -783,15 +780,19 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
783 * enqueue to workers to finish it. This will distribute the load to 780 * enqueue to workers to finish it. This will distribute the load to
784 * the cores. 781 * the cores.
785 */ 782 */
786 for (i = 0; i < 2; ++i) 783 for (i = 0; i < 2; ++i) {
787 reada_start_machine(fs_info); 784 reada_start_machine(fs_info);
785 if (atomic_read(&fs_info->reada_works_cnt) >
786 BTRFS_MAX_MIRRORS * 2)
787 break;
788 }
788} 789}
789 790
790static void reada_start_machine(struct btrfs_fs_info *fs_info) 791static void reada_start_machine(struct btrfs_fs_info *fs_info)
791{ 792{
792 struct reada_machine_work *rmw; 793 struct reada_machine_work *rmw;
793 794
794 rmw = kzalloc(sizeof(*rmw), GFP_NOFS); 795 rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
795 if (!rmw) { 796 if (!rmw) {
796 /* FIXME we cannot handle this properly right now */ 797 /* FIXME we cannot handle this properly right now */
797 BUG(); 798 BUG();
@@ -801,6 +802,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
801 rmw->fs_info = fs_info; 802 rmw->fs_info = fs_info;
802 803
803 btrfs_queue_work(fs_info->readahead_workers, &rmw->work); 804 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
805 atomic_inc(&fs_info->reada_works_cnt);
804} 806}
805 807
806#ifdef DEBUG 808#ifdef DEBUG
@@ -848,10 +850,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
848 if (ret == 0) 850 if (ret == 0)
849 break; 851 break;
850 printk(KERN_DEBUG 852 printk(KERN_DEBUG
851 " re: logical %llu size %u empty %d for %lld", 853 " re: logical %llu size %u empty %d scheduled %d",
852 re->logical, fs_info->tree_root->nodesize, 854 re->logical, fs_info->tree_root->nodesize,
853 list_empty(&re->extctl), re->scheduled_for ? 855 list_empty(&re->extctl), re->scheduled);
854 re->scheduled_for->devid : -1);
855 856
856 for (i = 0; i < re->nzones; ++i) { 857 for (i = 0; i < re->nzones; ++i) {
857 printk(KERN_CONT " zone %llu-%llu devs", 858 printk(KERN_CONT " zone %llu-%llu devs",
@@ -878,27 +879,21 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
878 index, 1); 879 index, 1);
879 if (ret == 0) 880 if (ret == 0)
880 break; 881 break;
881 if (!re->scheduled_for) { 882 if (!re->scheduled) {
882 index = (re->logical >> PAGE_CACHE_SHIFT) + 1; 883 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
883 continue; 884 continue;
884 } 885 }
885 printk(KERN_DEBUG 886 printk(KERN_DEBUG
886 "re: logical %llu size %u list empty %d for %lld", 887 "re: logical %llu size %u list empty %d scheduled %d",
887 re->logical, fs_info->tree_root->nodesize, 888 re->logical, fs_info->tree_root->nodesize,
888 list_empty(&re->extctl), 889 list_empty(&re->extctl), re->scheduled);
889 re->scheduled_for ? re->scheduled_for->devid : -1);
890 for (i = 0; i < re->nzones; ++i) { 890 for (i = 0; i < re->nzones; ++i) {
891 printk(KERN_CONT " zone %llu-%llu devs", 891 printk(KERN_CONT " zone %llu-%llu devs",
892 re->zones[i]->start, 892 re->zones[i]->start,
893 re->zones[i]->end); 893 re->zones[i]->end);
894 for (i = 0; i < re->nzones; ++i) { 894 for (j = 0; j < re->zones[i]->ndevs; ++j) {
895 printk(KERN_CONT " zone %llu-%llu devs", 895 printk(KERN_CONT " %lld",
896 re->zones[i]->start, 896 re->zones[i]->devs[j]->devid);
897 re->zones[i]->end);
898 for (j = 0; j < re->zones[i]->ndevs; ++j) {
899 printk(KERN_CONT " %lld",
900 re->zones[i]->devs[j]->devid);
901 }
902 } 897 }
903 } 898 }
904 printk(KERN_CONT "\n"); 899 printk(KERN_CONT "\n");
@@ -917,7 +912,6 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
917 struct reada_control *rc; 912 struct reada_control *rc;
918 u64 start; 913 u64 start;
919 u64 generation; 914 u64 generation;
920 int level;
921 int ret; 915 int ret;
922 struct extent_buffer *node; 916 struct extent_buffer *node;
923 static struct btrfs_key max_key = { 917 static struct btrfs_key max_key = {
@@ -926,7 +920,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
926 .offset = (u64)-1 920 .offset = (u64)-1
927 }; 921 };
928 922
929 rc = kzalloc(sizeof(*rc), GFP_NOFS); 923 rc = kzalloc(sizeof(*rc), GFP_KERNEL);
930 if (!rc) 924 if (!rc)
931 return ERR_PTR(-ENOMEM); 925 return ERR_PTR(-ENOMEM);
932 926
@@ -940,11 +934,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
940 934
941 node = btrfs_root_node(root); 935 node = btrfs_root_node(root);
942 start = node->start; 936 start = node->start;
943 level = btrfs_header_level(node);
944 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
945 free_extent_buffer(node); 938 free_extent_buffer(node);
946 939
947 ret = reada_add_block(rc, start, &max_key, level, generation); 940 ret = reada_add_block(rc, start, &max_key, generation);
948 if (ret) { 941 if (ret) {
949 kfree(rc); 942 kfree(rc);
950 return ERR_PTR(ret); 943 return ERR_PTR(ret);
@@ -959,8 +952,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
959int btrfs_reada_wait(void *handle) 952int btrfs_reada_wait(void *handle)
960{ 953{
961 struct reada_control *rc = handle; 954 struct reada_control *rc = handle;
955 struct btrfs_fs_info *fs_info = rc->root->fs_info;
962 956
963 while (atomic_read(&rc->elems)) { 957 while (atomic_read(&rc->elems)) {
958 if (!atomic_read(&fs_info->reada_works_cnt))
959 reada_start_machine(fs_info);
964 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 960 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
965 5 * HZ); 961 5 * HZ);
966 dump_devs(rc->root->fs_info, 962 dump_devs(rc->root->fs_info,
@@ -977,9 +973,13 @@ int btrfs_reada_wait(void *handle)
977int btrfs_reada_wait(void *handle) 973int btrfs_reada_wait(void *handle)
978{ 974{
979 struct reada_control *rc = handle; 975 struct reada_control *rc = handle;
976 struct btrfs_fs_info *fs_info = rc->root->fs_info;
980 977
981 while (atomic_read(&rc->elems)) { 978 while (atomic_read(&rc->elems)) {
982 wait_event(rc->wait, atomic_read(&rc->elems) == 0); 979 if (!atomic_read(&fs_info->reada_works_cnt))
980 reada_start_machine(fs_info);
981 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
982 (HZ + 9) / 10);
983 } 983 }
984 984
985 kref_put(&rc->refcnt, reada_control_release); 985 kref_put(&rc->refcnt, reada_control_release);
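
Both btrfs_reada_wait() variants get the same liveness fix: if the queued worker count has drained to zero while extents are still outstanding, the waiter kicks the state machine itself instead of sleeping indefinitely on rc->wait. The resulting loop shape:

/*
 * Poll with a timeout and re-kick the machine when the worker pool has
 * drained; without this a lost wakeup could stall the wait forever.
 */
while (atomic_read(&rc->elems)) {
        if (!atomic_read(&fs_info->reada_works_cnt))
                reada_start_machine(fs_info);
        wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
                           (HZ + 9) / 10);
}
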
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2c849b08a91b..9fcd6dfc3266 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -496,7 +496,7 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
496 struct btrfs_root *root) 496 struct btrfs_root *root)
497{ 497{
498 struct btrfs_root_item *item = &root->root_item; 498 struct btrfs_root_item *item = &root->root_item;
499 struct timespec ct = CURRENT_TIME; 499 struct timespec ct = current_fs_time(root->fs_info->sb);
500 500
501 spin_lock(&root->root_item_lock); 501 spin_lock(&root->root_item_lock);
502 btrfs_set_root_ctransid(item, trans->transid); 502 btrfs_set_root_ctransid(item, trans->transid);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 92bf5ee732fb..39dbdcbf4d13 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -461,7 +461,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
461 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 461 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
462 int ret; 462 int ret;
463 463
464 sctx = kzalloc(sizeof(*sctx), GFP_NOFS); 464 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
465 if (!sctx) 465 if (!sctx)
466 goto nomem; 466 goto nomem;
467 atomic_set(&sctx->refs, 1); 467 atomic_set(&sctx->refs, 1);
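
The GFP_NOFS to GFP_KERNEL conversions throughout scrub.c (and send.c below) follow one rule: GFP_NOFS is only needed where reclaim re-entering the filesystem could deadlock on locks the caller already holds, and these allocations run from ioctl or worker context before any such lock is taken. As a rule of thumb (a sketch, not a kernel API):

/* pick the flag by lock context, not by which file the code lives in */
static inline void *ctx_zalloc(size_t size, bool holds_fs_locks)
{
        return kzalloc(size, holds_fs_locks ? GFP_NOFS : GFP_KERNEL);
}
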
@@ -472,7 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
472 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 472 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
473 struct scrub_bio *sbio; 473 struct scrub_bio *sbio;
474 474
475 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 475 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
476 if (!sbio) 476 if (!sbio)
477 goto nomem; 477 goto nomem;
478 sctx->bios[i] = sbio; 478 sctx->bios[i] = sbio;
@@ -611,7 +611,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
611 u64 flags = 0; 611 u64 flags = 0;
612 u64 ref_root; 612 u64 ref_root;
613 u32 item_size; 613 u32 item_size;
614 u8 ref_level; 614 u8 ref_level = 0;
615 int ret; 615 int ret;
616 616
617 WARN_ON(sblock->page_count < 1); 617 WARN_ON(sblock->page_count < 1);
@@ -1654,7 +1654,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1654again: 1654again:
1655 if (!wr_ctx->wr_curr_bio) { 1655 if (!wr_ctx->wr_curr_bio) {
1656 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), 1656 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1657 GFP_NOFS); 1657 GFP_KERNEL);
1658 if (!wr_ctx->wr_curr_bio) { 1658 if (!wr_ctx->wr_curr_bio) {
1659 mutex_unlock(&wr_ctx->wr_lock); 1659 mutex_unlock(&wr_ctx->wr_lock);
1660 return -ENOMEM; 1660 return -ENOMEM;
@@ -1671,7 +1671,8 @@ again:
1671 sbio->dev = wr_ctx->tgtdev; 1671 sbio->dev = wr_ctx->tgtdev;
1672 bio = sbio->bio; 1672 bio = sbio->bio;
1673 if (!bio) { 1673 if (!bio) {
1674 bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); 1674 bio = btrfs_io_bio_alloc(GFP_KERNEL,
1675 wr_ctx->pages_per_wr_bio);
1675 if (!bio) { 1676 if (!bio) {
1676 mutex_unlock(&wr_ctx->wr_lock); 1677 mutex_unlock(&wr_ctx->wr_lock);
1677 return -ENOMEM; 1678 return -ENOMEM;
@@ -2076,7 +2077,8 @@ again:
2076 sbio->dev = spage->dev; 2077 sbio->dev = spage->dev;
2077 bio = sbio->bio; 2078 bio = sbio->bio;
2078 if (!bio) { 2079 if (!bio) {
2079 bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); 2080 bio = btrfs_io_bio_alloc(GFP_KERNEL,
2081 sctx->pages_per_rd_bio);
2080 if (!bio) 2082 if (!bio)
2081 return -ENOMEM; 2083 return -ENOMEM;
2082 sbio->bio = bio; 2084 sbio->bio = bio;
@@ -2241,7 +2243,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2241 struct scrub_block *sblock; 2243 struct scrub_block *sblock;
2242 int index; 2244 int index;
2243 2245
2244 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 2246 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2245 if (!sblock) { 2247 if (!sblock) {
2246 spin_lock(&sctx->stat_lock); 2248 spin_lock(&sctx->stat_lock);
2247 sctx->stat.malloc_errors++; 2249 sctx->stat.malloc_errors++;
@@ -2259,7 +2261,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2259 struct scrub_page *spage; 2261 struct scrub_page *spage;
2260 u64 l = min_t(u64, len, PAGE_SIZE); 2262 u64 l = min_t(u64, len, PAGE_SIZE);
2261 2263
2262 spage = kzalloc(sizeof(*spage), GFP_NOFS); 2264 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2263 if (!spage) { 2265 if (!spage) {
2264leave_nomem: 2266leave_nomem:
2265 spin_lock(&sctx->stat_lock); 2267 spin_lock(&sctx->stat_lock);
@@ -2286,7 +2288,7 @@ leave_nomem:
2286 spage->have_csum = 0; 2288 spage->have_csum = 0;
2287 } 2289 }
2288 sblock->page_count++; 2290 sblock->page_count++;
2289 spage->page = alloc_page(GFP_NOFS); 2291 spage->page = alloc_page(GFP_KERNEL);
2290 if (!spage->page) 2292 if (!spage->page)
2291 goto leave_nomem; 2293 goto leave_nomem;
2292 len -= l; 2294 len -= l;
@@ -2541,7 +2543,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
2541 struct scrub_block *sblock; 2543 struct scrub_block *sblock;
2542 int index; 2544 int index;
2543 2545
2544 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 2546 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2545 if (!sblock) { 2547 if (!sblock) {
2546 spin_lock(&sctx->stat_lock); 2548 spin_lock(&sctx->stat_lock);
2547 sctx->stat.malloc_errors++; 2549 sctx->stat.malloc_errors++;
@@ -2561,7 +2563,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
2561 struct scrub_page *spage; 2563 struct scrub_page *spage;
2562 u64 l = min_t(u64, len, PAGE_SIZE); 2564 u64 l = min_t(u64, len, PAGE_SIZE);
2563 2565
2564 spage = kzalloc(sizeof(*spage), GFP_NOFS); 2566 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2565 if (!spage) { 2567 if (!spage) {
2566leave_nomem: 2568leave_nomem:
2567 spin_lock(&sctx->stat_lock); 2569 spin_lock(&sctx->stat_lock);
@@ -2591,7 +2593,7 @@ leave_nomem:
2591 spage->have_csum = 0; 2593 spage->have_csum = 0;
2592 } 2594 }
2593 sblock->page_count++; 2595 sblock->page_count++;
2594 spage->page = alloc_page(GFP_NOFS); 2596 spage->page = alloc_page(GFP_KERNEL);
2595 if (!spage->page) 2597 if (!spage->page)
2596 goto leave_nomem; 2598 goto leave_nomem;
2597 len -= l; 2599 len -= l;
@@ -3857,16 +3859,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3857 return -EIO; 3859 return -EIO;
3858 } 3860 }
3859 3861
3860 btrfs_dev_replace_lock(&fs_info->dev_replace); 3862 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
3861 if (dev->scrub_device || 3863 if (dev->scrub_device ||
3862 (!is_dev_replace && 3864 (!is_dev_replace &&
3863 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 3865 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3864 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3866 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3865 mutex_unlock(&fs_info->scrub_lock); 3867 mutex_unlock(&fs_info->scrub_lock);
3866 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3868 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3867 return -EINPROGRESS; 3869 return -EINPROGRESS;
3868 } 3870 }
3869 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3871 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3870 3872
3871 ret = scrub_workers_get(fs_info, is_dev_replace); 3873 ret = scrub_workers_get(fs_info, is_dev_replace);
3872 if (ret) { 3874 if (ret) {
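
btrfs_dev_replace_lock() and its unlock counterpart grow an extra argument in this series (the rework itself lives in dev-replace.c, see the diffstat); judging from these call sites, 0 takes the shared side, so readers such as scrub and readahead no longer serialize against each other. Assumed usage:

/* assumed semantics: 0 = shared (read) section, 1 = exclusive (write) */
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
busy = dev->scrub_device ||
       (!is_dev_replace &&
        btrfs_dev_replace_is_ongoing(&fs_info->dev_replace));
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
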
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 63a6152be04b..19b7bf4284ee 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -34,6 +34,7 @@
34#include "disk-io.h" 34#include "disk-io.h"
35#include "btrfs_inode.h" 35#include "btrfs_inode.h"
36#include "transaction.h" 36#include "transaction.h"
37#include "compression.h"
37 38
38static int g_verbose = 0; 39static int g_verbose = 0;
39 40
@@ -304,7 +305,7 @@ static struct fs_path *fs_path_alloc(void)
304{ 305{
305 struct fs_path *p; 306 struct fs_path *p;
306 307
307 p = kmalloc(sizeof(*p), GFP_NOFS); 308 p = kmalloc(sizeof(*p), GFP_KERNEL);
308 if (!p) 309 if (!p)
309 return NULL; 310 return NULL;
310 p->reversed = 0; 311 p->reversed = 0;
@@ -363,11 +364,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
363 * First time the inline_buf does not suffice 364 * First time the inline_buf does not suffice
364 */ 365 */
365 if (p->buf == p->inline_buf) { 366 if (p->buf == p->inline_buf) {
366 tmp_buf = kmalloc(len, GFP_NOFS); 367 tmp_buf = kmalloc(len, GFP_KERNEL);
367 if (tmp_buf) 368 if (tmp_buf)
368 memcpy(tmp_buf, p->buf, old_buf_len); 369 memcpy(tmp_buf, p->buf, old_buf_len);
369 } else { 370 } else {
370 tmp_buf = krealloc(p->buf, len, GFP_NOFS); 371 tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
371 } 372 }
372 if (!tmp_buf) 373 if (!tmp_buf)
373 return -ENOMEM; 374 return -ENOMEM;
@@ -995,7 +996,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 * values are small. 996 * values are small.
996 */ 997 */
997 buf_len = PATH_MAX; 998 buf_len = PATH_MAX;
998 buf = kmalloc(buf_len, GFP_NOFS); 999 buf = kmalloc(buf_len, GFP_KERNEL);
999 if (!buf) { 1000 if (!buf) {
1000 ret = -ENOMEM; 1001 ret = -ENOMEM;
1001 goto out; 1002 goto out;
@@ -1042,7 +1043,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1042 buf = NULL; 1043 buf = NULL;
1043 } else { 1044 } else {
1044 char *tmp = krealloc(buf, buf_len, 1045 char *tmp = krealloc(buf, buf_len,
1045 GFP_NOFS | __GFP_NOWARN); 1046 GFP_KERNEL | __GFP_NOWARN);
1046 1047
1047 if (!tmp) 1048 if (!tmp)
1048 kfree(buf); 1049 kfree(buf);
@@ -1303,7 +1304,7 @@ static int find_extent_clone(struct send_ctx *sctx,
1303 /* We only use this path under the commit sem */ 1304 /* We only use this path under the commit sem */
1304 tmp_path->need_commit_sem = 0; 1305 tmp_path->need_commit_sem = 0;
1305 1306
1306 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); 1307 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
1307 if (!backref_ctx) { 1308 if (!backref_ctx) {
1308 ret = -ENOMEM; 1309 ret = -ENOMEM;
1309 goto out; 1310 goto out;
@@ -1984,7 +1985,7 @@ static int name_cache_insert(struct send_ctx *sctx,
1984 nce_head = radix_tree_lookup(&sctx->name_cache, 1985 nce_head = radix_tree_lookup(&sctx->name_cache,
1985 (unsigned long)nce->ino); 1986 (unsigned long)nce->ino);
1986 if (!nce_head) { 1987 if (!nce_head) {
1987 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); 1988 nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
1988 if (!nce_head) { 1989 if (!nce_head) {
1989 kfree(nce); 1990 kfree(nce);
1990 return -ENOMEM; 1991 return -ENOMEM;
@@ -2179,7 +2180,7 @@ out_cache:
2179 /* 2180 /*
2180 * Store the result of the lookup in the name cache. 2181 * Store the result of the lookup in the name cache.
2181 */ 2182 */
2182 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); 2183 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
2183 if (!nce) { 2184 if (!nce) {
2184 ret = -ENOMEM; 2185 ret = -ENOMEM;
2185 goto out; 2186 goto out;
@@ -2315,7 +2316,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
2315 if (!path) 2316 if (!path)
2316 return -ENOMEM; 2317 return -ENOMEM;
2317 2318
2318 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS); 2319 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
2319 if (!name) { 2320 if (!name) {
2320 btrfs_free_path(path); 2321 btrfs_free_path(path);
2321 return -ENOMEM; 2322 return -ENOMEM;
@@ -2730,7 +2731,7 @@ static int __record_ref(struct list_head *head, u64 dir,
2730{ 2731{
2731 struct recorded_ref *ref; 2732 struct recorded_ref *ref;
2732 2733
2733 ref = kmalloc(sizeof(*ref), GFP_NOFS); 2734 ref = kmalloc(sizeof(*ref), GFP_KERNEL);
2734 if (!ref) 2735 if (!ref)
2735 return -ENOMEM; 2736 return -ENOMEM;
2736 2737
@@ -2755,7 +2756,7 @@ static int dup_ref(struct recorded_ref *ref, struct list_head *list)
2755{ 2756{
2756 struct recorded_ref *new; 2757 struct recorded_ref *new;
2757 2758
2758 new = kmalloc(sizeof(*ref), GFP_NOFS); 2759 new = kmalloc(sizeof(*ref), GFP_KERNEL);
2759 if (!new) 2760 if (!new)
2760 return -ENOMEM; 2761 return -ENOMEM;
2761 2762
@@ -2818,7 +2819,7 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2818 struct rb_node *parent = NULL; 2819 struct rb_node *parent = NULL;
2819 struct orphan_dir_info *entry, *odi; 2820 struct orphan_dir_info *entry, *odi;
2820 2821
2821 odi = kmalloc(sizeof(*odi), GFP_NOFS); 2822 odi = kmalloc(sizeof(*odi), GFP_KERNEL);
2822 if (!odi) 2823 if (!odi)
2823 return ERR_PTR(-ENOMEM); 2824 return ERR_PTR(-ENOMEM);
2824 odi->ino = dir_ino; 2825 odi->ino = dir_ino;
@@ -2973,7 +2974,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
2973 struct rb_node *parent = NULL; 2974 struct rb_node *parent = NULL;
2974 struct waiting_dir_move *entry, *dm; 2975 struct waiting_dir_move *entry, *dm;
2975 2976
2976 dm = kmalloc(sizeof(*dm), GFP_NOFS); 2977 dm = kmalloc(sizeof(*dm), GFP_KERNEL);
2977 if (!dm) 2978 if (!dm)
2978 return -ENOMEM; 2979 return -ENOMEM;
2979 dm->ino = ino; 2980 dm->ino = ino;
@@ -3040,7 +3041,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
3040 int exists = 0; 3041 int exists = 0;
3041 int ret; 3042 int ret;
3042 3043
3043 pm = kmalloc(sizeof(*pm), GFP_NOFS); 3044 pm = kmalloc(sizeof(*pm), GFP_KERNEL);
3044 if (!pm) 3045 if (!pm)
3045 return -ENOMEM; 3046 return -ENOMEM;
3046 pm->parent_ino = parent_ino; 3047 pm->parent_ino = parent_ino;
@@ -4280,7 +4281,7 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
4280 strncmp(name, ctx->name, name_len) == 0) { 4281 strncmp(name, ctx->name, name_len) == 0) {
4281 ctx->found_idx = num; 4282 ctx->found_idx = num;
4282 ctx->found_data_len = data_len; 4283 ctx->found_data_len = data_len;
4283 ctx->found_data = kmemdup(data, data_len, GFP_NOFS); 4284 ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
4284 if (!ctx->found_data) 4285 if (!ctx->found_data)
4285 return -ENOMEM; 4286 return -ENOMEM;
4286 return 1; 4287 return 1;
@@ -4481,7 +4482,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
4481 while (index <= last_index) { 4482 while (index <= last_index) {
4482 unsigned cur_len = min_t(unsigned, len, 4483 unsigned cur_len = min_t(unsigned, len,
4483 PAGE_CACHE_SIZE - pg_offset); 4484 PAGE_CACHE_SIZE - pg_offset);
4484 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 4485 page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
4485 if (!page) { 4486 if (!page) {
4486 ret = -ENOMEM; 4487 ret = -ENOMEM;
4487 break; 4488 break;
@@ -5989,7 +5990,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5989 goto out; 5990 goto out;
5990 } 5991 }
5991 5992
5992 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); 5993 sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
5993 if (!sctx) { 5994 if (!sctx) {
5994 ret = -ENOMEM; 5995 ret = -ENOMEM;
5995 goto out; 5996 goto out;
@@ -5997,7 +5998,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5997 5998
5998 INIT_LIST_HEAD(&sctx->new_refs); 5999 INIT_LIST_HEAD(&sctx->new_refs);
5999 INIT_LIST_HEAD(&sctx->deleted_refs); 6000 INIT_LIST_HEAD(&sctx->deleted_refs);
6000 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); 6001 INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
6001 INIT_LIST_HEAD(&sctx->name_cache_list); 6002 INIT_LIST_HEAD(&sctx->name_cache_list);
6002 6003
6003 sctx->flags = arg->flags; 6004 sctx->flags = arg->flags;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d41e09fe8e38..00b8f37cc306 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -303,7 +303,8 @@ enum {
303 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, 303 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
304 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, 304 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
305 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, 305 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
306 Opt_datasum, Opt_treelog, Opt_noinode_cache, 306 Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
307 Opt_nologreplay, Opt_norecovery,
307#ifdef CONFIG_BTRFS_DEBUG 308#ifdef CONFIG_BTRFS_DEBUG
308 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, 309 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
309#endif 310#endif
@@ -335,6 +336,8 @@ static const match_table_t tokens = {
335 {Opt_noacl, "noacl"}, 336 {Opt_noacl, "noacl"},
336 {Opt_notreelog, "notreelog"}, 337 {Opt_notreelog, "notreelog"},
337 {Opt_treelog, "treelog"}, 338 {Opt_treelog, "treelog"},
339 {Opt_nologreplay, "nologreplay"},
340 {Opt_norecovery, "norecovery"},
338 {Opt_flushoncommit, "flushoncommit"}, 341 {Opt_flushoncommit, "flushoncommit"},
339 {Opt_noflushoncommit, "noflushoncommit"}, 342 {Opt_noflushoncommit, "noflushoncommit"},
340 {Opt_ratio, "metadata_ratio=%d"}, 343 {Opt_ratio, "metadata_ratio=%d"},
@@ -352,7 +355,8 @@ static const match_table_t tokens = {
352 {Opt_inode_cache, "inode_cache"}, 355 {Opt_inode_cache, "inode_cache"},
353 {Opt_noinode_cache, "noinode_cache"}, 356 {Opt_noinode_cache, "noinode_cache"},
354 {Opt_no_space_cache, "nospace_cache"}, 357 {Opt_no_space_cache, "nospace_cache"},
355 {Opt_recovery, "recovery"}, 358 {Opt_recovery, "recovery"}, /* deprecated */
359 {Opt_usebackuproot, "usebackuproot"},
356 {Opt_skip_balance, "skip_balance"}, 360 {Opt_skip_balance, "skip_balance"},
357 {Opt_check_integrity, "check_int"}, 361 {Opt_check_integrity, "check_int"},
358 {Opt_check_integrity_including_extent_data, "check_int_data"}, 362 {Opt_check_integrity_including_extent_data, "check_int_data"},
@@ -373,7 +377,8 @@ static const match_table_t tokens = {
373 * reading in a new superblock is parsed here. 377 * reading in a new superblock is parsed here.
374 * XXX JDM: This needs to be cleaned up for remount. 378 * XXX JDM: This needs to be cleaned up for remount.
375 */ 379 */
376int btrfs_parse_options(struct btrfs_root *root, char *options) 380int btrfs_parse_options(struct btrfs_root *root, char *options,
381 unsigned long new_flags)
377{ 382{
378 struct btrfs_fs_info *info = root->fs_info; 383 struct btrfs_fs_info *info = root->fs_info;
379 substring_t args[MAX_OPT_ARGS]; 384 substring_t args[MAX_OPT_ARGS];
@@ -393,8 +398,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
393 else if (cache_gen) 398 else if (cache_gen)
394 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 399 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
395 400
401 /*
402 * Even if the options are empty, we still need to do the extra check
403 * against the new flags
404 */
396 if (!options) 405 if (!options)
397 goto out; 406 goto check;
398 407
399 /* 408 /*
400 * strsep changes the string, duplicate it because parse_options 409 * strsep changes the string, duplicate it because parse_options
@@ -606,6 +615,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
606 btrfs_clear_and_info(root, NOTREELOG, 615 btrfs_clear_and_info(root, NOTREELOG,
607 "enabling tree log"); 616 "enabling tree log");
608 break; 617 break;
618 case Opt_norecovery:
619 case Opt_nologreplay:
620 btrfs_set_and_info(root, NOLOGREPLAY,
621 "disabling log replay at mount time");
622 break;
609 case Opt_flushoncommit: 623 case Opt_flushoncommit:
610 btrfs_set_and_info(root, FLUSHONCOMMIT, 624 btrfs_set_and_info(root, FLUSHONCOMMIT,
611 "turning on flush-on-commit"); 625 "turning on flush-on-commit");
@@ -696,8 +710,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
696 "disabling auto defrag"); 710 "disabling auto defrag");
697 break; 711 break;
698 case Opt_recovery: 712 case Opt_recovery:
699 btrfs_info(root->fs_info, "enabling auto recovery"); 713 btrfs_warn(root->fs_info,
700 btrfs_set_opt(info->mount_opt, RECOVERY); 714 "'recovery' is deprecated, use 'usebackuproot' instead");
715 case Opt_usebackuproot:
716 btrfs_info(root->fs_info,
717 "trying to use backup root at mount time");
718 btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
701 break; 719 break;
702 case Opt_skip_balance: 720 case Opt_skip_balance:
703 btrfs_set_opt(info->mount_opt, SKIP_BALANCE); 721 btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
@@ -792,6 +810,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
792 break; 810 break;
793 } 811 }
794 } 812 }
813check:
814 /*
815 * Extra check of the current options against the new flags
816 */
817 if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
818 btrfs_err(root->fs_info,
819 "nologreplay must be used with ro mount option");
820 ret = -EINVAL;
821 }
795out: 822out:
796 if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && 823 if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
797 !btrfs_test_opt(root, FREE_SPACE_TREE) && 824 !btrfs_test_opt(root, FREE_SPACE_TREE) &&
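
Threading new_flags into btrfs_parse_options() lets a single check cover mount and remount alike, which is why the empty-options case above now jumps to the check label instead of out: a plain 'mount -o remount,rw' carries no option string but must still be refused on a nologreplay filesystem. The remount caller, shown further down, simply forwards the prospective flags:

/*
 * Updated remount call site: *flags holds the flags about to take
 * effect, MS_RDONLY being the vfs read-only bit the check tests.
 */
ret = btrfs_parse_options(root, data, *flags);
if (ret)
        goto restore;
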
@@ -1202,6 +1229,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1202 seq_puts(seq, ",ssd"); 1229 seq_puts(seq, ",ssd");
1203 if (btrfs_test_opt(root, NOTREELOG)) 1230 if (btrfs_test_opt(root, NOTREELOG))
1204 seq_puts(seq, ",notreelog"); 1231 seq_puts(seq, ",notreelog");
1232 if (btrfs_test_opt(root, NOLOGREPLAY))
1233 seq_puts(seq, ",nologreplay");
1205 if (btrfs_test_opt(root, FLUSHONCOMMIT)) 1234 if (btrfs_test_opt(root, FLUSHONCOMMIT))
1206 seq_puts(seq, ",flushoncommit"); 1235 seq_puts(seq, ",flushoncommit");
1207 if (btrfs_test_opt(root, DISCARD)) 1236 if (btrfs_test_opt(root, DISCARD))
@@ -1228,8 +1257,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1228 seq_puts(seq, ",inode_cache"); 1257 seq_puts(seq, ",inode_cache");
1229 if (btrfs_test_opt(root, SKIP_BALANCE)) 1258 if (btrfs_test_opt(root, SKIP_BALANCE))
1230 seq_puts(seq, ",skip_balance"); 1259 seq_puts(seq, ",skip_balance");
1231 if (btrfs_test_opt(root, RECOVERY))
1232 seq_puts(seq, ",recovery");
1233#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1260#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1234 if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) 1261 if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
1235 seq_puts(seq, ",check_int_data"); 1262 seq_puts(seq, ",check_int_data");
@@ -1685,7 +1712,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1685 } 1712 }
1686 } 1713 }
1687 1714
1688 ret = btrfs_parse_options(root, data); 1715 ret = btrfs_parse_options(root, data, *flags);
1689 if (ret) { 1716 if (ret) {
1690 ret = -EINVAL; 1717 ret = -EINVAL;
1691 goto restore; 1718 goto restore;
@@ -2163,6 +2190,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
2163 break; 2190 break;
2164 ret = !(fs_devices->num_devices == fs_devices->total_devices); 2191 ret = !(fs_devices->num_devices == fs_devices->total_devices);
2165 break; 2192 break;
2193 case BTRFS_IOC_GET_SUPPORTED_FEATURES:
2194 ret = btrfs_ioctl_get_supported_features((void __user*)arg);
2195 break;
2166 } 2196 }
2167 2197
2168 kfree(vol); 2198 kfree(vol);
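
Handling BTRFS_IOC_GET_SUPPORTED_FEATURES on /dev/btrfs-control means tools can query the kernel's feature support with no filesystem mounted (e.g. mkfs deciding which incompat bits it may set). A hypothetical userspace probe, assuming the uapi layout of three btrfs_ioctl_feature_flags entries (supported, safe to set, safe to clear, per the INIT_FEATURE_FLAGS initializers above):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>        /* BTRFS_IOC_GET_SUPPORTED_FEATURES */

int main(void)
{
        struct btrfs_ioctl_feature_flags flags[3];
        int fd = open("/dev/btrfs-control", O_RDONLY);

        if (fd < 0 || ioctl(fd, BTRFS_IOC_GET_SUPPORTED_FEATURES, flags))
                return 1;
        printf("supported incompat bits: 0x%llx\n",
               (unsigned long long)flags[0].incompat_flags);
        return 0;
}
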
@@ -2261,7 +2291,7 @@ static void btrfs_interface_exit(void)
2261 misc_deregister(&btrfs_misc); 2291 misc_deregister(&btrfs_misc);
2262} 2292}
2263 2293
2264static void btrfs_print_info(void) 2294static void btrfs_print_mod_info(void)
2265{ 2295{
2266 printk(KERN_INFO "Btrfs loaded" 2296 printk(KERN_INFO "Btrfs loaded"
2267#ifdef CONFIG_BTRFS_DEBUG 2297#ifdef CONFIG_BTRFS_DEBUG
@@ -2363,7 +2393,7 @@ static int __init init_btrfs_fs(void)
2363 2393
2364 btrfs_init_lockdep(); 2394 btrfs_init_lockdep();
2365 2395
2366 btrfs_print_info(); 2396 btrfs_print_mod_info();
2367 2397
2368 err = btrfs_run_sanity_tests(); 2398 err = btrfs_run_sanity_tests();
2369 if (err) 2399 if (err)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 0e1e61a7ec23..f54bf450bad3 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -137,7 +137,6 @@ static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
137 void **slot; 137 void **slot;
138 138
139 spin_lock(&fs_info->buffer_lock); 139 spin_lock(&fs_info->buffer_lock);
140restart:
141 radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { 140 radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
142 struct extent_buffer *eb; 141 struct extent_buffer *eb;
143 142
@@ -147,7 +146,7 @@ restart:
147 /* Shouldn't happen but that kind of thinking creates CVE's */ 146 /* Shouldn't happen but that kind of thinking creates CVE's */
148 if (radix_tree_exception(eb)) { 147 if (radix_tree_exception(eb)) {
149 if (radix_tree_deref_retry(eb)) 148 if (radix_tree_deref_retry(eb))
150 goto restart; 149 slot = radix_tree_iter_retry(&iter);
151 continue; 150 continue;
152 } 151 }
153 spin_unlock(&fs_info->buffer_lock); 152 spin_unlock(&fs_info->buffer_lock);
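
The removed restart label rescanned the whole tree from index 0 on every deref retry, all while holding fs_info->buffer_lock; radix_tree_iter_retry(), new around this kernel, rewinds the iterator so only the current index is retried. Usage shape:

radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
        struct extent_buffer *eb = radix_tree_deref_slot(slot);

        if (radix_tree_exception(eb)) {
                if (radix_tree_deref_retry(eb))
                        /* retry this slot in place instead of a full
                         * goto-restart rescan from index 0 */
                        slot = radix_tree_iter_retry(&iter);
                continue;
        }
        /* ... operate on eb ... */
}
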
@@ -189,12 +188,6 @@ btrfs_alloc_dummy_block_group(unsigned long length)
189 kfree(cache); 188 kfree(cache);
190 return NULL; 189 return NULL;
191 } 190 }
192 cache->fs_info = btrfs_alloc_dummy_fs_info();
193 if (!cache->fs_info) {
194 kfree(cache->free_space_ctl);
195 kfree(cache);
196 return NULL;
197 }
198 191
199 cache->key.objectid = 0; 192 cache->key.objectid = 0;
200 cache->key.offset = length; 193 cache->key.offset = length;
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index d05fe1ab4808..7cea4462acd5 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -485,6 +485,7 @@ static int run_test(test_func_t test_func, int bitmaps)
485 cache->bitmap_low_thresh = 0; 485 cache->bitmap_low_thresh = 0;
486 cache->bitmap_high_thresh = (u32)-1; 486 cache->bitmap_high_thresh = (u32)-1;
487 cache->needs_free_space = 1; 487 cache->needs_free_space = 1;
488 cache->fs_info = root->fs_info;
488 489
489 btrfs_init_dummy_trans(&trans); 490 btrfs_init_dummy_trans(&trans);
490 491
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index e2d3da02deee..863a6a3af1f8 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -22,6 +22,7 @@
22#include "../disk-io.h" 22#include "../disk-io.h"
23#include "../extent_io.h" 23#include "../extent_io.h"
24#include "../volumes.h" 24#include "../volumes.h"
25#include "../compression.h"
25 26
26static void insert_extent(struct btrfs_root *root, u64 start, u64 len, 27static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
27 u64 ram_bytes, u64 offset, u64 disk_bytenr, 28 u64 ram_bytes, u64 offset, u64 disk_bytenr,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b6031ce474f7..43885e51b882 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -637,6 +637,8 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
637 637
638 trans->block_rsv = &root->fs_info->trans_block_rsv; 638 trans->block_rsv = &root->fs_info->trans_block_rsv;
639 trans->bytes_reserved = num_bytes; 639 trans->bytes_reserved = num_bytes;
640 trace_btrfs_space_reservation(root->fs_info, "transaction",
641 trans->transid, num_bytes, 1);
640 642
641 return trans; 643 return trans;
642} 644}
@@ -1333,7 +1335,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1333 struct dentry *dentry; 1335 struct dentry *dentry;
1334 struct extent_buffer *tmp; 1336 struct extent_buffer *tmp;
1335 struct extent_buffer *old; 1337 struct extent_buffer *old;
1336 struct timespec cur_time = CURRENT_TIME; 1338 struct timespec cur_time;
1337 int ret = 0; 1339 int ret = 0;
1338 u64 to_reserve = 0; 1340 u64 to_reserve = 0;
1339 u64 index = 0; 1341 u64 index = 0;
@@ -1375,12 +1377,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1375 rsv = trans->block_rsv; 1377 rsv = trans->block_rsv;
1376 trans->block_rsv = &pending->block_rsv; 1378 trans->block_rsv = &pending->block_rsv;
1377 trans->bytes_reserved = trans->block_rsv->reserved; 1379 trans->bytes_reserved = trans->block_rsv->reserved;
1378 1380 trace_btrfs_space_reservation(root->fs_info, "transaction",
1381 trans->transid,
1382 trans->bytes_reserved, 1);
1379 dentry = pending->dentry; 1383 dentry = pending->dentry;
1380 parent_inode = pending->dir; 1384 parent_inode = pending->dir;
1381 parent_root = BTRFS_I(parent_inode)->root; 1385 parent_root = BTRFS_I(parent_inode)->root;
1382 record_root_in_trans(trans, parent_root); 1386 record_root_in_trans(trans, parent_root);
1383 1387
1388 cur_time = current_fs_time(parent_inode->i_sb);
1389
1384 /* 1390 /*
1385 * insert the directory item 1391 * insert the directory item
1386 */ 1392 */
@@ -1523,7 +1529,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1523 1529
1524 btrfs_i_size_write(parent_inode, parent_inode->i_size + 1530 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1525 dentry->d_name.len * 2); 1531 dentry->d_name.len * 2);
1526 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 1532 parent_inode->i_mtime = parent_inode->i_ctime =
1533 current_fs_time(parent_inode->i_sb);
1527 ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); 1534 ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
1528 if (ret) { 1535 if (ret) {
1529 btrfs_abort_transaction(trans, root, ret); 1536 btrfs_abort_transaction(trans, root, ret);
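The transaction.c hunks above replace the fixed-resolution CURRENT_TIME macro
with current_fs_time(sb), which truncates the raw clock to the granularity the
filesystem declares in sb->s_time_gran; note that the snapshot path samples
cur_time from the parent inode's superblock. A one-helper sketch of the idiom
(the helper name is hypothetical):

    /* stamp ctime at the owning filesystem's timestamp granularity */
    static void stamp_ctime(struct inode *inode)
    {
            inode->i_ctime = current_fs_time(inode->i_sb);
    }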
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 978c3a810893..24d03c751149 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -26,6 +26,7 @@
26#include "print-tree.h" 26#include "print-tree.h"
27#include "backref.h" 27#include "backref.h"
28#include "hash.h" 28#include "hash.h"
29#include "compression.h"
29 30
30/* magic values for the inode_only field in btrfs_log_inode: 31/* magic values for the inode_only field in btrfs_log_inode:
31 * 32 *
@@ -1045,7 +1046,7 @@ again:
1045 1046
1046 /* 1047 /*
1047 * NOTE: we have searched root tree and checked the 1048 * NOTE: we have searched root tree and checked the
1048 * coresponding ref, it does not need to check again. 1049 * corresponding ref, it does not need to check again.
1049 */ 1050 */
1050 *search_done = 1; 1051 *search_done = 1;
1051 } 1052 }
@@ -4500,7 +4501,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4500 4501
4501 mutex_lock(&BTRFS_I(inode)->log_mutex); 4502 mutex_lock(&BTRFS_I(inode)->log_mutex);
4502 4503
4503 btrfs_get_logged_extents(inode, &logged_list, start, end); 4504 /*
4505 * Collect ordered extents only if we are logging data. This is to
4506 * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
4507 * will process the ordered extents if they still exist at the time,
4508 * because when we collect them we test and set the flag
4509 * BTRFS_ORDERED_LOGGED to prevent multiple log requests from processing the
4510 * same ordered extents. The consequence for the LOG_INODE_ALL log mode
4511 * not processing the ordered extents is that we end up logging the
4512 * corresponding file extent items, based on the extent maps in the
4513 * inode's extent_map_tree's modified_list, without logging the
4514 * respective checksums (since they may still be only attached to the
4515 * ordered extents and have not been inserted in the csum tree by
4516 * btrfs_finish_ordered_io() yet).
4517 */
4518 if (inode_only == LOG_INODE_ALL)
4519 btrfs_get_logged_extents(inode, &logged_list, start, end);
4504 4520
4505 /* 4521 /*
4506 * a brute force approach to making sure we get the most uptodate 4522 * a brute force approach to making sure we get the most uptodate
@@ -4772,6 +4788,42 @@ out_unlock:
4772} 4788}
4773 4789
4774/* 4790/*
4791 * Check if we must fallback to a transaction commit when logging an inode.
4792 * This must be called after logging the inode and is used only in the context
4793 * when fsyncing an inode requires logging some other inode - in which
4794 * case we can't lock the i_mutex of each other inode we need to log as that
4795 * can lead to deadlocks with concurrent fsync against other inodes (as we can
4796 * log inodes up or down in the hierarchy) or rename operations for example. So
4797 * we take the log_mutex of the inode after we have logged it and then check for
4798 * its last_unlink_trans value - this is safe because any task setting
4799 * last_unlink_trans must take the log_mutex and it must do this before it does
4800 * the actual unlink operation, so if we do this check before a concurrent task
4801 * sets last_unlink_trans it means we've logged a consistent version/state of
4802 * all the inode items, otherwise we are not sure and must do a transaction
4803 * commit (the concurrent task might have only updated last_unlink_trans before
4804 * we logged the inode or it might have also done the unlink).
4805 */
4806static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
4807 struct inode *inode)
4808{
4809 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4810 bool ret = false;
4811
4812 mutex_lock(&BTRFS_I(inode)->log_mutex);
4813 if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
4814 /*
4815 * Make sure any commits to the log are forced to be full
4816 * commits.
4817 */
4818 btrfs_set_log_full_commit(fs_info, trans);
4819 ret = true;
4820 }
4821 mutex_unlock(&BTRFS_I(inode)->log_mutex);
4822
4823 return ret;
4824}
4825
4826/*
4775 * follow the dentry parent pointers up the chain and see if any 4827 * follow the dentry parent pointers up the chain and see if any
4776 * of the directories in it require a full commit before they can 4828 * of the directories in it require a full commit before they can
4777 * be logged. Returns zero if nothing special needs to be done or 1 if 4829 * be logged. Returns zero if nothing special needs to be done or 1 if
@@ -4784,7 +4836,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4784 u64 last_committed) 4836 u64 last_committed)
4785{ 4837{
4786 int ret = 0; 4838 int ret = 0;
4787 struct btrfs_root *root;
4788 struct dentry *old_parent = NULL; 4839 struct dentry *old_parent = NULL;
4789 struct inode *orig_inode = inode; 4840 struct inode *orig_inode = inode;
4790 4841
@@ -4816,14 +4867,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4816 BTRFS_I(inode)->logged_trans = trans->transid; 4867 BTRFS_I(inode)->logged_trans = trans->transid;
4817 smp_mb(); 4868 smp_mb();
4818 4869
4819 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 4870 if (btrfs_must_commit_transaction(trans, inode)) {
4820 root = BTRFS_I(inode)->root;
4821
4822 /*
4823 * make sure any commits to the log are forced
4824 * to be full commits
4825 */
4826 btrfs_set_log_full_commit(root->fs_info, trans);
4827 ret = 1; 4871 ret = 1;
4828 break; 4872 break;
4829 } 4873 }
@@ -4982,6 +5026,9 @@ process_leaf:
4982 btrfs_release_path(path); 5026 btrfs_release_path(path);
4983 ret = btrfs_log_inode(trans, root, di_inode, 5027 ret = btrfs_log_inode(trans, root, di_inode,
4984 log_mode, 0, LLONG_MAX, ctx); 5028 log_mode, 0, LLONG_MAX, ctx);
5029 if (!ret &&
5030 btrfs_must_commit_transaction(trans, di_inode))
5031 ret = 1;
4985 iput(di_inode); 5032 iput(di_inode);
4986 if (ret) 5033 if (ret)
4987 goto next_dir_inode; 5034 goto next_dir_inode;
@@ -5096,6 +5143,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5096 5143
5097 ret = btrfs_log_inode(trans, root, dir_inode, 5144 ret = btrfs_log_inode(trans, root, dir_inode,
5098 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5145 LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5146 if (!ret &&
5147 btrfs_must_commit_transaction(trans, dir_inode))
5148 ret = 1;
5099 iput(dir_inode); 5149 iput(dir_inode);
5100 if (ret) 5150 if (ret)
5101 goto out; 5151 goto out;
@@ -5447,6 +5497,9 @@ error:
5447 * They revolve around files that were unlinked from the directory, and 5497 * They revolve around files that were unlinked from the directory, and
5448 * this function updates the parent directory so that a full commit is 5498 * this function updates the parent directory so that a full commit is
5449 * properly done if it is fsync'd later after the unlinks are done. 5499 * properly done if it is fsync'd later after the unlinks are done.
5500 *
5501 * Must be called before the unlink operations (updates to the subvolume tree,
5502 * inodes, etc) are done.
5450 */ 5503 */
5451void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5504void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5452 struct inode *dir, struct inode *inode, 5505 struct inode *dir, struct inode *inode,
@@ -5462,8 +5515,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5462 * into the file. When the file is logged we check it and 5515 * into the file. When the file is logged we check it and
5463 * don't log the parents if the file is fully on disk. 5516 * don't log the parents if the file is fully on disk.
5464 */ 5517 */
5465 if (S_ISREG(inode->i_mode)) 5518 if (S_ISREG(inode->i_mode)) {
5519 mutex_lock(&BTRFS_I(inode)->log_mutex);
5466 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5520 BTRFS_I(inode)->last_unlink_trans = trans->transid;
5521 mutex_unlock(&BTRFS_I(inode)->log_mutex);
5522 }
5467 5523
5468 /* 5524 /*
5469 * if this directory was already logged any new 5525 * if this directory was already logged any new
@@ -5494,7 +5550,29 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5494 return; 5550 return;
5495 5551
5496record: 5552record:
5553 mutex_lock(&BTRFS_I(dir)->log_mutex);
5554 BTRFS_I(dir)->last_unlink_trans = trans->transid;
5555 mutex_unlock(&BTRFS_I(dir)->log_mutex);
5556}
5557
5558/*
5559 * Make sure that if someone attempts to fsync the parent directory of a deleted
5560 * snapshot, it ends up triggering a transaction commit. This is to guarantee
5561 * that after replaying the log tree of the parent directory's root we will not
5562 * see the snapshot anymore and at log replay time we will not see any log tree
5563 * corresponding to the deleted snapshot's root, which could lead to replaying
5564 * it after replaying the log tree of the parent directory (which would replay
5565 * the snapshot delete operation).
5566 *
5567 * Must be called before the actual snapshot destroy operation (updates to the
5568 * parent root and tree of tree roots trees, etc) are done.
5569 */
5570void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
5571 struct inode *dir)
5572{
5573 mutex_lock(&BTRFS_I(dir)->log_mutex);
5497 BTRFS_I(dir)->last_unlink_trans = trans->transid; 5574 BTRFS_I(dir)->last_unlink_trans = trans->transid;
5575 mutex_unlock(&BTRFS_I(dir)->log_mutex);
5498} 5576}
5499 5577
5500/* 5578/*
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 6916a781ea02..a9f1b75d080d 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -79,6 +79,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root);
79void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 79void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
80 struct inode *dir, struct inode *inode, 80 struct inode *dir, struct inode *inode,
81 int for_rename); 81 int for_rename);
82void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
83 struct inode *dir);
82int btrfs_log_new_name(struct btrfs_trans_handle *trans, 84int btrfs_log_new_name(struct btrfs_trans_handle *trans,
83 struct inode *inode, struct inode *old_dir, 85 struct inode *inode, struct inode *old_dir,
84 struct dentry *parent); 86 struct dentry *parent);
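The tree-log changes are all one pattern: last_unlink_trans becomes a value
published under the inode's log_mutex, and the fsync side checks it only after
it has finished logging. Because the unlink side publishes before it touches
the tree, a logger that still sees an old value is guaranteed to have captured
a consistent pre-unlink state; otherwise it falls back to a full transaction
commit. A schematic of the two sides, with generic names standing in for the
btrfs structures:

    /* unlink/rename side: publish before doing the operation */
    mutex_lock(&obj->log_mutex);
    obj->last_unlink_trans = trans->transid;
    mutex_unlock(&obj->log_mutex);
    do_unlink(obj);                         /* hypothetical */

    /* fsync side: log first, then check under the same mutex */
    log_inode(obj);                         /* hypothetical */
    mutex_lock(&obj->log_mutex);
    if (obj->last_unlink_trans > last_committed)
            need_full_commit = true;        /* can't trust the log alone */
    mutex_unlock(&obj->log_mutex);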
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 366b335946fa..e2b54d546b7c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -138,7 +138,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
138{ 138{
139 struct btrfs_fs_devices *fs_devs; 139 struct btrfs_fs_devices *fs_devs;
140 140
141 fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); 141 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
142 if (!fs_devs) 142 if (!fs_devs)
143 return ERR_PTR(-ENOMEM); 143 return ERR_PTR(-ENOMEM);
144 144
@@ -220,7 +220,7 @@ static struct btrfs_device *__alloc_device(void)
220{ 220{
221 struct btrfs_device *dev; 221 struct btrfs_device *dev;
222 222
223 dev = kzalloc(sizeof(*dev), GFP_NOFS); 223 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
224 if (!dev) 224 if (!dev)
225 return ERR_PTR(-ENOMEM); 225 return ERR_PTR(-ENOMEM);
226 226
@@ -733,7 +733,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
733 * uuid mutex so nothing we touch in here is going to disappear. 733 * uuid mutex so nothing we touch in here is going to disappear.
734 */ 734 */
735 if (orig_dev->name) { 735 if (orig_dev->name) {
736 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); 736 name = rcu_string_strdup(orig_dev->name->str,
737 GFP_KERNEL);
737 if (!name) { 738 if (!name) {
738 kfree(device); 739 kfree(device);
739 goto error; 740 goto error;
@@ -1714,12 +1715,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1714 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 1715 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1715 1716
1716 num_devices = root->fs_info->fs_devices->num_devices; 1717 num_devices = root->fs_info->fs_devices->num_devices;
1717 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1718 btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
1718 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1719 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1719 WARN_ON(num_devices < 1); 1720 WARN_ON(num_devices < 1);
1720 num_devices--; 1721 num_devices--;
1721 } 1722 }
1722 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1723 btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
1723 1724
1724 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1725 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1725 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; 1726 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
@@ -2287,7 +2288,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2287 goto error; 2288 goto error;
2288 } 2289 }
2289 2290
2290 name = rcu_string_strdup(device_path, GFP_NOFS); 2291 name = rcu_string_strdup(device_path, GFP_KERNEL);
2291 if (!name) { 2292 if (!name) {
2292 kfree(device); 2293 kfree(device);
2293 ret = -ENOMEM; 2294 ret = -ENOMEM;
@@ -2748,7 +2749,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2748 em->start + em->len < chunk_offset) { 2749 em->start + em->len < chunk_offset) {
2749 /* 2750 /*
2750 * This is a logic error, but we don't want to just rely on the 2751 * This is a logic error, but we don't want to just rely on the
2751 * user having built with ASSERT enabled, so if ASSERT doens't 2752 * user having built with ASSERT enabled, so if ASSERT doesn't
2752 * do anything we still error out. 2753 * do anything we still error out.
2753 */ 2754 */
2754 ASSERT(0); 2755 ASSERT(0);
@@ -2966,7 +2967,7 @@ static int insert_balance_item(struct btrfs_root *root,
2966 } 2967 }
2967 2968
2968 key.objectid = BTRFS_BALANCE_OBJECTID; 2969 key.objectid = BTRFS_BALANCE_OBJECTID;
2969 key.type = BTRFS_BALANCE_ITEM_KEY; 2970 key.type = BTRFS_TEMPORARY_ITEM_KEY;
2970 key.offset = 0; 2971 key.offset = 0;
2971 2972
2972 ret = btrfs_insert_empty_item(trans, root, path, &key, 2973 ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -3015,7 +3016,7 @@ static int del_balance_item(struct btrfs_root *root)
3015 } 3016 }
3016 3017
3017 key.objectid = BTRFS_BALANCE_OBJECTID; 3018 key.objectid = BTRFS_BALANCE_OBJECTID;
3018 key.type = BTRFS_BALANCE_ITEM_KEY; 3019 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3019 key.offset = 0; 3020 key.offset = 0;
3020 3021
3021 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3022 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -3686,12 +3687,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3686 } 3687 }
3687 3688
3688 num_devices = fs_info->fs_devices->num_devices; 3689 num_devices = fs_info->fs_devices->num_devices;
3689 btrfs_dev_replace_lock(&fs_info->dev_replace); 3690 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
3690 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3691 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3691 BUG_ON(num_devices < 1); 3692 BUG_ON(num_devices < 1);
3692 num_devices--; 3693 num_devices--;
3693 } 3694 }
3694 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3695 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3695 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3696 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3696 if (num_devices == 1) 3697 if (num_devices == 1)
3697 allowed |= BTRFS_BLOCK_GROUP_DUP; 3698 allowed |= BTRFS_BLOCK_GROUP_DUP;
@@ -3867,7 +3868,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3867 return -ENOMEM; 3868 return -ENOMEM;
3868 3869
3869 key.objectid = BTRFS_BALANCE_OBJECTID; 3870 key.objectid = BTRFS_BALANCE_OBJECTID;
3870 key.type = BTRFS_BALANCE_ITEM_KEY; 3871 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3871 key.offset = 0; 3872 key.offset = 0;
3872 3873
3873 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3874 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
@@ -4118,7 +4119,7 @@ out:
4118 * Callback for btrfs_uuid_tree_iterate(). 4119 * Callback for btrfs_uuid_tree_iterate().
4119 * returns: 4120 * returns:
4120 * 0 check succeeded, the entry is not outdated. 4121 * 0 check succeeded, the entry is not outdated.
4121 * < 0 if an error occured. 4122 * < 0 if an error occurred.
4122 * > 0 if the check failed, which means the caller shall remove the entry. 4123 * > 0 if the check failed, which means the caller shall remove the entry.
4123 */ 4124 */
4124static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 4125static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
@@ -5062,10 +5063,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5062 ret = 1; 5063 ret = 1;
5063 free_extent_map(em); 5064 free_extent_map(em);
5064 5065
5065 btrfs_dev_replace_lock(&fs_info->dev_replace); 5066 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
5066 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 5067 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
5067 ret++; 5068 ret++;
5068 btrfs_dev_replace_unlock(&fs_info->dev_replace); 5069 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
5069 5070
5070 return ret; 5071 return ret;
5071} 5072}
@@ -5325,10 +5326,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5325 if (!bbio_ret) 5326 if (!bbio_ret)
5326 goto out; 5327 goto out;
5327 5328
5328 btrfs_dev_replace_lock(dev_replace); 5329 btrfs_dev_replace_lock(dev_replace, 0);
5329 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5330 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
5330 if (!dev_replace_is_ongoing) 5331 if (!dev_replace_is_ongoing)
5331 btrfs_dev_replace_unlock(dev_replace); 5332 btrfs_dev_replace_unlock(dev_replace, 0);
5333 else
5334 btrfs_dev_replace_set_lock_blocking(dev_replace);
5332 5335
5333 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5336 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
5334 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && 5337 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
@@ -5751,8 +5754,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5751 bbio->mirror_num = map->num_stripes + 1; 5754 bbio->mirror_num = map->num_stripes + 1;
5752 } 5755 }
5753out: 5756out:
5754 if (dev_replace_is_ongoing) 5757 if (dev_replace_is_ongoing) {
5755 btrfs_dev_replace_unlock(dev_replace); 5758 btrfs_dev_replace_clear_lock_blocking(dev_replace);
5759 btrfs_dev_replace_unlock(dev_replace, 0);
5760 }
5756 free_extent_map(em); 5761 free_extent_map(em);
5757 return ret; 5762 return ret;
5758} 5763}
@@ -6705,8 +6710,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
6705 int item_size; 6710 int item_size;
6706 struct btrfs_dev_stats_item *ptr; 6711 struct btrfs_dev_stats_item *ptr;
6707 6712
6708 key.objectid = 0; 6713 key.objectid = BTRFS_DEV_STATS_OBJECTID;
6709 key.type = BTRFS_DEV_STATS_KEY; 6714 key.type = BTRFS_PERSISTENT_ITEM_KEY;
6710 key.offset = device->devid; 6715 key.offset = device->devid;
6711 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 6716 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
6712 if (ret) { 6717 if (ret) {
@@ -6753,8 +6758,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6753 int ret; 6758 int ret;
6754 int i; 6759 int i;
6755 6760
6756 key.objectid = 0; 6761 key.objectid = BTRFS_DEV_STATS_OBJECTID;
6757 key.type = BTRFS_DEV_STATS_KEY; 6762 key.type = BTRFS_PERSISTENT_ITEM_KEY;
6758 key.offset = device->devid; 6763 key.offset = device->devid;
6759 6764
6760 path = btrfs_alloc_path(); 6765 path = btrfs_alloc_path();
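Two of the volumes.c hunks consolidate on-disk key types: the balance status
item moves from its dedicated BTRFS_BALANCE_ITEM_KEY to the shared
BTRFS_TEMPORARY_ITEM_KEY, and the per-device statistics move from objectid 0
to BTRFS_DEV_STATS_OBJECTID under BTRFS_PERSISTENT_ITEM_KEY, with the devid
carried in the key offset so there is one item per device. Pulled out of the
diff, the two key triples are:

    struct btrfs_key key;

    /* balance status: a single per-fs item, offset unused */
    key.objectid = BTRFS_BALANCE_OBJECTID;
    key.type = BTRFS_TEMPORARY_ITEM_KEY;
    key.offset = 0;

    /* device statistics: one item per device, keyed by devid */
    key.objectid = BTRFS_DEV_STATS_OBJECTID;
    key.type = BTRFS_PERSISTENT_ITEM_KEY;
    key.offset = device->devid;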
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6c68d6356197..145d2b89e62d 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -249,7 +249,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
249 goto out; 249 goto out;
250 250
251 inode_inc_iversion(inode); 251 inode_inc_iversion(inode);
252 inode->i_ctime = CURRENT_TIME; 252 inode->i_ctime = current_fs_time(inode->i_sb);
253 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 253 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
254 ret = btrfs_update_inode(trans, root, inode); 254 ret = btrfs_update_inode(trans, root, inode);
255 BUG_ON(ret); 255 BUG_ON(ret);
@@ -260,16 +260,12 @@ out:
260 260
261ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) 261ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
262{ 262{
263 struct btrfs_key key, found_key; 263 struct btrfs_key key;
264 struct inode *inode = d_inode(dentry); 264 struct inode *inode = d_inode(dentry);
265 struct btrfs_root *root = BTRFS_I(inode)->root; 265 struct btrfs_root *root = BTRFS_I(inode)->root;
266 struct btrfs_path *path; 266 struct btrfs_path *path;
267 struct extent_buffer *leaf; 267 int ret = 0;
268 struct btrfs_dir_item *di;
269 int ret = 0, slot;
270 size_t total_size = 0, size_left = size; 268 size_t total_size = 0, size_left = size;
271 unsigned long name_ptr;
272 size_t name_len;
273 269
274 /* 270 /*
275 * ok we want all objects associated with this id. 271 * ok we want all objects associated with this id.
@@ -291,6 +287,13 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
291 goto err; 287 goto err;
292 288
293 while (1) { 289 while (1) {
290 struct extent_buffer *leaf;
291 int slot;
292 struct btrfs_dir_item *di;
293 struct btrfs_key found_key;
294 u32 item_size;
295 u32 cur;
296
294 leaf = path->nodes[0]; 297 leaf = path->nodes[0];
295 slot = path->slots[0]; 298 slot = path->slots[0];
296 299
@@ -316,31 +319,45 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
316 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 319 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
317 break; 320 break;
318 if (found_key.type < BTRFS_XATTR_ITEM_KEY) 321 if (found_key.type < BTRFS_XATTR_ITEM_KEY)
319 goto next; 322 goto next_item;
320 323
321 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 324 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
322 if (verify_dir_item(root, leaf, di)) 325 item_size = btrfs_item_size_nr(leaf, slot);
323 goto next; 326 cur = 0;
324 327 while (cur < item_size) {
325 name_len = btrfs_dir_name_len(leaf, di); 328 u16 name_len = btrfs_dir_name_len(leaf, di);
326 total_size += name_len + 1; 329 u16 data_len = btrfs_dir_data_len(leaf, di);
330 u32 this_len = sizeof(*di) + name_len + data_len;
331 unsigned long name_ptr = (unsigned long)(di + 1);
332
333 if (verify_dir_item(root, leaf, di)) {
334 ret = -EIO;
335 goto err;
336 }
327 337
328 /* we are just looking for how big our buffer needs to be */ 338 total_size += name_len + 1;
329 if (!size) 339 /*
330 goto next; 340 * We are just looking for how big our buffer needs to
341 * be.
342 */
343 if (!size)
344 goto next;
331 345
332 if (!buffer || (name_len + 1) > size_left) { 346 if (!buffer || (name_len + 1) > size_left) {
333 ret = -ERANGE; 347 ret = -ERANGE;
334 goto err; 348 goto err;
335 } 349 }
336 350
337 name_ptr = (unsigned long)(di + 1); 351 read_extent_buffer(leaf, buffer, name_ptr, name_len);
338 read_extent_buffer(leaf, buffer, name_ptr, name_len); 352 buffer[name_len] = '\0';
339 buffer[name_len] = '\0';
340 353
341 size_left -= name_len + 1; 354 size_left -= name_len + 1;
342 buffer += name_len + 1; 355 buffer += name_len + 1;
343next: 356next:
357 cur += this_len;
358 di = (struct btrfs_dir_item *)((char *)di + this_len);
359 }
360next_item:
344 path->slots[0]++; 361 path->slots[0]++;
345 } 362 }
346 ret = total_size; 363 ret = total_size;
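The listxattr rewrite above accounts for several btrfs_dir_item entries being
packed back to back inside a single XATTR_ITEM leaf item: each entry is a
header followed by name_len name bytes and data_len value bytes, so the inner
loop advances by sizeof(*di) + name_len + data_len until item_size is
consumed. Reduced to just the walk, using the accessors visible in the hunk:

    u32 item_size = btrfs_item_size_nr(leaf, slot);
    u32 cur = 0;
    struct btrfs_dir_item *di = btrfs_item_ptr(leaf, slot,
                                               struct btrfs_dir_item);

    while (cur < item_size) {
            u16 name_len = btrfs_dir_name_len(leaf, di);
            u16 data_len = btrfs_dir_data_len(leaf, di);
            u32 this_len = sizeof(*di) + name_len + data_len;

            /* the name starts right after the header, at di + 1 */

            cur += this_len;
            di = (struct btrfs_dir_item *)((char *)di + this_len);
    }

Note also that a verify_dir_item() failure is now a hard -EIO instead of a
silent skip of the item.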
diff --git a/fs/buffer.c b/fs/buffer.c
index e1632abb4ca9..33be29675358 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -621,17 +621,17 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
621 * If warn is true, then emit a warning if the page is not uptodate and has 621 * If warn is true, then emit a warning if the page is not uptodate and has
622 * not been truncated. 622 * not been truncated.
623 * 623 *
624 * The caller must hold mem_cgroup_begin_page_stat() lock. 624 * The caller must hold lock_page_memcg().
625 */ 625 */
626static void __set_page_dirty(struct page *page, struct address_space *mapping, 626static void __set_page_dirty(struct page *page, struct address_space *mapping,
627 struct mem_cgroup *memcg, int warn) 627 int warn)
628{ 628{
629 unsigned long flags; 629 unsigned long flags;
630 630
631 spin_lock_irqsave(&mapping->tree_lock, flags); 631 spin_lock_irqsave(&mapping->tree_lock, flags);
632 if (page->mapping) { /* Race with truncate? */ 632 if (page->mapping) { /* Race with truncate? */
633 WARN_ON_ONCE(warn && !PageUptodate(page)); 633 WARN_ON_ONCE(warn && !PageUptodate(page));
634 account_page_dirtied(page, mapping, memcg); 634 account_page_dirtied(page, mapping);
635 radix_tree_tag_set(&mapping->page_tree, 635 radix_tree_tag_set(&mapping->page_tree,
636 page_index(page), PAGECACHE_TAG_DIRTY); 636 page_index(page), PAGECACHE_TAG_DIRTY);
637 } 637 }
@@ -666,7 +666,6 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping,
666int __set_page_dirty_buffers(struct page *page) 666int __set_page_dirty_buffers(struct page *page)
667{ 667{
668 int newly_dirty; 668 int newly_dirty;
669 struct mem_cgroup *memcg;
670 struct address_space *mapping = page_mapping(page); 669 struct address_space *mapping = page_mapping(page);
671 670
672 if (unlikely(!mapping)) 671 if (unlikely(!mapping))
@@ -683,17 +682,17 @@ int __set_page_dirty_buffers(struct page *page)
683 } while (bh != head); 682 } while (bh != head);
684 } 683 }
685 /* 684 /*
686 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with 685 * Lock out page->mem_cgroup migration to keep PageDirty
687 * per-memcg dirty page counters. 686 * synchronized with per-memcg dirty page counters.
688 */ 687 */
689 memcg = mem_cgroup_begin_page_stat(page); 688 lock_page_memcg(page);
690 newly_dirty = !TestSetPageDirty(page); 689 newly_dirty = !TestSetPageDirty(page);
691 spin_unlock(&mapping->private_lock); 690 spin_unlock(&mapping->private_lock);
692 691
693 if (newly_dirty) 692 if (newly_dirty)
694 __set_page_dirty(page, mapping, memcg, 1); 693 __set_page_dirty(page, mapping, 1);
695 694
696 mem_cgroup_end_page_stat(memcg); 695 unlock_page_memcg(page);
697 696
698 if (newly_dirty) 697 if (newly_dirty)
699 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 698 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1167,15 +1166,14 @@ void mark_buffer_dirty(struct buffer_head *bh)
1167 if (!test_set_buffer_dirty(bh)) { 1166 if (!test_set_buffer_dirty(bh)) {
1168 struct page *page = bh->b_page; 1167 struct page *page = bh->b_page;
1169 struct address_space *mapping = NULL; 1168 struct address_space *mapping = NULL;
1170 struct mem_cgroup *memcg;
1171 1169
1172 memcg = mem_cgroup_begin_page_stat(page); 1170 lock_page_memcg(page);
1173 if (!TestSetPageDirty(page)) { 1171 if (!TestSetPageDirty(page)) {
1174 mapping = page_mapping(page); 1172 mapping = page_mapping(page);
1175 if (mapping) 1173 if (mapping)
1176 __set_page_dirty(page, mapping, memcg, 0); 1174 __set_page_dirty(page, mapping, 0);
1177 } 1175 }
1178 mem_cgroup_end_page_stat(memcg); 1176 unlock_page_memcg(page);
1179 if (mapping) 1177 if (mapping)
1180 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1178 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1181 } 1179 }
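The buffer.c hunks track an mm API change: mem_cgroup_begin_page_stat() and
mem_cgroup_end_page_stat(), which handed back a struct mem_cgroup cookie,
become lock_page_memcg() and unlock_page_memcg(), which simply pin
page->mem_cgroup for the critical section. The dirty-set path ends up shaped
like this (mirroring the hunk above):

    lock_page_memcg(page);
    if (!TestSetPageDirty(page)) {
            struct address_space *mapping = page_mapping(page);

            if (mapping)
                    __set_page_dirty(page, mapping, 0); /* no memcg arg */
    }
    unlock_page_memcg(page);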
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 452e98dd7560..1ee54ffd3a24 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -162,6 +162,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
162 size_t buflen, loff_t *pos) 162 size_t buflen, loff_t *pos)
163{ 163{
164 struct cachefiles_cache *cache = file->private_data; 164 struct cachefiles_cache *cache = file->private_data;
165 unsigned long long b_released;
166 unsigned f_released;
165 char buffer[256]; 167 char buffer[256];
166 int n; 168 int n;
167 169
@@ -174,6 +176,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
174 cachefiles_has_space(cache, 0, 0); 176 cachefiles_has_space(cache, 0, 0);
175 177
176 /* summarise */ 178 /* summarise */
179 f_released = atomic_xchg(&cache->f_released, 0);
180 b_released = atomic_long_xchg(&cache->b_released, 0);
177 clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags); 181 clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
178 182
179 n = snprintf(buffer, sizeof(buffer), 183 n = snprintf(buffer, sizeof(buffer),
@@ -183,15 +187,18 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
183 " fstop=%llx" 187 " fstop=%llx"
184 " brun=%llx" 188 " brun=%llx"
185 " bcull=%llx" 189 " bcull=%llx"
186 " bstop=%llx", 190 " bstop=%llx"
191 " freleased=%x"
192 " breleased=%llx",
187 test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0', 193 test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
188 (unsigned long long) cache->frun, 194 (unsigned long long) cache->frun,
189 (unsigned long long) cache->fcull, 195 (unsigned long long) cache->fcull,
190 (unsigned long long) cache->fstop, 196 (unsigned long long) cache->fstop,
191 (unsigned long long) cache->brun, 197 (unsigned long long) cache->brun,
192 (unsigned long long) cache->bcull, 198 (unsigned long long) cache->bcull,
193 (unsigned long long) cache->bstop 199 (unsigned long long) cache->bstop,
194 ); 200 f_released,
201 b_released);
195 202
196 if (n > buflen) 203 if (n > buflen)
197 return -EMSGSIZE; 204 return -EMSGSIZE;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 675a3332d72f..861d611b8c05 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -291,15 +291,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
291 } 291 }
292 292
293 /* note that the object is now inactive */ 293 /* note that the object is now inactive */
294 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { 294 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
295 write_lock(&cache->active_lock); 295 cachefiles_mark_object_inactive(cache, object);
296 if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
297 &object->flags))
298 BUG();
299 rb_erase(&object->active_node, &cache->active_nodes);
300 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
301 write_unlock(&cache->active_lock);
302 }
303 296
304 dput(object->dentry); 297 dput(object->dentry);
305 object->dentry = NULL; 298 object->dentry = NULL;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 9c4b737a54df..2fcde1a34b7c 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -66,6 +66,8 @@ struct cachefiles_cache {
66 struct rb_root active_nodes; /* active nodes (can't be culled) */ 66 struct rb_root active_nodes; /* active nodes (can't be culled) */
67 rwlock_t active_lock; /* lock for active_nodes */ 67 rwlock_t active_lock; /* lock for active_nodes */
68 atomic_t gravecounter; /* graveyard uniquifier */ 68 atomic_t gravecounter; /* graveyard uniquifier */
69 atomic_t f_released; /* number of objects released lately */
70 atomic_long_t b_released; /* number of blocks released lately */
69 unsigned frun_percent; /* when to stop culling (% files) */ 71 unsigned frun_percent; /* when to stop culling (% files) */
70 unsigned fcull_percent; /* when to start culling (% files) */ 72 unsigned fcull_percent; /* when to start culling (% files) */
71 unsigned fstop_percent; /* when to stop allocating (% files) */ 73 unsigned fstop_percent; /* when to stop allocating (% files) */
@@ -157,6 +159,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
157/* 159/*
158 * namei.c 160 * namei.c
159 */ 161 */
162extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
163 struct cachefiles_object *object);
160extern int cachefiles_delete_object(struct cachefiles_cache *cache, 164extern int cachefiles_delete_object(struct cachefiles_cache *cache,
161 struct cachefiles_object *object); 165 struct cachefiles_object *object);
162extern int cachefiles_walk_to_object(struct cachefiles_object *parent, 166extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 1c2334c163dd..4ae75006e73b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -258,6 +258,28 @@ requeue:
258} 258}
259 259
260/* 260/*
261 * Mark an object as being inactive.
262 */
263void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
264 struct cachefiles_object *object)
265{
266 write_lock(&cache->active_lock);
267 rb_erase(&object->active_node, &cache->active_nodes);
268 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
269 write_unlock(&cache->active_lock);
270
271 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
272
273 /* This object can now be culled, so we need to let the daemon know
274 * that there is something it can remove if it needs to.
275 */
276 atomic_long_add(d_backing_inode(object->dentry)->i_blocks,
277 &cache->b_released);
278 if (atomic_inc_return(&cache->f_released))
279 cachefiles_state_changed(cache);
280}
281
282/*
261 * delete an object representation from the cache 283 * delete an object representation from the cache
262 * - file backed objects are unlinked 284 * - file backed objects are unlinked
263 * - directory backed objects are stuffed into the graveyard for userspace to 285 * - directory backed objects are stuffed into the graveyard for userspace to
@@ -684,11 +706,7 @@ mark_active_timed_out:
684 706
685check_error: 707check_error:
686 _debug("check error %d", ret); 708 _debug("check error %d", ret);
687 write_lock(&cache->active_lock); 709 cachefiles_mark_object_inactive(cache, object);
688 rb_erase(&object->active_node, &cache->active_nodes);
689 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
690 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
691 write_unlock(&cache->active_lock);
692release_dentry: 710release_dentry:
693 dput(object->dentry); 711 dput(object->dentry);
694 object->dentry = NULL; 712 object->dentry = NULL;
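The cachefiles changes introduce read-and-reset release counters:
cachefiles_mark_object_inactive() bumps f_released and b_released whenever an
object becomes cullable, and the daemon-read path drains both with
atomic_xchg(..., 0), so each read reports only the releases since the previous
one. The generic shape of the pattern, with a hypothetical counter:

    static atomic_t events;

    static void producer(void)
    {
            atomic_inc(&events);            /* cheap bump on the hot path */
    }

    static unsigned int consumer_snapshot(void)
    {
            /* read and reset atomically: each event is reported once */
            return atomic_xchg(&events, 0);
    }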
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 19adeb0ef82a..fc5cae2a0db2 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -175,8 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
175 175
176static int ceph_releasepage(struct page *page, gfp_t g) 176static int ceph_releasepage(struct page *page, gfp_t g)
177{ 177{
178 struct inode *inode = page->mapping ? page->mapping->host : NULL; 178 dout("%p releasepage %p idx %lu\n", page->mapping->host,
179 dout("%p releasepage %p idx %lu\n", inode, page, page->index); 179 page, page->index);
180 WARN_ON(PageDirty(page)); 180 WARN_ON(PageDirty(page));
181 181
182 /* Can we release the page from the cache? */ 182 /* Can we release the page from the cache? */
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
276 for (i = 0; i < num_pages; i++) { 276 for (i = 0; i < num_pages; i++) {
277 struct page *page = osd_data->pages[i]; 277 struct page *page = osd_data->pages[i];
278 278
279 if (rc < 0 && rc != ENOENT) 279 if (rc < 0 && rc != -ENOENT)
280 goto unlock; 280 goto unlock;
281 if (bytes < (int)PAGE_CACHE_SIZE) { 281 if (bytes < (int)PAGE_CACHE_SIZE) {
282 /* zero (remainder of) page */ 282 /* zero (remainder of) page */
@@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req,
606 struct inode *inode = req->r_inode; 606 struct inode *inode = req->r_inode;
607 struct ceph_inode_info *ci = ceph_inode(inode); 607 struct ceph_inode_info *ci = ceph_inode(inode);
608 struct ceph_osd_data *osd_data; 608 struct ceph_osd_data *osd_data;
609 unsigned wrote;
610 struct page *page; 609 struct page *page;
611 int num_pages; 610 int num_pages, total_pages = 0;
612 int i; 611 int i, j;
612 int rc = req->r_result;
613 struct ceph_snap_context *snapc = req->r_snapc; 613 struct ceph_snap_context *snapc = req->r_snapc;
614 struct address_space *mapping = inode->i_mapping; 614 struct address_space *mapping = inode->i_mapping;
615 int rc = req->r_result;
616 u64 bytes = req->r_ops[0].extent.length;
617 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 615 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
618 long writeback_stat; 616 bool remove_page;
619 unsigned issued = ceph_caps_issued(ci);
620 617
621 osd_data = osd_req_op_extent_osd_data(req, 0); 618
622 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); 619 dout("writepages_finish %p rc %d\n", inode, rc);
623 num_pages = calc_pages_for((u64)osd_data->alignment, 620 if (rc < 0)
624 (u64)osd_data->length);
625 if (rc >= 0) {
626 /*
627 * Assume we wrote the pages we originally sent. The
628 * osd might reply with fewer pages if our writeback
629 * raced with a truncation and was adjusted at the osd,
630 * so don't believe the reply.
631 */
632 wrote = num_pages;
633 } else {
634 wrote = 0;
635 mapping_set_error(mapping, rc); 621 mapping_set_error(mapping, rc);
636 }
637 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
638 inode, rc, bytes, wrote);
639 622
640 /* clean all pages */ 623 /*
641 for (i = 0; i < num_pages; i++) { 624 * We lost the cache cap, need to truncate the page before
642 page = osd_data->pages[i]; 625 * it is unlocked, otherwise we'd truncate it later in the
643 BUG_ON(!page); 626 * page truncation thread, possibly losing some data that
644 WARN_ON(!PageUptodate(page)); 627 * raced its way in
628 */
629 remove_page = !(ceph_caps_issued(ci) &
630 (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
645 631
646 writeback_stat = 632 /* clean all pages */
647 atomic_long_dec_return(&fsc->writeback_count); 633 for (i = 0; i < req->r_num_ops; i++) {
648 if (writeback_stat < 634 if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
649 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) 635 break;
650 clear_bdi_congested(&fsc->backing_dev_info,
651 BLK_RW_ASYNC);
652 636
653 ceph_put_snap_context(page_snap_context(page)); 637 osd_data = osd_req_op_extent_osd_data(req, i);
654 page->private = 0; 638 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
655 ClearPagePrivate(page); 639 num_pages = calc_pages_for((u64)osd_data->alignment,
656 dout("unlocking %d %p\n", i, page); 640 (u64)osd_data->length);
657 end_page_writeback(page); 641 total_pages += num_pages;
642 for (j = 0; j < num_pages; j++) {
643 page = osd_data->pages[j];
644 BUG_ON(!page);
645 WARN_ON(!PageUptodate(page));
646
647 if (atomic_long_dec_return(&fsc->writeback_count) <
648 CONGESTION_OFF_THRESH(
649 fsc->mount_options->congestion_kb))
650 clear_bdi_congested(&fsc->backing_dev_info,
651 BLK_RW_ASYNC);
652
653 ceph_put_snap_context(page_snap_context(page));
654 page->private = 0;
655 ClearPagePrivate(page);
656 dout("unlocking %p\n", page);
657 end_page_writeback(page);
658
659 if (remove_page)
660 generic_error_remove_page(inode->i_mapping,
661 page);
658 662
659 /* 663 unlock_page(page);
660 * We lost the cache cap, need to truncate the page before 664 }
661 * it is unlocked, otherwise we'd truncate it later in the 665 dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
662 * page truncation thread, possibly losing some data that 666 inode, osd_data->length, rc >= 0 ? num_pages : 0);
663 * raced its way in
664 */
665 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
666 generic_error_remove_page(inode->i_mapping, page);
667 667
668 unlock_page(page); 668 ceph_release_pages(osd_data->pages, num_pages);
669 } 669 }
670 dout("%p wrote+cleaned %d pages\n", inode, wrote);
671 ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
672 670
673 ceph_release_pages(osd_data->pages, num_pages); 671 ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
672
673 osd_data = osd_req_op_extent_osd_data(req, 0);
674 if (osd_data->pages_from_pool) 674 if (osd_data->pages_from_pool)
675 mempool_free(osd_data->pages, 675 mempool_free(osd_data->pages,
676 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); 676 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
@@ -778,17 +778,15 @@ retry:
778 while (!done && index <= end) { 778 while (!done && index <= end) {
779 unsigned i; 779 unsigned i;
780 int first; 780 int first;
781 pgoff_t next; 781 pgoff_t strip_unit_end = 0;
782 int pvec_pages, locked_pages; 782 int num_ops = 0, op_idx;
783 struct page **pages = NULL; 783 int pvec_pages, locked_pages = 0;
784 struct page **pages = NULL, **data_pages;
784 mempool_t *pool = NULL; /* Becomes non-null if mempool used */ 785 mempool_t *pool = NULL; /* Becomes non-null if mempool used */
785 struct page *page; 786 struct page *page;
786 int want; 787 int want;
787 u64 offset, len; 788 u64 offset = 0, len = 0;
788 long writeback_stat;
789 789
790 next = 0;
791 locked_pages = 0;
792 max_pages = max_pages_ever; 790 max_pages = max_pages_ever;
793 791
794get_more_pages: 792get_more_pages:
@@ -824,8 +822,8 @@ get_more_pages:
824 unlock_page(page); 822 unlock_page(page);
825 break; 823 break;
826 } 824 }
827 if (next && (page->index != next)) { 825 if (strip_unit_end && (page->index > strip_unit_end)) {
828 dout("not consecutive %p\n", page); 826 dout("end of strip unit %p\n", page);
829 unlock_page(page); 827 unlock_page(page);
830 break; 828 break;
831 } 829 }
@@ -867,36 +865,31 @@ get_more_pages:
867 /* 865 /*
868 * We have something to write. If this is 866 * We have something to write. If this is
869 * the first locked page this time through, 867 * the first locked page this time through,
870 * allocate an osd request and a page array 868 * calculate max possible write size and
871 * that it will use. 869 * allocate a page array
872 */ 870 */
873 if (locked_pages == 0) { 871 if (locked_pages == 0) {
874 BUG_ON(pages); 872 u64 objnum;
873 u64 objoff;
874
875 /* prepare async write request */ 875 /* prepare async write request */
876 offset = (u64)page_offset(page); 876 offset = (u64)page_offset(page);
877 len = wsize; 877 len = wsize;
878 req = ceph_osdc_new_request(&fsc->client->osdc, 878
879 &ci->i_layout, vino, 879 rc = ceph_calc_file_object_mapping(&ci->i_layout,
880 offset, &len, 0, 880 offset, len,
881 do_sync ? 2 : 1, 881 &objnum, &objoff,
882 CEPH_OSD_OP_WRITE, 882 &len);
883 CEPH_OSD_FLAG_WRITE | 883 if (rc < 0) {
884 CEPH_OSD_FLAG_ONDISK,
885 snapc, truncate_seq,
886 truncate_size, true);
887 if (IS_ERR(req)) {
888 rc = PTR_ERR(req);
889 unlock_page(page); 884 unlock_page(page);
890 break; 885 break;
891 } 886 }
892 887
893 if (do_sync) 888 num_ops = 1 + do_sync;
894 osd_req_op_init(req, 1, 889 strip_unit_end = page->index +
895 CEPH_OSD_OP_STARTSYNC, 0); 890 ((len - 1) >> PAGE_CACHE_SHIFT);
896
897 req->r_callback = writepages_finish;
898 req->r_inode = inode;
899 891
892 BUG_ON(pages);
900 max_pages = calc_pages_for(0, (u64)len); 893 max_pages = calc_pages_for(0, (u64)len);
901 pages = kmalloc(max_pages * sizeof (*pages), 894 pages = kmalloc(max_pages * sizeof (*pages),
902 GFP_NOFS); 895 GFP_NOFS);
@@ -905,6 +898,20 @@ get_more_pages:
905 pages = mempool_alloc(pool, GFP_NOFS); 898 pages = mempool_alloc(pool, GFP_NOFS);
906 BUG_ON(!pages); 899 BUG_ON(!pages);
907 } 900 }
901
902 len = 0;
903 } else if (page->index !=
904 (offset + len) >> PAGE_CACHE_SHIFT) {
905 if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
906 CEPH_OSD_MAX_OPS)) {
907 redirty_page_for_writepage(wbc, page);
908 unlock_page(page);
909 break;
910 }
911
912 num_ops++;
913 offset = (u64)page_offset(page);
914 len = 0;
908 } 915 }
909 916
910 /* note position of first page in pvec */ 917 /* note position of first page in pvec */
@@ -913,18 +920,16 @@ get_more_pages:
913 dout("%p will write page %p idx %lu\n", 920 dout("%p will write page %p idx %lu\n",
914 inode, page, page->index); 921 inode, page, page->index);
915 922
916 writeback_stat = 923 if (atomic_long_inc_return(&fsc->writeback_count) >
917 atomic_long_inc_return(&fsc->writeback_count); 924 CONGESTION_ON_THRESH(
918 if (writeback_stat > CONGESTION_ON_THRESH(
919 fsc->mount_options->congestion_kb)) { 925 fsc->mount_options->congestion_kb)) {
920 set_bdi_congested(&fsc->backing_dev_info, 926 set_bdi_congested(&fsc->backing_dev_info,
921 BLK_RW_ASYNC); 927 BLK_RW_ASYNC);
922 } 928 }
923 929
924 set_page_writeback(page);
925 pages[locked_pages] = page; 930 pages[locked_pages] = page;
926 locked_pages++; 931 locked_pages++;
927 next = page->index + 1; 932 len += PAGE_CACHE_SIZE;
928 } 933 }
929 934
930 /* did we get anything? */ 935 /* did we get anything? */
@@ -944,38 +949,119 @@ get_more_pages:
944 /* shift unused pages over in the pvec... we 949 /* shift unused pages over in the pvec... we
945 * will need to release them below. */ 950 * will need to release them below. */
946 for (j = i; j < pvec_pages; j++) { 951 for (j = i; j < pvec_pages; j++) {
947 dout(" pvec leftover page %p\n", 952 dout(" pvec leftover page %p\n", pvec.pages[j]);
948 pvec.pages[j]);
949 pvec.pages[j-i+first] = pvec.pages[j]; 953 pvec.pages[j-i+first] = pvec.pages[j];
950 } 954 }
951 pvec.nr -= i-first; 955 pvec.nr -= i-first;
952 } 956 }
953 957
954 /* Format the osd request message and submit the write */ 958new_request:
955 offset = page_offset(pages[0]); 959 offset = page_offset(pages[0]);
956 len = (u64)locked_pages << PAGE_CACHE_SHIFT; 960 len = wsize;
957 if (snap_size == -1) { 961
958 len = min(len, (u64)i_size_read(inode) - offset); 962 req = ceph_osdc_new_request(&fsc->client->osdc,
959 /* writepages_finish() clears writeback pages 963 &ci->i_layout, vino,
960 * according to the data length, so make sure 964 offset, &len, 0, num_ops,
961 * data length covers all locked pages */ 965 CEPH_OSD_OP_WRITE,
962 len = max(len, 1 + 966 CEPH_OSD_FLAG_WRITE |
963 ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); 967 CEPH_OSD_FLAG_ONDISK,
964 } else { 968 snapc, truncate_seq,
965 len = min(len, snap_size - offset); 969 truncate_size, false);
970 if (IS_ERR(req)) {
971 req = ceph_osdc_new_request(&fsc->client->osdc,
972 &ci->i_layout, vino,
973 offset, &len, 0,
974 min(num_ops,
975 CEPH_OSD_SLAB_OPS),
976 CEPH_OSD_OP_WRITE,
977 CEPH_OSD_FLAG_WRITE |
978 CEPH_OSD_FLAG_ONDISK,
979 snapc, truncate_seq,
980 truncate_size, true);
981 BUG_ON(IS_ERR(req));
966 } 982 }
967 dout("writepages got %d pages at %llu~%llu\n", 983 BUG_ON(len < page_offset(pages[locked_pages - 1]) +
968 locked_pages, offset, len); 984 PAGE_CACHE_SIZE - offset);
985
986 req->r_callback = writepages_finish;
987 req->r_inode = inode;
969 988
970 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 989 /* Format the osd request message and submit the write */
990 len = 0;
991 data_pages = pages;
992 op_idx = 0;
993 for (i = 0; i < locked_pages; i++) {
994 u64 cur_offset = page_offset(pages[i]);
995 if (offset + len != cur_offset) {
996 if (op_idx + do_sync + 1 == req->r_num_ops)
997 break;
998 osd_req_op_extent_dup_last(req, op_idx,
999 cur_offset - offset);
1000 dout("writepages got pages at %llu~%llu\n",
1001 offset, len);
1002 osd_req_op_extent_osd_data_pages(req, op_idx,
1003 data_pages, len, 0,
971 !!pool, false); 1004 !!pool, false);
1005 osd_req_op_extent_update(req, op_idx, len);
972 1006
973 pages = NULL; /* request message now owns the pages array */ 1007 len = 0;
974 pool = NULL; 1008 offset = cur_offset;
1009 data_pages = pages + i;
1010 op_idx++;
1011 }
975 1012
976 /* Update the write op length in case we changed it */ 1013 set_page_writeback(pages[i]);
1014 len += PAGE_CACHE_SIZE;
1015 }
1016
1017 if (snap_size != -1) {
1018 len = min(len, snap_size - offset);
1019 } else if (i == locked_pages) {
1020 /* writepages_finish() clears writeback pages
1021 * according to the data length, so make sure
1022 * data length covers all locked pages */
1023 u64 min_len = len + 1 - PAGE_CACHE_SIZE;
1024 len = min(len, (u64)i_size_read(inode) - offset);
1025 len = max(len, min_len);
1026 }
1027 dout("writepages got pages at %llu~%llu\n", offset, len);
1028
1029 osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
1030 0, !!pool, false);
1031 osd_req_op_extent_update(req, op_idx, len);
977 1032
978 osd_req_op_extent_update(req, 0, len); 1033 if (do_sync) {
1034 op_idx++;
1035 osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0);
1036 }
1037 BUG_ON(op_idx + 1 != req->r_num_ops);
1038
1039 pool = NULL;
1040 if (i < locked_pages) {
1041 BUG_ON(num_ops <= req->r_num_ops);
1042 num_ops -= req->r_num_ops;
1043 num_ops += do_sync;
1044 locked_pages -= i;
1045
1046 /* allocate new pages array for next request */
1047 data_pages = pages;
1048 pages = kmalloc(locked_pages * sizeof (*pages),
1049 GFP_NOFS);
1050 if (!pages) {
1051 pool = fsc->wb_pagevec_pool;
1052 pages = mempool_alloc(pool, GFP_NOFS);
1053 BUG_ON(!pages);
1054 }
1055 memcpy(pages, data_pages + i,
1056 locked_pages * sizeof(*pages));
1057 memset(data_pages + i, 0,
1058 locked_pages * sizeof(*pages));
1059 } else {
1060 BUG_ON(num_ops != req->r_num_ops);
1061 index = pages[i - 1]->index + 1;
1062 /* request message now owns the pages array */
1063 pages = NULL;
1064 }
979 1065
980 vino = ceph_vino(inode); 1066 vino = ceph_vino(inode);
981 ceph_osdc_build_request(req, offset, snapc, vino.snap, 1067 ceph_osdc_build_request(req, offset, snapc, vino.snap,
@@ -985,9 +1071,10 @@ get_more_pages:
985 BUG_ON(rc); 1071 BUG_ON(rc);
986 req = NULL; 1072 req = NULL;
987 1073
988 /* continue? */ 1074 wbc->nr_to_write -= i;
989 index = next; 1075 if (pages)
990 wbc->nr_to_write -= locked_pages; 1076 goto new_request;
1077
991 if (wbc->nr_to_write <= 0) 1078 if (wbc->nr_to_write <= 0)
992 done = 1; 1079 done = 1;
993 1080
@@ -1522,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
1522 ceph_vino(inode), 0, &len, 0, 1, 1609 ceph_vino(inode), 0, &len, 0, 1,
1523 CEPH_OSD_OP_CREATE, 1610 CEPH_OSD_OP_CREATE,
1524 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1611 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
1525 ceph_empty_snapc, 0, 0, false); 1612 NULL, 0, 0, false);
1526 if (IS_ERR(req)) { 1613 if (IS_ERR(req)) {
1527 err = PTR_ERR(req); 1614 err = PTR_ERR(req);
1528 goto out; 1615 goto out;
@@ -1540,9 +1627,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
1540 ceph_vino(inode), 0, &len, 1, 3, 1627 ceph_vino(inode), 0, &len, 1, 3,
1541 CEPH_OSD_OP_WRITE, 1628 CEPH_OSD_OP_WRITE,
1542 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1629 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
1543 ceph_empty_snapc, 1630 NULL, ci->i_truncate_seq,
1544 ci->i_truncate_seq, ci->i_truncate_size, 1631 ci->i_truncate_size, false);
1545 false);
1546 if (IS_ERR(req)) { 1632 if (IS_ERR(req)) {
1547 err = PTR_ERR(req); 1633 err = PTR_ERR(req);
1548 goto out; 1634 goto out;
@@ -1663,8 +1749,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
1663 goto out; 1749 goto out;
1664 } 1750 }
1665 1751
1666 rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, 1752 rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1667 ceph_empty_snapc,
1668 1, false, GFP_NOFS); 1753 1, false, GFP_NOFS);
1669 if (!rd_req) { 1754 if (!rd_req) {
1670 err = -ENOMEM; 1755 err = -ENOMEM;
@@ -1678,8 +1763,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
1678 "%llx.00000000", ci->i_vino.ino); 1763 "%llx.00000000", ci->i_vino.ino);
1679 rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); 1764 rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
1680 1765
1681 wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, 1766 wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1682 ceph_empty_snapc,
1683 1, false, GFP_NOFS); 1767 1, false, GFP_NOFS);
1684 if (!wr_req) { 1768 if (!wr_req) {
1685 err = -ENOMEM; 1769 err = -ENOMEM;
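The ceph_writepages_start() rework above no longer requires a strictly
consecutive page run per OSD request: it gathers dirty pages up to the end of
a strip unit, then splits the locked run into one CEPH_OSD_OP_WRITE extent op
per contiguous range, bounded by the per-request op limit. Schematically,
with emit_write_op() as a hypothetical stand-in for the
osd_req_op_extent_osd_data_pages()/osd_req_op_extent_update() pair:

    u64 offset = page_offset(pages[0]);
    u64 len = 0;
    int i, op = 0;

    for (i = 0; i < locked_pages; i++) {
            u64 cur = page_offset(pages[i]);

            if (offset + len != cur) {
                    /* gap before this page: close the current extent op */
                    emit_write_op(req, op++, offset, len);
                    offset = cur;
                    len = 0;
            }
            len += PAGE_CACHE_SIZE;
    }
    emit_write_op(req, op, offset, len);    /* final extent */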
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6fe0ad26a7df..de17bb232ff8 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -991,7 +991,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
991 u32 seq, u64 flush_tid, u64 oldest_flush_tid, 991 u32 seq, u64 flush_tid, u64 oldest_flush_tid,
992 u32 issue_seq, u32 mseq, u64 size, u64 max_size, 992 u32 issue_seq, u32 mseq, u64 size, u64 max_size,
993 struct timespec *mtime, struct timespec *atime, 993 struct timespec *mtime, struct timespec *atime,
994 u64 time_warp_seq, 994 struct timespec *ctime, u64 time_warp_seq,
995 kuid_t uid, kgid_t gid, umode_t mode, 995 kuid_t uid, kgid_t gid, umode_t mode,
996 u64 xattr_version, 996 u64 xattr_version,
997 struct ceph_buffer *xattrs_buf, 997 struct ceph_buffer *xattrs_buf,
@@ -1042,6 +1042,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
1042 ceph_encode_timespec(&fc->mtime, mtime); 1042 ceph_encode_timespec(&fc->mtime, mtime);
1043 if (atime) 1043 if (atime)
1044 ceph_encode_timespec(&fc->atime, atime); 1044 ceph_encode_timespec(&fc->atime, atime);
1045 if (ctime)
1046 ceph_encode_timespec(&fc->ctime, ctime);
1045 fc->time_warp_seq = cpu_to_le32(time_warp_seq); 1047 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
1046 1048
1047 fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); 1049 fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
@@ -1116,7 +1118,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1116 int held, revoking, dropping, keep; 1118 int held, revoking, dropping, keep;
1117 u64 seq, issue_seq, mseq, time_warp_seq, follows; 1119 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1118 u64 size, max_size; 1120 u64 size, max_size;
1119 struct timespec mtime, atime; 1121 struct timespec mtime, atime, ctime;
1120 int wake = 0; 1122 int wake = 0;
1121 umode_t mode; 1123 umode_t mode;
1122 kuid_t uid; 1124 kuid_t uid;
@@ -1180,6 +1182,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1180 ci->i_requested_max_size = max_size; 1182 ci->i_requested_max_size = max_size;
1181 mtime = inode->i_mtime; 1183 mtime = inode->i_mtime;
1182 atime = inode->i_atime; 1184 atime = inode->i_atime;
1185 ctime = inode->i_ctime;
1183 time_warp_seq = ci->i_time_warp_seq; 1186 time_warp_seq = ci->i_time_warp_seq;
1184 uid = inode->i_uid; 1187 uid = inode->i_uid;
1185 gid = inode->i_gid; 1188 gid = inode->i_gid;
@@ -1198,7 +1201,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1198 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1201 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1199 op, keep, want, flushing, seq, 1202 op, keep, want, flushing, seq,
1200 flush_tid, oldest_flush_tid, issue_seq, mseq, 1203 flush_tid, oldest_flush_tid, issue_seq, mseq,
1201 size, max_size, &mtime, &atime, time_warp_seq, 1204 size, max_size, &mtime, &atime, &ctime, time_warp_seq,
1202 uid, gid, mode, xattr_version, xattr_blob, 1205 uid, gid, mode, xattr_version, xattr_blob,
1203 follows, inline_data); 1206 follows, inline_data);
1204 if (ret < 0) { 1207 if (ret < 0) {
@@ -1320,7 +1323,7 @@ retry:
1320 capsnap->dirty, 0, capsnap->flush_tid, 0, 1323 capsnap->dirty, 0, capsnap->flush_tid, 0,
1321 0, mseq, capsnap->size, 0, 1324 0, mseq, capsnap->size, 0,
1322 &capsnap->mtime, &capsnap->atime, 1325 &capsnap->mtime, &capsnap->atime,
1323 capsnap->time_warp_seq, 1326 &capsnap->ctime, capsnap->time_warp_seq,
1324 capsnap->uid, capsnap->gid, capsnap->mode, 1327 capsnap->uid, capsnap->gid, capsnap->mode,
1325 capsnap->xattr_version, capsnap->xattr_blob, 1328 capsnap->xattr_version, capsnap->xattr_blob,
1326 capsnap->follows, capsnap->inline_data); 1329 capsnap->follows, capsnap->inline_data);
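The caps hunks thread the inode's ctime through send_cap_msg() so that cap messages carry all three timestamps. On the wire each timestamp is a little-endian 32-bit sec/nsec pair; the encoder the hunk calls looks roughly like this, per the era's include/linux/ceph/decode.h (treat this as a sketch, not a quotation):

static inline void ceph_encode_timespec(struct ceph_timespec *tv,
                                        const struct timespec *ts)
{
        /* ceph wire format: 32-bit little-endian seconds/nanoseconds */
        tv->tv_sec = cpu_to_le32((u32)ts->tv_sec);
        tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec);
}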
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index fd11fb231a2e..fadc243dfb28 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
38 if (dentry->d_fsdata) 38 if (dentry->d_fsdata)
39 return 0; 39 return 0;
40 40
41 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO); 41 di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
42 if (!di) 42 if (!di)
43 return -ENOMEM; /* oh well */ 43 return -ENOMEM; /* oh well */
44 44
@@ -68,23 +68,6 @@ out_unlock:
68 return 0; 68 return 0;
69} 69}
70 70
71struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
72{
73 struct inode *inode = NULL;
74
75 if (!dentry)
76 return NULL;
77
78 spin_lock(&dentry->d_lock);
79 if (!IS_ROOT(dentry)) {
80 inode = d_inode(dentry->d_parent);
81 ihold(inode);
82 }
83 spin_unlock(&dentry->d_lock);
84 return inode;
85}
86
87
88/* 71/*
89 * for readdir, we encode the directory frag and offset within that 72 * for readdir, we encode the directory frag and offset within that
90 * frag into f_pos. 73 * frag into f_pos.
@@ -624,6 +607,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
624 struct ceph_mds_client *mdsc = fsc->mdsc; 607 struct ceph_mds_client *mdsc = fsc->mdsc;
625 struct ceph_mds_request *req; 608 struct ceph_mds_request *req;
626 int op; 609 int op;
610 int mask;
627 int err; 611 int err;
628 612
629 dout("lookup %p dentry %p '%pd'\n", 613 dout("lookup %p dentry %p '%pd'\n",
@@ -666,8 +650,12 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
666 return ERR_CAST(req); 650 return ERR_CAST(req);
667 req->r_dentry = dget(dentry); 651 req->r_dentry = dget(dentry);
668 req->r_num_caps = 2; 652 req->r_num_caps = 2;
669 /* we only need inode linkage */ 653
670 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 654 mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
655 if (ceph_security_xattr_wanted(dir))
656 mask |= CEPH_CAP_XATTR_SHARED;
657 req->r_args.getattr.mask = cpu_to_le32(mask);
658
671 req->r_locked_dir = dir; 659 req->r_locked_dir = dir;
672 err = ceph_mdsc_do_request(mdsc, NULL, req); 660 err = ceph_mdsc_do_request(mdsc, NULL, req);
673 err = ceph_handle_snapdir(req, dentry, err); 661 err = ceph_handle_snapdir(req, dentry, err);
@@ -1095,6 +1083,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
1095static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) 1083static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1096{ 1084{
1097 int valid = 0; 1085 int valid = 0;
1086 struct dentry *parent;
1098 struct inode *dir; 1087 struct inode *dir;
1099 1088
1100 if (flags & LOOKUP_RCU) 1089 if (flags & LOOKUP_RCU)
@@ -1103,7 +1092,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1103 dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, 1092 dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
1104 dentry, d_inode(dentry), ceph_dentry(dentry)->offset); 1093 dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
1105 1094
1106 dir = ceph_get_dentry_parent_inode(dentry); 1095 parent = dget_parent(dentry);
1096 dir = d_inode(parent);
1107 1097
1108 /* always trust cached snapped dentries, snapdir dentry */ 1098 /* always trust cached snapped dentries, snapdir dentry */
1109 if (ceph_snap(dir) != CEPH_NOSNAP) { 1099 if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1121,13 +1111,48 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1121 valid = 1; 1111 valid = 1;
1122 } 1112 }
1123 1113
1114 if (!valid) {
1115 struct ceph_mds_client *mdsc =
1116 ceph_sb_to_client(dir->i_sb)->mdsc;
1117 struct ceph_mds_request *req;
1118 int op, mask, err;
1119
1120 op = ceph_snap(dir) == CEPH_SNAPDIR ?
1121 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
1122 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
1123 if (!IS_ERR(req)) {
1124 req->r_dentry = dget(dentry);
1125 req->r_num_caps = 2;
1126
1127 mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
1128 if (ceph_security_xattr_wanted(dir))
1129 mask |= CEPH_CAP_XATTR_SHARED;
1130 req->r_args.getattr.mask = cpu_to_le32(mask);
1131
1132 req->r_locked_dir = dir;
1133 err = ceph_mdsc_do_request(mdsc, NULL, req);
1134 if (err == 0 || err == -ENOENT) {
1135 if (dentry == req->r_dentry) {
1136 valid = !d_unhashed(dentry);
1137 } else {
1138 d_invalidate(req->r_dentry);
1139 err = -EAGAIN;
1140 }
1141 }
1142 ceph_mdsc_put_request(req);
1143 dout("d_revalidate %p lookup result=%d\n",
1144 dentry, err);
1145 }
1146 }
1147
1124 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1148 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
1125 if (valid) { 1149 if (valid) {
1126 ceph_dentry_lru_touch(dentry); 1150 ceph_dentry_lru_touch(dentry);
1127 } else { 1151 } else {
1128 ceph_dir_clear_complete(dir); 1152 ceph_dir_clear_complete(dir);
1129 } 1153 }
1130 iput(dir); 1154
1155 dput(parent);
1131 return valid; 1156 return valid;
1132} 1157}
1133 1158
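Two things change in the d_revalidate hunks: the open-coded parent helper gives way to the generic dget_parent(), and a failed lease check now falls back to a synchronous LOOKUP/LOOKUPSNAP request instead of simply declaring the dentry invalid. A minimal sketch of the parent-pinning idiom, using only the VFS calls visible above:

        /* dget_parent() takes a reference on the current parent dentry,
         * keeping its inode stable while we validate against it; dput()
         * replaces the iput() that the removed helper required. */
        struct dentry *parent = dget_parent(dentry);
        struct inode *dir = d_inode(parent);

        /* ... check leases, or issue a lookup request against dir ... */

        dput(parent);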
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 3b3172357326..6e72c98162d5 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -71,12 +71,18 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
71 inode = ceph_find_inode(sb, vino); 71 inode = ceph_find_inode(sb, vino);
72 if (!inode) { 72 if (!inode) {
73 struct ceph_mds_request *req; 73 struct ceph_mds_request *req;
74 int mask;
74 75
75 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, 76 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
76 USE_ANY_MDS); 77 USE_ANY_MDS);
77 if (IS_ERR(req)) 78 if (IS_ERR(req))
78 return ERR_CAST(req); 79 return ERR_CAST(req);
79 80
81 mask = CEPH_STAT_CAP_INODE;
82 if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
83 mask |= CEPH_CAP_XATTR_SHARED;
84 req->r_args.getattr.mask = cpu_to_le32(mask);
85
80 req->r_ino1 = vino; 86 req->r_ino1 = vino;
81 req->r_num_caps = 1; 87 req->r_num_caps = 1;
82 err = ceph_mdsc_do_request(mdsc, NULL, req); 88 err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -128,6 +134,7 @@ static struct dentry *__get_parent(struct super_block *sb,
128 struct ceph_mds_request *req; 134 struct ceph_mds_request *req;
129 struct inode *inode; 135 struct inode *inode;
130 struct dentry *dentry; 136 struct dentry *dentry;
137 int mask;
131 int err; 138 int err;
132 139
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, 140 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
@@ -144,6 +151,12 @@ static struct dentry *__get_parent(struct super_block *sb,
144 .snap = CEPH_NOSNAP, 151 .snap = CEPH_NOSNAP,
145 }; 152 };
146 } 153 }
154
155 mask = CEPH_STAT_CAP_INODE;
156 if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
157 mask |= CEPH_CAP_XATTR_SHARED;
158 req->r_args.getattr.mask = cpu_to_le32(mask);
159
147 req->r_num_caps = 1; 160 req->r_num_caps = 1;
148 err = ceph_mdsc_do_request(mdsc, NULL, req); 161 err = ceph_mdsc_do_request(mdsc, NULL, req);
149 inode = req->r_target_inode; 162 inode = req->r_target_inode;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index eb9028e8cfc5..ef38f01c1795 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -157,7 +157,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
157 case S_IFDIR: 157 case S_IFDIR:
158 dout("init_file %p %p 0%o (regular)\n", inode, file, 158 dout("init_file %p %p 0%o (regular)\n", inode, file,
159 inode->i_mode); 159 inode->i_mode);
160 cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO); 160 cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
161 if (cf == NULL) { 161 if (cf == NULL) {
162 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 162 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
163 return -ENOMEM; 163 return -ENOMEM;
@@ -300,6 +300,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
300 struct ceph_mds_request *req; 300 struct ceph_mds_request *req;
301 struct dentry *dn; 301 struct dentry *dn;
302 struct ceph_acls_info acls = {}; 302 struct ceph_acls_info acls = {};
303 int mask;
303 int err; 304 int err;
304 305
305 dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", 306 dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
@@ -335,6 +336,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
335 acls.pagelist = NULL; 336 acls.pagelist = NULL;
336 } 337 }
337 } 338 }
339
340 mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
341 if (ceph_security_xattr_wanted(dir))
342 mask |= CEPH_CAP_XATTR_SHARED;
343 req->r_args.open.mask = cpu_to_le32(mask);
344
338 req->r_locked_dir = dir; /* caller holds dir->i_mutex */ 345 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
339 err = ceph_mdsc_do_request(mdsc, 346 err = ceph_mdsc_do_request(mdsc,
340 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 347 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
@@ -725,7 +732,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
725 ret = ceph_osdc_start_request(req->r_osdc, req, false); 732 ret = ceph_osdc_start_request(req->r_osdc, req, false);
726out: 733out:
727 if (ret < 0) { 734 if (ret < 0) {
728 BUG_ON(ret == -EOLDSNAPC);
729 req->r_result = ret; 735 req->r_result = ret;
730 ceph_aio_complete_req(req, NULL); 736 ceph_aio_complete_req(req, NULL);
731 } 737 }
@@ -783,7 +789,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
783 int num_pages = 0; 789 int num_pages = 0;
784 int flags; 790 int flags;
785 int ret; 791 int ret;
786 struct timespec mtime = CURRENT_TIME; 792 struct timespec mtime = current_fs_time(inode->i_sb);
787 size_t count = iov_iter_count(iter); 793 size_t count = iov_iter_count(iter);
788 loff_t pos = iocb->ki_pos; 794 loff_t pos = iocb->ki_pos;
789 bool write = iov_iter_rw(iter) == WRITE; 795 bool write = iov_iter_rw(iter) == WRITE;
@@ -949,7 +955,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
949 ret = ceph_osdc_start_request(req->r_osdc, 955 ret = ceph_osdc_start_request(req->r_osdc,
950 req, false); 956 req, false);
951 if (ret < 0) { 957 if (ret < 0) {
952 BUG_ON(ret == -EOLDSNAPC);
953 req->r_result = ret; 958 req->r_result = ret;
954 ceph_aio_complete_req(req, NULL); 959 ceph_aio_complete_req(req, NULL);
955 } 960 }
@@ -988,7 +993,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
988 int flags; 993 int flags;
989 int check_caps = 0; 994 int check_caps = 0;
990 int ret; 995 int ret;
991 struct timespec mtime = CURRENT_TIME; 996 struct timespec mtime = current_fs_time(inode->i_sb);
992 size_t count = iov_iter_count(from); 997 size_t count = iov_iter_count(from);
993 998
994 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 999 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
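Besides the kmem_cache_zalloc() cleanup, the write paths replace the CURRENT_TIME macro with current_fs_time(), which honors the superblock's timestamp granularity. A rough equivalent of that helper, assuming the era's definition in fs/inode.c:

        /* current_fs_time(sb): wall-clock time truncated to the
         * filesystem's advertised granularity (sb->s_time_gran, in ns). */
        struct timespec now = current_kernel_time();
        struct timespec mtime = timespec_trunc(now, inode->i_sb->s_time_gran);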
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5849b88bbed3..ed58b168904a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -549,6 +549,10 @@ int ceph_fill_file_size(struct inode *inode, int issued,
549 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || 549 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
550 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { 550 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
551 dout("size %lld -> %llu\n", inode->i_size, size); 551 dout("size %lld -> %llu\n", inode->i_size, size);
552 if (size > 0 && S_ISDIR(inode->i_mode)) {
553 pr_err("fill_file_size non-zero size for directory\n");
554 size = 0;
555 }
552 i_size_write(inode, size); 556 i_size_write(inode, size);
553 inode->i_blocks = (size + (1<<9) - 1) >> 9; 557 inode->i_blocks = (size + (1<<9) - 1) >> 9;
554 ci->i_reported_size = size; 558 ci->i_reported_size = size;
@@ -977,13 +981,8 @@ out_unlock:
977/* 981/*
978 * splice a dentry to an inode. 982 * splice a dentry to an inode.
979 * caller must hold directory i_mutex for this to be safe. 983 * caller must hold directory i_mutex for this to be safe.
980 *
981 * we will only rehash the resulting dentry if @prehash is
982 * true; @prehash will be set to false (for the benefit of
983 * the caller) if we fail.
984 */ 984 */
985static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, 985static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
986 bool *prehash)
987{ 986{
988 struct dentry *realdn; 987 struct dentry *realdn;
989 988
@@ -996,8 +995,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
996 if (IS_ERR(realdn)) { 995 if (IS_ERR(realdn)) {
997 pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", 996 pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
998 PTR_ERR(realdn), dn, in, ceph_vinop(in)); 997 PTR_ERR(realdn), dn, in, ceph_vinop(in));
999 if (prehash)
1000 *prehash = false; /* don't rehash on error */
1001 dn = realdn; /* note realdn contains the error */ 998 dn = realdn; /* note realdn contains the error */
1002 goto out; 999 goto out;
1003 } else if (realdn) { 1000 } else if (realdn) {
@@ -1013,8 +1010,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
1013 dout("dn %p attached to %p ino %llx.%llx\n", 1010 dout("dn %p attached to %p ino %llx.%llx\n",
1014 dn, d_inode(dn), ceph_vinop(d_inode(dn))); 1011 dn, d_inode(dn), ceph_vinop(d_inode(dn)));
1015 } 1012 }
1016 if ((!prehash || *prehash) && d_unhashed(dn))
1017 d_rehash(dn);
1018out: 1013out:
1019 return dn; 1014 return dn;
1020} 1015}
@@ -1247,10 +1242,8 @@ retry_lookup:
1247 dout("d_delete %p\n", dn); 1242 dout("d_delete %p\n", dn);
1248 d_delete(dn); 1243 d_delete(dn);
1249 } else { 1244 } else {
1250 dout("d_instantiate %p NULL\n", dn);
1251 d_instantiate(dn, NULL);
1252 if (have_lease && d_unhashed(dn)) 1245 if (have_lease && d_unhashed(dn))
1253 d_rehash(dn); 1246 d_add(dn, NULL);
1254 update_dentry_lease(dn, rinfo->dlease, 1247 update_dentry_lease(dn, rinfo->dlease,
1255 session, 1248 session,
1256 req->r_request_started); 1249 req->r_request_started);
@@ -1262,7 +1255,7 @@ retry_lookup:
1262 if (d_really_is_negative(dn)) { 1255 if (d_really_is_negative(dn)) {
1263 ceph_dir_clear_ordered(dir); 1256 ceph_dir_clear_ordered(dir);
1264 ihold(in); 1257 ihold(in);
1265 dn = splice_dentry(dn, in, &have_lease); 1258 dn = splice_dentry(dn, in);
1266 if (IS_ERR(dn)) { 1259 if (IS_ERR(dn)) {
1267 err = PTR_ERR(dn); 1260 err = PTR_ERR(dn);
1268 goto done; 1261 goto done;
@@ -1272,6 +1265,7 @@ retry_lookup:
1272 dout(" %p links to %p %llx.%llx, not %llx.%llx\n", 1265 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1273 dn, d_inode(dn), ceph_vinop(d_inode(dn)), 1266 dn, d_inode(dn), ceph_vinop(d_inode(dn)),
1274 ceph_vinop(in)); 1267 ceph_vinop(in));
1268 d_invalidate(dn);
1275 have_lease = false; 1269 have_lease = false;
1276 } 1270 }
1277 1271
@@ -1292,7 +1286,7 @@ retry_lookup:
1292 dout(" linking snapped dir %p to dn %p\n", in, dn); 1286 dout(" linking snapped dir %p to dn %p\n", in, dn);
1293 ceph_dir_clear_ordered(dir); 1287 ceph_dir_clear_ordered(dir);
1294 ihold(in); 1288 ihold(in);
1295 dn = splice_dentry(dn, in, NULL); 1289 dn = splice_dentry(dn, in);
1296 if (IS_ERR(dn)) { 1290 if (IS_ERR(dn)) {
1297 err = PTR_ERR(dn); 1291 err = PTR_ERR(dn);
1298 goto done; 1292 goto done;
@@ -1360,15 +1354,20 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
1360 1354
1361 if (!ctl->page || pgoff != page_index(ctl->page)) { 1355 if (!ctl->page || pgoff != page_index(ctl->page)) {
1362 ceph_readdir_cache_release(ctl); 1356 ceph_readdir_cache_release(ctl);
1363 ctl->page = grab_cache_page(&dir->i_data, pgoff); 1357 if (idx == 0)
1358 ctl->page = grab_cache_page(&dir->i_data, pgoff);
1359 else
1360 ctl->page = find_lock_page(&dir->i_data, pgoff);
1364 if (!ctl->page) { 1361 if (!ctl->page) {
1365 ctl->index = -1; 1362 ctl->index = -1;
1366 return -ENOMEM; 1363 return idx == 0 ? -ENOMEM : 0;
1367 } 1364 }
1368 /* reading/filling the cache are serialized by 1365 /* reading/filling the cache are serialized by
1369 * i_mutex, no need to use page lock */ 1366 * i_mutex, no need to use page lock */
1370 unlock_page(ctl->page); 1367 unlock_page(ctl->page);
1371 ctl->dentries = kmap(ctl->page); 1368 ctl->dentries = kmap(ctl->page);
1369 if (idx == 0)
1370 memset(ctl->dentries, 0, PAGE_CACHE_SIZE);
1372 } 1371 }
1373 1372
1374 if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && 1373 if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
@@ -1391,7 +1390,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1391 struct qstr dname; 1390 struct qstr dname;
1392 struct dentry *dn; 1391 struct dentry *dn;
1393 struct inode *in; 1392 struct inode *in;
1394 int err = 0, ret, i; 1393 int err = 0, skipped = 0, ret, i;
1395 struct inode *snapdir = NULL; 1394 struct inode *snapdir = NULL;
1396 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1395 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1397 struct ceph_dentry_info *di; 1396 struct ceph_dentry_info *di;
@@ -1503,7 +1502,17 @@ retry_lookup:
1503 } 1502 }
1504 1503
1505 if (d_really_is_negative(dn)) { 1504 if (d_really_is_negative(dn)) {
1506 struct dentry *realdn = splice_dentry(dn, in, NULL); 1505 struct dentry *realdn;
1506
1507 if (ceph_security_xattr_deadlock(in)) {
1508 dout(" skip splicing dn %p to inode %p"
1509 " (security xattr deadlock)\n", dn, in);
1510 iput(in);
1511 skipped++;
1512 goto next_item;
1513 }
1514
1515 realdn = splice_dentry(dn, in);
1507 if (IS_ERR(realdn)) { 1516 if (IS_ERR(realdn)) {
1508 err = PTR_ERR(realdn); 1517 err = PTR_ERR(realdn);
1509 d_drop(dn); 1518 d_drop(dn);
@@ -1520,7 +1529,7 @@ retry_lookup:
1520 req->r_session, 1529 req->r_session,
1521 req->r_request_started); 1530 req->r_request_started);
1522 1531
1523 if (err == 0 && cache_ctl.index >= 0) { 1532 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
1524 ret = fill_readdir_cache(d_inode(parent), dn, 1533 ret = fill_readdir_cache(d_inode(parent), dn,
1525 &cache_ctl, req); 1534 &cache_ctl, req);
1526 if (ret < 0) 1535 if (ret < 0)
@@ -1531,7 +1540,7 @@ next_item:
1531 dput(dn); 1540 dput(dn);
1532 } 1541 }
1533out: 1542out:
1534 if (err == 0) { 1543 if (err == 0 && skipped == 0) {
1535 req->r_did_prepopulate = true; 1544 req->r_did_prepopulate = true;
1536 req->r_readdir_cache_idx = cache_ctl.index; 1545 req->r_readdir_cache_idx = cache_ctl.index;
1537 } 1546 }
@@ -1961,7 +1970,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1961 if (dirtied) { 1970 if (dirtied) {
1962 inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, 1971 inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
1963 &prealloc_cf); 1972 &prealloc_cf);
1964 inode->i_ctime = CURRENT_TIME; 1973 inode->i_ctime = current_fs_time(inode->i_sb);
1965 } 1974 }
1966 1975
1967 release &= issued; 1976 release &= issued;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 911d64d865f1..44852c3ae531 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1729,7 +1729,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1729 init_completion(&req->r_safe_completion); 1729 init_completion(&req->r_safe_completion);
1730 INIT_LIST_HEAD(&req->r_unsafe_item); 1730 INIT_LIST_HEAD(&req->r_unsafe_item);
1731 1731
1732 req->r_stamp = CURRENT_TIME; 1732 req->r_stamp = current_fs_time(mdsc->fsc->sb);
1733 1733
1734 req->r_op = op; 1734 req->r_op = op;
1735 req->r_direct_mode = mode; 1735 req->r_direct_mode = mode;
@@ -2540,6 +2540,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2540 2540
2541 /* insert trace into our cache */ 2541 /* insert trace into our cache */
2542 mutex_lock(&req->r_fill_mutex); 2542 mutex_lock(&req->r_fill_mutex);
2543 current->journal_info = req;
2543 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2544 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2544 if (err == 0) { 2545 if (err == 0) {
2545 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || 2546 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
@@ -2547,6 +2548,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2547 ceph_readdir_prepopulate(req, req->r_session); 2548 ceph_readdir_prepopulate(req, req->r_session);
2548 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2549 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2549 } 2550 }
2551 current->journal_info = NULL;
2550 mutex_unlock(&req->r_fill_mutex); 2552 mutex_unlock(&req->r_fill_mutex);
2551 2553
2552 up_read(&mdsc->snap_rwsem); 2554 up_read(&mdsc->snap_rwsem);
@@ -3764,7 +3766,6 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3764 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3766 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3765 3767
3766 /* do we need it? */ 3768 /* do we need it? */
3767 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3768 mutex_lock(&mdsc->mutex); 3769 mutex_lock(&mdsc->mutex);
3769 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3770 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3770 dout("handle_map epoch %u <= our %u\n", 3771 dout("handle_map epoch %u <= our %u\n",
@@ -3791,6 +3792,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3791 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3792 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3792 3793
3793 __wake_requests(mdsc, &mdsc->waiting_for_map); 3794 __wake_requests(mdsc, &mdsc->waiting_for_map);
3795 ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
3796 mdsc->mdsmap->m_epoch);
3794 3797
3795 mutex_unlock(&mdsc->mutex); 3798 mutex_unlock(&mdsc->mutex);
3796 schedule_delayed(mdsc); 3799 schedule_delayed(mdsc);
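handle_reply() now brackets trace processing with assignments to current->journal_info, a field ceph does not otherwise use, borrowing it as a per-task marker: code running underneath ceph_fill_trace(), notably security-module xattr hooks, can detect the in-trace context and consult the request instead of issuing a nested MDS call. A hypothetical checker in that style (the helper name is illustrative; the real consumers are in the xattr.c hunks below):

        /* Hypothetical sketch: are we filling this inode's trace right now? */
        static bool ceph_in_trace_for(struct inode *in)
        {
                struct ceph_mds_request *req = current->journal_info;

                return req && req->r_target_inode == in;
        }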
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4aa7122a8d38..9caaa7ffc93f 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -296,8 +296,6 @@ static int cmpu64_rev(const void *a, const void *b)
296} 296}
297 297
298 298
299struct ceph_snap_context *ceph_empty_snapc;
300
301/* 299/*
302 * build the snap context for a given realm. 300 * build the snap context for a given realm.
303 */ 301 */
@@ -987,17 +985,3 @@ out:
987 up_write(&mdsc->snap_rwsem); 985 up_write(&mdsc->snap_rwsem);
988 return; 986 return;
989} 987}
990
991int __init ceph_snap_init(void)
992{
993 ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
994 if (!ceph_empty_snapc)
995 return -ENOMEM;
996 ceph_empty_snapc->seq = 1;
997 return 0;
998}
999
1000void ceph_snap_exit(void)
1001{
1002 ceph_put_snap_context(ceph_empty_snapc);
1003}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index ca4d5e8457f1..c973043deb0e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -439,8 +439,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
439 439
440 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) 440 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
441 seq_puts(m, ",dirstat"); 441 seq_puts(m, ",dirstat");
442 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) 442 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
443 seq_puts(m, ",norbytes"); 443 seq_puts(m, ",rbytes");
444 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) 444 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
445 seq_puts(m, ",noasyncreaddir"); 445 seq_puts(m, ",noasyncreaddir");
446 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) 446 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
@@ -530,7 +530,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
530 goto fail; 530 goto fail;
531 } 531 }
532 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 532 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
533 fsc->client->monc.want_mdsmap = 1; 533 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
534 534
535 fsc->mount_options = fsopt; 535 fsc->mount_options = fsopt;
536 536
@@ -793,22 +793,20 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
793 struct dentry *root; 793 struct dentry *root;
794 int first = 0; /* first vfsmount for this super_block */ 794 int first = 0; /* first vfsmount for this super_block */
795 795
796 dout("mount start\n"); 796 dout("mount start %p\n", fsc);
797 mutex_lock(&fsc->client->mount_mutex); 797 mutex_lock(&fsc->client->mount_mutex);
798 798
799 err = __ceph_open_session(fsc->client, started); 799 if (!fsc->sb->s_root) {
800 if (err < 0) 800 err = __ceph_open_session(fsc->client, started);
801 goto out; 801 if (err < 0)
802 goto out;
802 803
803 dout("mount opening root\n"); 804 dout("mount opening root\n");
804 root = open_root_dentry(fsc, "", started); 805 root = open_root_dentry(fsc, "", started);
805 if (IS_ERR(root)) { 806 if (IS_ERR(root)) {
806 err = PTR_ERR(root); 807 err = PTR_ERR(root);
807 goto out; 808 goto out;
808 } 809 }
809 if (fsc->sb->s_root) {
810 dput(root);
811 } else {
812 fsc->sb->s_root = root; 810 fsc->sb->s_root = root;
813 first = 1; 811 first = 1;
814 812
@@ -818,6 +816,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
818 } 816 }
819 817
820 if (path[0] == 0) { 818 if (path[0] == 0) {
819 root = fsc->sb->s_root;
821 dget(root); 820 dget(root);
822 } else { 821 } else {
823 dout("mount opening base mountpoint\n"); 822 dout("mount opening base mountpoint\n");
@@ -833,16 +832,14 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
833 mutex_unlock(&fsc->client->mount_mutex); 832 mutex_unlock(&fsc->client->mount_mutex);
834 return root; 833 return root;
835 834
836out:
837 mutex_unlock(&fsc->client->mount_mutex);
838 return ERR_PTR(err);
839
840fail: 835fail:
841 if (first) { 836 if (first) {
842 dput(fsc->sb->s_root); 837 dput(fsc->sb->s_root);
843 fsc->sb->s_root = NULL; 838 fsc->sb->s_root = NULL;
844 } 839 }
845 goto out; 840out:
841 mutex_unlock(&fsc->client->mount_mutex);
842 return ERR_PTR(err);
846} 843}
847 844
848static int ceph_set_super(struct super_block *s, void *data) 845static int ceph_set_super(struct super_block *s, void *data)
@@ -1042,19 +1039,14 @@ static int __init init_ceph(void)
1042 1039
1043 ceph_flock_init(); 1040 ceph_flock_init();
1044 ceph_xattr_init(); 1041 ceph_xattr_init();
1045 ret = ceph_snap_init();
1046 if (ret)
1047 goto out_xattr;
1048 ret = register_filesystem(&ceph_fs_type); 1042 ret = register_filesystem(&ceph_fs_type);
1049 if (ret) 1043 if (ret)
1050 goto out_snap; 1044 goto out_xattr;
1051 1045
1052 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); 1046 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1053 1047
1054 return 0; 1048 return 0;
1055 1049
1056out_snap:
1057 ceph_snap_exit();
1058out_xattr: 1050out_xattr:
1059 ceph_xattr_exit(); 1051 ceph_xattr_exit();
1060 destroy_caches(); 1052 destroy_caches();
@@ -1066,7 +1058,6 @@ static void __exit exit_ceph(void)
1066{ 1058{
1067 dout("exit_ceph\n"); 1059 dout("exit_ceph\n");
1068 unregister_filesystem(&ceph_fs_type); 1060 unregister_filesystem(&ceph_fs_type);
1069 ceph_snap_exit();
1070 ceph_xattr_exit(); 1061 ceph_xattr_exit();
1071 destroy_caches(); 1062 destroy_caches();
1072} 1063}
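The mdsmap plumbing moves to the generic monitor subscription API: interest is declared once at client creation, and each map is acknowledged by epoch only after it has been installed under mdsc->mutex (the removed ceph_monc_got_mdsmap() call acknowledged before the map was even decoded). The shape of the handshake, using just the two calls visible in the hunks:

        /* at client setup: subscribe to the mdsmap, from epoch 0, continuously */
        ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);

        /* in the map handler, once the new map is committed */
        ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
                          mdsc->mdsmap->m_epoch);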
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 9c458eb52245..e705c4d612d7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -37,8 +37,7 @@
37#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ 37#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
38#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ 38#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
39 39
40#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ 40#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
41 CEPH_MOUNT_OPT_DCACHE)
42 41
43#define ceph_set_mount_opt(fsc, opt) \ 42#define ceph_set_mount_opt(fsc, opt) \
44 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; 43 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
@@ -469,7 +468,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
469#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ 468#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */
470#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 469#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
471#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 470#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
472 471#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
473 472
474static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, 473static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
475 long long release_count, 474 long long release_count,
@@ -721,7 +720,6 @@ static inline int default_congestion_kb(void)
721 720
722 721
723/* snap.c */ 722/* snap.c */
724extern struct ceph_snap_context *ceph_empty_snapc;
725struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 723struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
726 u64 ino); 724 u64 ino);
727extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, 725extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
@@ -738,8 +736,6 @@ extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
738extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 736extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
739 struct ceph_cap_snap *capsnap); 737 struct ceph_cap_snap *capsnap);
740extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 738extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
741extern int ceph_snap_init(void);
742extern void ceph_snap_exit(void);
743 739
744/* 740/*
745 * a cap_snap is "pending" if it is still awaiting an in-progress 741 * a cap_snap is "pending" if it is still awaiting an in-progress
@@ -808,6 +804,20 @@ extern void __init ceph_xattr_init(void);
808extern void ceph_xattr_exit(void); 804extern void ceph_xattr_exit(void);
809extern const struct xattr_handler *ceph_xattr_handlers[]; 805extern const struct xattr_handler *ceph_xattr_handlers[];
810 806
807#ifdef CONFIG_SECURITY
808extern bool ceph_security_xattr_deadlock(struct inode *in);
809extern bool ceph_security_xattr_wanted(struct inode *in);
810#else
811static inline bool ceph_security_xattr_deadlock(struct inode *in)
812{
813 return false;
814}
815static inline bool ceph_security_xattr_wanted(struct inode *in)
816{
817 return false;
818}
819#endif
820
811/* acl.c */ 821/* acl.c */
812struct ceph_acls_info { 822struct ceph_acls_info {
813 void *default_acl; 823 void *default_acl;
@@ -947,7 +957,6 @@ extern void ceph_dentry_lru_touch(struct dentry *dn);
947extern void ceph_dentry_lru_del(struct dentry *dn); 957extern void ceph_dentry_lru_del(struct dentry *dn);
948extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 958extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
949extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); 959extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
950extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
951extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); 960extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
952 961
953/* 962/*
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 819163d8313b..9410abdef3ce 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -714,31 +714,62 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
714 } 714 }
715} 715}
716 716
717static inline int __get_request_mask(struct inode *in) {
718 struct ceph_mds_request *req = current->journal_info;
719 int mask = 0;
720 if (req && req->r_target_inode == in) {
721 if (req->r_op == CEPH_MDS_OP_LOOKUP ||
722 req->r_op == CEPH_MDS_OP_LOOKUPINO ||
723 req->r_op == CEPH_MDS_OP_LOOKUPPARENT ||
724 req->r_op == CEPH_MDS_OP_GETATTR) {
725 mask = le32_to_cpu(req->r_args.getattr.mask);
726 } else if (req->r_op == CEPH_MDS_OP_OPEN ||
727 req->r_op == CEPH_MDS_OP_CREATE) {
728 mask = le32_to_cpu(req->r_args.open.mask);
729 }
730 }
731 return mask;
732}
733
717ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, 734ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
718 size_t size) 735 size_t size)
719{ 736{
720 struct ceph_inode_info *ci = ceph_inode(inode); 737 struct ceph_inode_info *ci = ceph_inode(inode);
721 int err;
722 struct ceph_inode_xattr *xattr; 738 struct ceph_inode_xattr *xattr;
723 struct ceph_vxattr *vxattr = NULL; 739 struct ceph_vxattr *vxattr = NULL;
740 int req_mask;
741 int err;
724 742
725 if (!ceph_is_valid_xattr(name)) 743 if (!ceph_is_valid_xattr(name))
726 return -ENODATA; 744 return -ENODATA;
727 745
728 /* let's see if a virtual xattr was requested */ 746 /* let's see if a virtual xattr was requested */
729 vxattr = ceph_match_vxattr(inode, name); 747 vxattr = ceph_match_vxattr(inode, name);
730 if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { 748 if (vxattr) {
731 err = vxattr->getxattr_cb(ci, value, size); 749 err = -ENODATA;
750 if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
751 err = vxattr->getxattr_cb(ci, value, size);
732 return err; 752 return err;
733 } 753 }
734 754
755 req_mask = __get_request_mask(inode);
756
735 spin_lock(&ci->i_ceph_lock); 757 spin_lock(&ci->i_ceph_lock);
736 dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 758 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
737 ci->i_xattrs.version, ci->i_xattrs.index_version); 759 ci->i_xattrs.version, ci->i_xattrs.index_version);
738 760
739 if (ci->i_xattrs.version == 0 || 761 if (ci->i_xattrs.version == 0 ||
740 !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { 762 !((req_mask & CEPH_CAP_XATTR_SHARED) ||
763 __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) {
741 spin_unlock(&ci->i_ceph_lock); 764 spin_unlock(&ci->i_ceph_lock);
765
766 /* security module gets xattr while filling trace */
767 if (current->journal_info != NULL) {
768 pr_warn_ratelimited("sync getxattr %p "
769 "during filling trace\n", inode);
770 return -EBUSY;
771 }
772
742 /* get xattrs from mds (if we don't already have them) */ 773 /* get xattrs from mds (if we don't already have them) */
743 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); 774 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
744 if (err) 775 if (err)
@@ -765,6 +796,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
765 796
766 memcpy(value, xattr->val, xattr->val_len); 797 memcpy(value, xattr->val, xattr->val_len);
767 798
799 if (current->journal_info != NULL &&
800 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
801 ci->i_ceph_flags |= CEPH_I_SEC_INITED;
768out: 802out:
769 spin_unlock(&ci->i_ceph_lock); 803 spin_unlock(&ci->i_ceph_lock);
770 return err; 804 return err;
@@ -999,7 +1033,7 @@ retry:
999 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, 1033 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
1000 &prealloc_cf); 1034 &prealloc_cf);
1001 ci->i_xattrs.dirty = true; 1035 ci->i_xattrs.dirty = true;
1002 inode->i_ctime = CURRENT_TIME; 1036 inode->i_ctime = current_fs_time(inode->i_sb);
1003 } 1037 }
1004 1038
1005 spin_unlock(&ci->i_ceph_lock); 1039 spin_unlock(&ci->i_ceph_lock);
@@ -1015,7 +1049,15 @@ do_sync:
1015do_sync_unlocked: 1049do_sync_unlocked:
1016 if (lock_snap_rwsem) 1050 if (lock_snap_rwsem)
1017 up_read(&mdsc->snap_rwsem); 1051 up_read(&mdsc->snap_rwsem);
1018 err = ceph_sync_setxattr(dentry, name, value, size, flags); 1052
1053 /* security module set xattr while filling trace */
1054 if (current->journal_info != NULL) {
1055 pr_warn_ratelimited("sync setxattr %p "
1056 "during filling trace\n", inode);
1057 err = -EBUSY;
1058 } else {
1059 err = ceph_sync_setxattr(dentry, name, value, size, flags);
1060 }
1019out: 1061out:
1020 ceph_free_cap_flush(prealloc_cf); 1062 ceph_free_cap_flush(prealloc_cf);
1021 kfree(newname); 1063 kfree(newname);
@@ -1136,7 +1178,7 @@ retry:
1136 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, 1178 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
1137 &prealloc_cf); 1179 &prealloc_cf);
1138 ci->i_xattrs.dirty = true; 1180 ci->i_xattrs.dirty = true;
1139 inode->i_ctime = CURRENT_TIME; 1181 inode->i_ctime = current_fs_time(inode->i_sb);
1140 spin_unlock(&ci->i_ceph_lock); 1182 spin_unlock(&ci->i_ceph_lock);
1141 if (lock_snap_rwsem) 1183 if (lock_snap_rwsem)
1142 up_read(&mdsc->snap_rwsem); 1184 up_read(&mdsc->snap_rwsem);
@@ -1164,3 +1206,25 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
1164 1206
1165 return __ceph_removexattr(dentry, name); 1207 return __ceph_removexattr(dentry, name);
1166} 1208}
1209
1210#ifdef CONFIG_SECURITY
1211bool ceph_security_xattr_wanted(struct inode *in)
1212{
1213 return in->i_security != NULL;
1214}
1215
1216bool ceph_security_xattr_deadlock(struct inode *in)
1217{
1218 struct ceph_inode_info *ci;
1219 bool ret;
1220 if (in->i_security == NULL)
1221 return false;
1222 ci = ceph_inode(in);
1223 spin_lock(&ci->i_ceph_lock);
1224 ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) &&
1225 !(ci->i_xattrs.version > 0 &&
1226 __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0));
1227 spin_unlock(&ci->i_ceph_lock);
1228 return ret;
1229}
1230#endif
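Taken together with the mds_client.c marker, the getxattr/setxattr guards break a recursion: while ceph_fill_trace() instantiates a dentry, the security module may fetch an xattr on the very inode being filled, and a synchronous MDS getattr from that context could deadlock on the reply path. Roughly, assuming an LSM that initializes inode security from ->getxattr (the xattr name is only an example):

        /*
         * handle_reply()
         *   current->journal_info = req;
         *   ceph_fill_trace()
         *     splice_dentry() -> d_splice_alias()
         *       security_d_instantiate()
         *         __ceph_getxattr(inode, "security.selinux", ...)
         *           - served locally when the originating request asked for
         *             CEPH_CAP_XATTR_SHARED (see __get_request_mask()),
         *           - otherwise fails fast with -EBUSY rather than issuing
         *             a nested, deadlock-prone ceph_do_getattr().
         */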
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 50b268483302..788e19195991 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -255,7 +255,6 @@ static const struct file_operations cifs_debug_data_proc_fops = {
255static ssize_t cifs_stats_proc_write(struct file *file, 255static ssize_t cifs_stats_proc_write(struct file *file,
256 const char __user *buffer, size_t count, loff_t *ppos) 256 const char __user *buffer, size_t count, loff_t *ppos)
257{ 257{
258 char c;
259 bool bv; 258 bool bv;
260 int rc; 259 int rc;
261 struct list_head *tmp1, *tmp2, *tmp3; 260 struct list_head *tmp1, *tmp2, *tmp3;
@@ -263,11 +262,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
263 struct cifs_ses *ses; 262 struct cifs_ses *ses;
264 struct cifs_tcon *tcon; 263 struct cifs_tcon *tcon;
265 264
266 rc = get_user(c, buffer); 265 rc = kstrtobool_from_user(buffer, count, &bv);
267 if (rc) 266 if (rc == 0) {
268 return rc;
269
270 if (strtobool(&c, &bv) == 0) {
271#ifdef CONFIG_CIFS_STATS2 267#ifdef CONFIG_CIFS_STATS2
272 atomic_set(&totBufAllocCount, 0); 268 atomic_set(&totBufAllocCount, 0);
273 atomic_set(&totSmBufAllocCount, 0); 269 atomic_set(&totSmBufAllocCount, 0);
@@ -290,6 +286,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
290 } 286 }
291 } 287 }
292 spin_unlock(&cifs_tcp_ses_lock); 288 spin_unlock(&cifs_tcp_ses_lock);
289 } else {
290 return rc;
293 } 291 }
294 292
295 return count; 293 return count;
@@ -433,17 +431,17 @@ static int cifsFYI_proc_open(struct inode *inode, struct file *file)
433static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, 431static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
434 size_t count, loff_t *ppos) 432 size_t count, loff_t *ppos)
435{ 433{
436 char c; 434 char c[2] = { '\0' };
437 bool bv; 435 bool bv;
438 int rc; 436 int rc;
439 437
440 rc = get_user(c, buffer); 438 rc = get_user(c[0], buffer);
441 if (rc) 439 if (rc)
442 return rc; 440 return rc;
443 if (strtobool(&c, &bv) == 0) 441 if (strtobool(c, &bv) == 0)
444 cifsFYI = bv; 442 cifsFYI = bv;
445 else if ((c > '1') && (c <= '9')) 443 else if ((c[0] > '1') && (c[0] <= '9'))
446 cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */ 444 cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */
447 445
448 return count; 446 return count;
449} 447}
@@ -471,20 +469,12 @@ static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file)
471static ssize_t cifs_linux_ext_proc_write(struct file *file, 469static ssize_t cifs_linux_ext_proc_write(struct file *file,
472 const char __user *buffer, size_t count, loff_t *ppos) 470 const char __user *buffer, size_t count, loff_t *ppos)
473{ 471{
474 char c;
475 bool bv;
476 int rc; 472 int rc;
477 473
478 rc = get_user(c, buffer); 474 rc = kstrtobool_from_user(buffer, count, &linuxExtEnabled);
479 if (rc) 475 if (rc)
480 return rc; 476 return rc;
481 477
482 rc = strtobool(&c, &bv);
483 if (rc)
484 return rc;
485
486 linuxExtEnabled = bv;
487
488 return count; 478 return count;
489} 479}
490 480
@@ -511,20 +501,12 @@ static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file)
511static ssize_t cifs_lookup_cache_proc_write(struct file *file, 501static ssize_t cifs_lookup_cache_proc_write(struct file *file,
512 const char __user *buffer, size_t count, loff_t *ppos) 502 const char __user *buffer, size_t count, loff_t *ppos)
513{ 503{
514 char c;
515 bool bv;
516 int rc; 504 int rc;
517 505
518 rc = get_user(c, buffer); 506 rc = kstrtobool_from_user(buffer, count, &lookupCacheEnabled);
519 if (rc) 507 if (rc)
520 return rc; 508 return rc;
521 509
522 rc = strtobool(&c, &bv);
523 if (rc)
524 return rc;
525
526 lookupCacheEnabled = bv;
527
528 return count; 510 return count;
529} 511}
530 512
@@ -551,20 +533,12 @@ static int traceSMB_proc_open(struct inode *inode, struct file *file)
551static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, 533static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
552 size_t count, loff_t *ppos) 534 size_t count, loff_t *ppos)
553{ 535{
554 char c;
555 bool bv;
556 int rc; 536 int rc;
557 537
558 rc = get_user(c, buffer); 538 rc = kstrtobool_from_user(buffer, count, &traceSMB);
559 if (rc) 539 if (rc)
560 return rc; 540 return rc;
561 541
562 rc = strtobool(&c, &bv);
563 if (rc)
564 return rc;
565
566 traceSMB = bv;
567
568 return count; 542 return count;
569} 543}
570 544
@@ -622,7 +596,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
622 int rc; 596 int rc;
623 unsigned int flags; 597 unsigned int flags;
624 char flags_string[12]; 598 char flags_string[12];
625 char c;
626 bool bv; 599 bool bv;
627 600
628 if ((count < 1) || (count > 11)) 601 if ((count < 1) || (count > 11))
@@ -635,11 +608,10 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
635 608
636 if (count < 3) { 609 if (count < 3) {
637 /* single char or single char followed by null */ 610 /* single char or single char followed by null */
638 c = flags_string[0]; 611 if (strtobool(flags_string, &bv) == 0) {
639 if (strtobool(&c, &bv) == 0) {
640 global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF; 612 global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF;
641 return count; 613 return count;
642 } else if (!isdigit(c)) { 614 } else if (!isdigit(flags_string[0])) {
643 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", 615 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
644 flags_string); 616 flags_string);
645 return -EINVAL; 617 return -EINVAL;
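The cifs /proc writers converge on kstrtobool_from_user(), which copies from the user buffer and parses 1/0, y/n and on/off in a single call; only cifsFYI keeps an open-coded path because it additionally accepts the digits 2-9 as verbosity levels. A minimal sketch of the resulting pattern, with my_flag as a hypothetical module-level bool:

        static ssize_t my_flag_proc_write(struct file *file,
                                          const char __user *buffer,
                                          size_t count, loff_t *ppos)
        {
                /* replaces the old get_user() + strtobool() pair */
                int rc = kstrtobool_from_user(buffer, count, &my_flag);

                if (rc)
                        return rc;
                return count;
        }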
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 66cf0f9fff89..c611ca2339d7 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -25,7 +25,7 @@
25void cifs_dump_mem(char *label, void *data, int length); 25void cifs_dump_mem(char *label, void *data, int length);
26void cifs_dump_detail(void *); 26void cifs_dump_detail(void *);
27void cifs_dump_mids(struct TCP_Server_Info *); 27void cifs_dump_mids(struct TCP_Server_Info *);
28extern int traceSMB; /* flag which enables the function below */ 28extern bool traceSMB; /* flag which enables the function below */
29void dump_smb(void *, int); 29void dump_smb(void *, int);
30#define CIFS_INFO 0x01 30#define CIFS_INFO 0x01
31#define CIFS_RC 0x02 31#define CIFS_RC 0x02
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e682b36a210f..4897dacf8944 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -33,6 +33,7 @@
33#include <linux/ctype.h> 33#include <linux/ctype.h>
34#include <linux/random.h> 34#include <linux/random.h>
35#include <linux/highmem.h> 35#include <linux/highmem.h>
36#include <crypto/skcipher.h>
36 37
37static int 38static int
38cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server) 39cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
@@ -789,38 +790,46 @@ int
789calc_seckey(struct cifs_ses *ses) 790calc_seckey(struct cifs_ses *ses)
790{ 791{
791 int rc; 792 int rc;
792 struct crypto_blkcipher *tfm_arc4; 793 struct crypto_skcipher *tfm_arc4;
793 struct scatterlist sgin, sgout; 794 struct scatterlist sgin, sgout;
794 struct blkcipher_desc desc; 795 struct skcipher_request *req;
795 unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */ 796 unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
796 797
797 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); 798 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
798 799
799 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); 800 tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
800 if (IS_ERR(tfm_arc4)) { 801 if (IS_ERR(tfm_arc4)) {
801 rc = PTR_ERR(tfm_arc4); 802 rc = PTR_ERR(tfm_arc4);
802 cifs_dbg(VFS, "could not allocate crypto API arc4\n"); 803 cifs_dbg(VFS, "could not allocate crypto API arc4\n");
803 return rc; 804 return rc;
804 } 805 }
805 806
806 desc.tfm = tfm_arc4; 807 rc = crypto_skcipher_setkey(tfm_arc4, ses->auth_key.response,
807
808 rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
809 CIFS_SESS_KEY_SIZE); 808 CIFS_SESS_KEY_SIZE);
810 if (rc) { 809 if (rc) {
811 cifs_dbg(VFS, "%s: Could not set response as a key\n", 810 cifs_dbg(VFS, "%s: Could not set response as a key\n",
812 __func__); 811 __func__);
813 return rc; 812 goto out_free_cipher;
813 }
814
815 req = skcipher_request_alloc(tfm_arc4, GFP_KERNEL);
816 if (!req) {
817 rc = -ENOMEM;
818 cifs_dbg(VFS, "could not allocate crypto API arc4 request\n");
819 goto out_free_cipher;
814 } 820 }
815 821
816 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); 822 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
817 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); 823 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
818 824
819 rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); 825 skcipher_request_set_callback(req, 0, NULL, NULL);
826 skcipher_request_set_crypt(req, &sgin, &sgout, CIFS_CPHTXT_SIZE, NULL);
827
828 rc = crypto_skcipher_encrypt(req);
829 skcipher_request_free(req);
820 if (rc) { 830 if (rc) {
821 cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc); 831 cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc);
822 crypto_free_blkcipher(tfm_arc4); 832 goto out_free_cipher;
823 return rc;
824 } 833 }
825 834
826 /* make secondary_key/nonce as session key */ 835 /* make secondary_key/nonce as session key */
@@ -828,7 +837,8 @@ calc_seckey(struct cifs_ses *ses)
828 /* and make len as that of session key only */ 837 /* and make len as that of session key only */
829 ses->auth_key.len = CIFS_SESS_KEY_SIZE; 838 ses->auth_key.len = CIFS_SESS_KEY_SIZE;
830 839
831 crypto_free_blkcipher(tfm_arc4); 840out_free_cipher:
841 crypto_free_skcipher(tfm_arc4);
832 842
833 return rc; 843 return rc;
834} 844}
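calc_seckey() is converted from the deprecated blkcipher interface to the skcipher API; passing CRYPTO_ALG_ASYNC in the mask requests a synchronous implementation, so no completion callback is needed. A condensed, self-contained sketch of the same sequence follows (the helper name is hypothetical and error handling is trimmed to the essentials); the smbencrypt.c hunk below applies the identical transformation to its DES helper:

        #include <crypto/skcipher.h>
        #include <linux/scatterlist.h>

        static int ecb_arc4_encrypt_once(const u8 *key, unsigned int keylen,
                                         struct scatterlist *src,
                                         struct scatterlist *dst,
                                         unsigned int len)
        {
                struct crypto_skcipher *tfm;
                struct skcipher_request *req;
                int rc;

                tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
                if (IS_ERR(tfm))
                        return PTR_ERR(tfm);

                rc = crypto_skcipher_setkey(tfm, key, keylen);
                if (rc)
                        goto out_free_tfm;

                req = skcipher_request_alloc(tfm, GFP_KERNEL);
                if (!req) {
                        rc = -ENOMEM;
                        goto out_free_tfm;
                }

                /* synchronous tfm: no completion callback needed */
                skcipher_request_set_callback(req, 0, NULL, NULL);
                /* ECB takes no IV, hence the NULL */
                skcipher_request_set_crypt(req, src, dst, len, NULL);

                rc = crypto_skcipher_encrypt(req);
                skcipher_request_free(req);
        out_free_tfm:
                crypto_free_skcipher(tfm);
                return rc;
        }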
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2eea40353e60..1d86fc620e5c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -54,10 +54,10 @@
54#endif 54#endif
55 55
56int cifsFYI = 0; 56int cifsFYI = 0;
57int traceSMB = 0; 57bool traceSMB;
58bool enable_oplocks = true; 58bool enable_oplocks = true;
59unsigned int linuxExtEnabled = 1; 59bool linuxExtEnabled = true;
60unsigned int lookupCacheEnabled = 1; 60bool lookupCacheEnabled = true;
61unsigned int global_secflags = CIFSSEC_DEF; 61unsigned int global_secflags = CIFSSEC_DEF;
62/* unsigned int ntlmv2_support = 0; */ 62/* unsigned int ntlmv2_support = 0; */
63unsigned int sign_CIFS_PDUs = 1; 63unsigned int sign_CIFS_PDUs = 1;
@@ -642,9 +642,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
642 while (*s && *s != sep) 642 while (*s && *s != sep)
643 s++; 643 s++;
644 644
645 inode_lock(dir); 645 child = lookup_one_len_unlocked(p, dentry, s - p);
646 child = lookup_one_len(p, dentry, s - p);
647 inode_unlock(dir);
648 dput(dentry); 646 dput(dentry);
649 dentry = child; 647 dentry = child;
650 } while (!IS_ERR(dentry)); 648 } while (!IS_ERR(dentry));
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a25b2513f146..d21da9f05bae 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1596,11 +1596,11 @@ GLOBAL_EXTERN atomic_t midCount;
1596 1596
1597/* Misc globals */ 1597/* Misc globals */
1598GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */ 1598GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
1599GLOBAL_EXTERN unsigned int lookupCacheEnabled; 1599GLOBAL_EXTERN bool lookupCacheEnabled;
1600GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent 1600GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
1601 with more secure ntlmssp2 challenge/resp */ 1601 with more secure ntlmssp2 challenge/resp */
1602GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ 1602GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
1603GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ 1603GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
1604GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */ 1604GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */
1605GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ 1605GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
1606GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 1606GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index a4232ec4f2ba..699b7868108f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -23,6 +23,7 @@
23 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 23 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24*/ 24*/
25 25
26#include <crypto/skcipher.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
28#include <linux/fs.h> 29#include <linux/fs.h>
@@ -70,31 +71,42 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
70{ 71{
71 int rc; 72 int rc;
72 unsigned char key2[8]; 73 unsigned char key2[8];
73 struct crypto_blkcipher *tfm_des; 74 struct crypto_skcipher *tfm_des;
74 struct scatterlist sgin, sgout; 75 struct scatterlist sgin, sgout;
75 struct blkcipher_desc desc; 76 struct skcipher_request *req;
76 77
77 str_to_key(key, key2); 78 str_to_key(key, key2);
78 79
79 tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); 80 tfm_des = crypto_alloc_skcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
80 if (IS_ERR(tfm_des)) { 81 if (IS_ERR(tfm_des)) {
81 rc = PTR_ERR(tfm_des); 82 rc = PTR_ERR(tfm_des);
82 cifs_dbg(VFS, "could not allocate des crypto API\n"); 83 cifs_dbg(VFS, "could not allocate des crypto API\n");
83 goto smbhash_err; 84 goto smbhash_err;
84 } 85 }
85 86
86 desc.tfm = tfm_des; 87 req = skcipher_request_alloc(tfm_des, GFP_KERNEL);
88 if (!req) {
89 rc = -ENOMEM;
90 cifs_dbg(VFS, "could not allocate des crypto API\n");
91 goto smbhash_free_skcipher;
92 }
87 93
88 crypto_blkcipher_setkey(tfm_des, key2, 8); 94 crypto_skcipher_setkey(tfm_des, key2, 8);
89 95
90 sg_init_one(&sgin, in, 8); 96 sg_init_one(&sgin, in, 8);
91 sg_init_one(&sgout, out, 8); 97 sg_init_one(&sgout, out, 8);
92 98
93 rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); 99 skcipher_request_set_callback(req, 0, NULL, NULL);
100 skcipher_request_set_crypt(req, &sgin, &sgout, 8, NULL);
101
102 rc = crypto_skcipher_encrypt(req);
94 if (rc) 103 if (rc)
95 cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc); 104 cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc);
96 105
97 crypto_free_blkcipher(tfm_des); 106 skcipher_request_free(req);
107
108smbhash_free_skcipher:
109 crypto_free_skcipher(tfm_des);
98smbhash_err: 110smbhash_err:
99 return rc; 111 return rc;
100} 112}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6402eaf8ab95..bd01b92aad98 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1040,28 +1040,6 @@ COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
1040/* PPPOX */ 1040/* PPPOX */
1041COMPATIBLE_IOCTL(PPPOEIOCSFWD) 1041COMPATIBLE_IOCTL(PPPOEIOCSFWD)
1042COMPATIBLE_IOCTL(PPPOEIOCDFWD) 1042COMPATIBLE_IOCTL(PPPOEIOCDFWD)
1043/* ppdev */
1044COMPATIBLE_IOCTL(PPSETMODE)
1045COMPATIBLE_IOCTL(PPRSTATUS)
1046COMPATIBLE_IOCTL(PPRCONTROL)
1047COMPATIBLE_IOCTL(PPWCONTROL)
1048COMPATIBLE_IOCTL(PPFCONTROL)
1049COMPATIBLE_IOCTL(PPRDATA)
1050COMPATIBLE_IOCTL(PPWDATA)
1051COMPATIBLE_IOCTL(PPCLAIM)
1052COMPATIBLE_IOCTL(PPRELEASE)
1053COMPATIBLE_IOCTL(PPYIELD)
1054COMPATIBLE_IOCTL(PPEXCL)
1055COMPATIBLE_IOCTL(PPDATADIR)
1056COMPATIBLE_IOCTL(PPNEGOT)
1057COMPATIBLE_IOCTL(PPWCTLONIRQ)
1058COMPATIBLE_IOCTL(PPCLRIRQ)
1059COMPATIBLE_IOCTL(PPSETPHASE)
1060COMPATIBLE_IOCTL(PPGETMODES)
1061COMPATIBLE_IOCTL(PPGETMODE)
1062COMPATIBLE_IOCTL(PPGETPHASE)
1063COMPATIBLE_IOCTL(PPGETFLAGS)
1064COMPATIBLE_IOCTL(PPSETFLAGS)
1065/* Big A */ 1043/* Big A */
1066/* sparc only */ 1044/* sparc only */
1067/* Big Q for sound/OSS */ 1045/* Big Q for sound/OSS */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index f419519ec41f..ea59c891fc53 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -432,14 +432,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
432 (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ? 432 (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ?
433 configfs_init_bin_file : 433 configfs_init_bin_file :
434 configfs_init_file); 434 configfs_init_file);
435 if (error) { 435 if (error)
436 configfs_put(sd); 436 configfs_put(sd);
437 return error; 437 return error;
438 }
439
440 d_rehash(dentry);
441
442 return 0;
443} 438}
444 439
445static struct dentry * configfs_lookup(struct inode *dir, 440static struct dentry * configfs_lookup(struct inode *dir,
@@ -701,23 +696,29 @@ static int populate_groups(struct config_group *group)
701{ 696{
702 struct config_group *new_group; 697 struct config_group *new_group;
703 int ret = 0; 698 int ret = 0;
704 int i;
705
706 if (group->default_groups) {
707 for (i = 0; group->default_groups[i]; i++) {
708 new_group = group->default_groups[i];
709 699
710 ret = create_default_group(group, new_group); 700 list_for_each_entry(new_group, &group->default_groups, group_entry) {
711 if (ret) { 701 ret = create_default_group(group, new_group);
712 detach_groups(group); 702 if (ret) {
713 break; 703 detach_groups(group);
714 } 704 break;
715 } 705 }
716 } 706 }
717 707
718 return ret; 708 return ret;
719} 709}
720 710
711void configfs_remove_default_groups(struct config_group *group)
712{
713 struct config_group *g, *n;
714
715 list_for_each_entry_safe(g, n, &group->default_groups, group_entry) {
716 list_del(&g->group_entry);
717 config_item_put(&g->cg_item);
718 }
719}
720EXPORT_SYMBOL(configfs_remove_default_groups);
721
721/* 722/*
722 * All of link_obj/unlink_obj/link_group/unlink_group require that 723 * All of link_obj/unlink_obj/link_group/unlink_group require that
723 * subsys->su_mutex is held. 724 * subsys->su_mutex is held.
@@ -766,15 +767,10 @@ static void link_obj(struct config_item *parent_item, struct config_item *item)
766 767
767static void unlink_group(struct config_group *group) 768static void unlink_group(struct config_group *group)
768{ 769{
769 int i;
770 struct config_group *new_group; 770 struct config_group *new_group;
771 771
772 if (group->default_groups) { 772 list_for_each_entry(new_group, &group->default_groups, group_entry)
773 for (i = 0; group->default_groups[i]; i++) { 773 unlink_group(new_group);
774 new_group = group->default_groups[i];
775 unlink_group(new_group);
776 }
777 }
778 774
779 group->cg_subsys = NULL; 775 group->cg_subsys = NULL;
780 unlink_obj(&group->cg_item); 776 unlink_obj(&group->cg_item);
@@ -782,7 +778,6 @@ static void unlink_group(struct config_group *group)
782 778
783static void link_group(struct config_group *parent_group, struct config_group *group) 779static void link_group(struct config_group *parent_group, struct config_group *group)
784{ 780{
785 int i;
786 struct config_group *new_group; 781 struct config_group *new_group;
787 struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ 782 struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
788 783
@@ -796,12 +791,8 @@ static void link_group(struct config_group *parent_group, struct config_group *g
796 BUG(); 791 BUG();
797 group->cg_subsys = subsys; 792 group->cg_subsys = subsys;
798 793
799 if (group->default_groups) { 794 list_for_each_entry(new_group, &group->default_groups, group_entry)
800 for (i = 0; group->default_groups[i]; i++) { 795 link_group(group, new_group);
801 new_group = group->default_groups[i];
802 link_group(group, new_group);
803 }
804 }
805} 796}
806 797
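The hunks above replace configfs's NULL-terminated default_groups arrays with a kernel linked list, so default groups can be attached and detached individually (see configfs_remove_default_groups() above). A minimal userspace sketch of the same intrusive-list iteration pattern; list_head, container_of and list_for_each_entry are reimplemented here as stand-ins for <linux/list.h>, and it builds with gcc (typeof is a GCC extension):

#include <stddef.h>
#include <stdio.h>

/* Minimal stand-ins for the kernel's <linux/list.h> primitives. */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define list_for_each_entry(pos, head, member)                          \
	for (pos = container_of((head)->next, typeof(*pos), member);    \
	     &pos->member != (head);                                    \
	     pos = container_of(pos->member.next, typeof(*pos), member))

struct config_group {
	const char *name;
	struct list_head group_entry;	/* links into the parent's list */
};

int main(void)
{
	struct list_head default_groups = LIST_HEAD_INIT(default_groups);
	struct config_group a = { "a" }, b = { "b" };
	struct config_group *g;

	list_add_tail(&a.group_entry, &default_groups);
	list_add_tail(&b.group_entry, &default_groups);

	/* The iteration style the patch switches to: no index variable,
	 * no NULL sentinel, and entries can be unlinked one at a time. */
	list_for_each_entry(g, &default_groups, group_entry)
		printf("%s\n", g->name);
	return 0;
}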
807/* 798/*
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cee087d8f7e0..03d124ae27d7 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -75,7 +75,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
75 sd_iattr->ia_mode = sd->s_mode; 75 sd_iattr->ia_mode = sd->s_mode;
76 sd_iattr->ia_uid = GLOBAL_ROOT_UID; 76 sd_iattr->ia_uid = GLOBAL_ROOT_UID;
77 sd_iattr->ia_gid = GLOBAL_ROOT_GID; 77 sd_iattr->ia_gid = GLOBAL_ROOT_GID;
78 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 78 sd_iattr->ia_atime = sd_iattr->ia_mtime =
79 sd_iattr->ia_ctime = current_fs_time(inode->i_sb);
79 sd->s_iattr = sd_iattr; 80 sd->s_iattr = sd_iattr;
80 } 81 }
81 /* attributes were changed at least once in the past */ 82 /* attributes were changed at least once in the past */
@@ -111,7 +112,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
111static inline void set_default_inode_attr(struct inode * inode, umode_t mode) 112static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
112{ 113{
113 inode->i_mode = mode; 114 inode->i_mode = mode;
114 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 115 inode->i_atime = inode->i_mtime =
116 inode->i_ctime = current_fs_time(inode->i_sb);
115} 117}
116 118
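CURRENT_TIME is being replaced with current_fs_time(sb), which truncates the current time to the superblock's timestamp granularity rather than always stamping full-resolution wall time. A hypothetical userspace analogue of that truncation (the granularity values are illustrative; s_time_gran is the kernel field being mimicked):

#include <stdio.h>
#include <time.h>

/* Clamp "now" to a filesystem's timestamp granularity, e.g. 1 ns for
 * ext4 or 1 s for a filesystem with second-resolution on-disk stamps. */
static struct timespec fs_time(long gran_ns)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);
	if (gran_ns == 1000000000L)
		ts.tv_nsec = 0;				/* 1 s granularity */
	else if (gran_ns > 1)
		ts.tv_nsec -= ts.tv_nsec % gran_ns;	/* coarser than 1 ns */
	return ts;
}

int main(void)
{
	struct timespec coarse = fs_time(1000000000L);

	printf("%lld.%09ld\n", (long long)coarse.tv_sec, coarse.tv_nsec);
	return 0;
}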
117static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) 119static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -195,13 +197,21 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in
195 return -ENOMEM; 197 return -ENOMEM;
196 198
197 p_inode = d_inode(dentry->d_parent); 199 p_inode = d_inode(dentry->d_parent);
198 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; 200 p_inode->i_mtime = p_inode->i_ctime = current_fs_time(p_inode->i_sb);
199 configfs_set_inode_lock_class(sd, inode); 201 configfs_set_inode_lock_class(sd, inode);
200 202
201 init(inode); 203 init(inode);
202 d_instantiate(dentry, inode); 204 if (S_ISDIR(mode) || S_ISLNK(mode)) {
203 if (S_ISDIR(mode) || S_ISLNK(mode)) 205 /*
206 * ->symlink(), ->mkdir(), configfs_register_subsystem() or
207 * create_default_group() - already hashed.
208 */
209 d_instantiate(dentry, inode);
204 dget(dentry); /* pin link and directory dentries in core */ 210 dget(dentry); /* pin link and directory dentries in core */
211 } else {
212 /* ->lookup() */
213 d_add(dentry, inode);
214 }
205 return error; 215 return error;
206} 216}
207 217
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index b863a09cd2f1..8b2a994042dd 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -182,6 +182,7 @@ void config_group_init(struct config_group *group)
182{ 182{
183 config_item_init(&group->cg_item); 183 config_item_init(&group->cg_item);
184 INIT_LIST_HEAD(&group->cg_children); 184 INIT_LIST_HEAD(&group->cg_children);
185 INIT_LIST_HEAD(&group->default_groups);
185} 186}
186EXPORT_SYMBOL(config_group_init); 187EXPORT_SYMBOL(config_group_init);
187 188
diff --git a/fs/coredump.c b/fs/coredump.c
index 9ea87e9fdccf..47c32c3bfa1d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,9 @@
32#include <linux/pipe_fs_i.h> 32#include <linux/pipe_fs_i.h>
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/compat.h> 34#include <linux/compat.h>
35#include <linux/sched.h>
36#include <linux/fs.h>
37#include <linux/path.h>
35#include <linux/timekeeping.h> 38#include <linux/timekeeping.h>
36 39
37#include <asm/uaccess.h> 40#include <asm/uaccess.h>
@@ -649,6 +652,8 @@ void do_coredump(const siginfo_t *siginfo)
649 } 652 }
650 } else { 653 } else {
651 struct inode *inode; 654 struct inode *inode;
655 int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
656 O_LARGEFILE | O_EXCL;
652 657
653 if (cprm.limit < binfmt->min_coredump) 658 if (cprm.limit < binfmt->min_coredump)
654 goto fail_unlock; 659 goto fail_unlock;
@@ -687,10 +692,27 @@ void do_coredump(const siginfo_t *siginfo)
687 * what matters is that at least one of the two processes 692 * what matters is that at least one of the two processes
688 * writes its coredump successfully, not which one. 693 * writes its coredump successfully, not which one.
689 */ 694 */
690 cprm.file = filp_open(cn.corename, 695 if (need_suid_safe) {
691 O_CREAT | 2 | O_NOFOLLOW | 696 /*
692 O_LARGEFILE | O_EXCL, 697 * Using user namespaces, normal user tasks can change
693 0600); 698 * their current->fs->root to point to arbitrary
699 * directories. Since the intention of the "only dump
700 * with a fully qualified path" rule is to control where
701 * coredumps may be placed using root privileges,
702 * current->fs->root must not be used. Instead, use the
703 * root directory of init_task.
704 */
705 struct path root;
706
707 task_lock(&init_task);
708 get_fs_root(init_task.fs, &root);
709 task_unlock(&init_task);
710 cprm.file = file_open_root(root.dentry, root.mnt,
711 cn.corename, open_flags, 0600);
712 path_put(&root);
713 } else {
714 cprm.file = filp_open(cn.corename, open_flags, 0600);
715 }
694 if (IS_ERR(cprm.file)) 716 if (IS_ERR(cprm.file))
695 goto fail_unlock; 717 goto fail_unlock;
696 718
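The non-suid-safe branch keeps the ordinary filp_open(); the suid-safe branch resolves corename against init_task's root so a user-namespace chroot cannot redirect where the dump lands. A userspace sketch of the same idea using openat() against a trusted directory fd (the /var/crash path is only an example; the flag set is the one the patch hoists into open_flags):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Resolve "core.dump" relative to a fixed, trusted directory
	 * instead of the (possibly attacker-chosen) current root --
	 * the userspace analogue of file_open_root(). */
	int rootfd = open("/var/crash", O_DIRECTORY | O_PATH);
	int fd;

	if (rootfd < 0) {
		perror("open /var/crash");
		return 1;
	}
	fd = openat(rootfd, "core.dump",
		    O_CREAT | O_RDWR | O_NOFOLLOW | O_LARGEFILE | O_EXCL,
		    0600);
	if (fd < 0)
		perror("openat");
	else
		close(fd);
	close(rootfd);
	return 0;
}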
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
new file mode 100644
index 000000000000..92348faf9865
--- /dev/null
+++ b/fs/crypto/Kconfig
@@ -0,0 +1,18 @@
1config FS_ENCRYPTION
2 tristate "FS Encryption (Per-file encryption)"
3 depends on BLOCK
4 select CRYPTO
5 select CRYPTO_AES
6 select CRYPTO_CBC
7 select CRYPTO_ECB
8 select CRYPTO_XTS
9 select CRYPTO_CTS
10 select CRYPTO_CTR
11 select CRYPTO_SHA256
12 select KEYS
13 select ENCRYPTED_KEYS
14 help
15 Enable encryption of files and directories. This
16 feature is similar to ecryptfs, but it is more memory
17 efficient since it avoids caching the encrypted and
18 decrypted pages in the page cache.
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
new file mode 100644
index 000000000000..f17684c48739
--- /dev/null
+++ b/fs/crypto/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o
2
3fscrypto-y := crypto.o fname.o policy.o keyinfo.o
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
new file mode 100644
index 000000000000..06cd1a22240b
--- /dev/null
+++ b/fs/crypto/crypto.c
@@ -0,0 +1,555 @@
1/*
2 * This contains encryption functions for per-file encryption.
3 *
4 * Copyright (C) 2015, Google, Inc.
5 * Copyright (C) 2015, Motorola Mobility
6 *
7 * Written by Michael Halcrow, 2014.
8 *
9 * Filename encryption additions
10 * Uday Savagaonkar, 2014
11 * Encryption policy handling additions
12 * Ildar Muslukhov, 2014
13 * Add fscrypt_pullback_bio_page()
14 * Jaegeuk Kim, 2015.
15 *
16 * This has not yet undergone a rigorous security audit.
17 *
18 * The usage of AES-XTS should conform to recommendations in NIST
19 * Special Publication 800-38E and IEEE P1619/D16.
20 */
21
22#include <linux/pagemap.h>
23#include <linux/mempool.h>
24#include <linux/module.h>
25#include <linux/scatterlist.h>
26#include <linux/ratelimit.h>
27#include <linux/bio.h>
28#include <linux/dcache.h>
29#include <linux/fscrypto.h>
30#include <linux/ecryptfs.h>
31
32static unsigned int num_prealloc_crypto_pages = 32;
33static unsigned int num_prealloc_crypto_ctxs = 128;
34
35module_param(num_prealloc_crypto_pages, uint, 0444);
36MODULE_PARM_DESC(num_prealloc_crypto_pages,
37 "Number of crypto pages to preallocate");
38module_param(num_prealloc_crypto_ctxs, uint, 0444);
39MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
40 "Number of crypto contexts to preallocate");
41
42static mempool_t *fscrypt_bounce_page_pool = NULL;
43
44static LIST_HEAD(fscrypt_free_ctxs);
45static DEFINE_SPINLOCK(fscrypt_ctx_lock);
46
47static struct workqueue_struct *fscrypt_read_workqueue;
48static DEFINE_MUTEX(fscrypt_init_mutex);
49
50static struct kmem_cache *fscrypt_ctx_cachep;
51struct kmem_cache *fscrypt_info_cachep;
52
53/**
54 * fscrypt_release_ctx() - Releases an encryption context
55 * @ctx: The encryption context to release.
56 *
57 * If the encryption context was allocated from the pre-allocated pool, returns
58 * it to that pool. Else, frees it.
59 *
60 * If there's a bounce page in the context, this frees that.
61 */
62void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
63{
64 unsigned long flags;
65
66 if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) {
67 mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
68 ctx->w.bounce_page = NULL;
69 }
70 ctx->w.control_page = NULL;
71 if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
72 kmem_cache_free(fscrypt_ctx_cachep, ctx);
73 } else {
74 spin_lock_irqsave(&fscrypt_ctx_lock, flags);
75 list_add(&ctx->free_list, &fscrypt_free_ctxs);
76 spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
77 }
78}
79EXPORT_SYMBOL(fscrypt_release_ctx);
80
81/**
82 * fscrypt_get_ctx() - Gets an encryption context
83 * @inode: The inode for which we are doing the crypto
84 *
85 * Allocates and initializes an encryption context.
86 *
87 * Return: An allocated and initialized encryption context on success; error
88 * value or NULL otherwise.
89 */
90struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode)
91{
92 struct fscrypt_ctx *ctx = NULL;
93 struct fscrypt_info *ci = inode->i_crypt_info;
94 unsigned long flags;
95
96 if (ci == NULL)
97 return ERR_PTR(-ENOKEY);
98
99 /*
100 * We first try getting the ctx from a free list because in
101 * the common case the ctx will have an allocated and
102 * initialized crypto tfm, so it's probably a worthwhile
103 * optimization. For the bounce page, we first try getting it
104 * from the kernel allocator because that's just about as fast
105 * as getting it from a list and because a cache of free pages
106 * should generally be a "last resort" option for a filesystem
107 * to be able to do its job.
108 */
109 spin_lock_irqsave(&fscrypt_ctx_lock, flags);
110 ctx = list_first_entry_or_null(&fscrypt_free_ctxs,
111 struct fscrypt_ctx, free_list);
112 if (ctx)
113 list_del(&ctx->free_list);
114 spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
115 if (!ctx) {
116 ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
117 if (!ctx)
118 return ERR_PTR(-ENOMEM);
119 ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
120 } else {
121 ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
122 }
123 ctx->flags &= ~FS_WRITE_PATH_FL;
124 return ctx;
125}
126EXPORT_SYMBOL(fscrypt_get_ctx);
127
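fscrypt_get_ctx()/fscrypt_release_ctx() implement a classic preallocated free list with allocator fallback; FS_CTX_REQUIRES_FREE_ENCRYPT_FL records which path produced the context so release knows whether to free or recycle. A compressed userspace sketch of that pattern, with a pthread mutex standing in for the irq-safe spinlock:

#include <pthread.h>
#include <stdlib.h>

struct ctx {
	struct ctx *next;
	int requires_free;	/* plays FS_CTX_REQUIRES_FREE_ENCRYPT_FL */
};

static struct ctx *free_list;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct ctx *get_ctx(void)
{
	struct ctx *c;

	pthread_mutex_lock(&list_lock);
	c = free_list;			/* try the preallocated pool first */
	if (c)
		free_list = c->next;
	pthread_mutex_unlock(&list_lock);

	if (!c) {
		c = calloc(1, sizeof(*c));	/* fall back to the allocator */
		if (!c)
			return NULL;
		c->requires_free = 1;
	} else {
		c->requires_free = 0;
	}
	return c;
}

static void release_ctx(struct ctx *c)
{
	if (c->requires_free) {
		free(c);		/* came from the allocator */
		return;
	}
	pthread_mutex_lock(&list_lock);	/* back onto the pool */
	c->next = free_list;
	free_list = c;
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct ctx *c = get_ctx();

	if (c)
		release_ctx(c);
	return 0;
}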
128/**
129 * fscrypt_complete() - The completion callback for page encryption
130 * @req: The asynchronous encryption request context
131 * @res: The result of the encryption operation
132 */
133static void fscrypt_complete(struct crypto_async_request *req, int res)
134{
135 struct fscrypt_completion_result *ecr = req->data;
136
137 if (res == -EINPROGRESS)
138 return;
139 ecr->res = res;
140 complete(&ecr->completion);
141}
142
143typedef enum {
144 FS_DECRYPT = 0,
145 FS_ENCRYPT,
146} fscrypt_direction_t;
147
148static int do_page_crypto(struct inode *inode,
149 fscrypt_direction_t rw, pgoff_t index,
150 struct page *src_page, struct page *dest_page)
151{
152 u8 xts_tweak[FS_XTS_TWEAK_SIZE];
153 struct skcipher_request *req = NULL;
154 DECLARE_FS_COMPLETION_RESULT(ecr);
155 struct scatterlist dst, src;
156 struct fscrypt_info *ci = inode->i_crypt_info;
157 struct crypto_skcipher *tfm = ci->ci_ctfm;
158 int res = 0;
159
160 req = skcipher_request_alloc(tfm, GFP_NOFS);
161 if (!req) {
162 printk_ratelimited(KERN_ERR
163 "%s: crypto_request_alloc() failed\n",
164 __func__);
165 return -ENOMEM;
166 }
167
168 skcipher_request_set_callback(
169 req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
170 fscrypt_complete, &ecr);
171
172 BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index));
173 memcpy(xts_tweak, &index, sizeof(index));
174 memset(&xts_tweak[sizeof(index)], 0,
175 FS_XTS_TWEAK_SIZE - sizeof(index));
176
177 sg_init_table(&dst, 1);
178 sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
179 sg_init_table(&src, 1);
180 sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
181 skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
182 xts_tweak);
183 if (rw == FS_DECRYPT)
184 res = crypto_skcipher_decrypt(req);
185 else
186 res = crypto_skcipher_encrypt(req);
187 if (res == -EINPROGRESS || res == -EBUSY) {
188 BUG_ON(req->base.data != &ecr);
189 wait_for_completion(&ecr.completion);
190 res = ecr.res;
191 }
192 skcipher_request_free(req);
193 if (res) {
194 printk_ratelimited(KERN_ERR
195 "%s: crypto_skcipher_encrypt() returned %d\n",
196 __func__, res);
197 return res;
198 }
199 return 0;
200}
201
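do_page_crypto() derives the per-page XTS tweak directly from the page index: the index fills the low bytes (in host endianness, little-endian on x86) and the rest is zeroed, so no two pages of a file share a tweak. A standalone illustration; FS_XTS_TWEAK_SIZE is assumed to be 16, the AES-XTS tweak width:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FS_XTS_TWEAK_SIZE 16	/* assumed 16-byte AES-XTS tweak */

int main(void)
{
	uint64_t index = 42;	/* page index within the file */
	uint8_t tweak[FS_XTS_TWEAK_SIZE];

	/* Same construction as do_page_crypto(): index in the low
	 * bytes, zero padding above it. */
	memcpy(tweak, &index, sizeof(index));
	memset(tweak + sizeof(index), 0, sizeof(tweak) - sizeof(index));

	for (size_t i = 0; i < sizeof(tweak); i++)
		printf("%02x", tweak[i]);
	printf("\n");
	return 0;
}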
202static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx)
203{
204 ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool,
205 GFP_NOWAIT);
206 if (ctx->w.bounce_page == NULL)
207 return ERR_PTR(-ENOMEM);
208 ctx->flags |= FS_WRITE_PATH_FL;
209 return ctx->w.bounce_page;
210}
211
212/**
213 * fscrypt_encrypt_page() - Encrypts a page
214 * @inode: The inode for which the encryption should take place
215 * @plaintext_page: The page to encrypt. Must be locked.
216 *
217 * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
218 * encryption context.
219 *
220 * Called on the page write path. The caller must call
221 * fscrypt_restore_control_page() on the returned ciphertext page to
222 * release the bounce buffer and the encryption context.
223 *
224 * Return: An allocated page with the encrypted content on success. Else, an
225 * error value or NULL.
226 */
227struct page *fscrypt_encrypt_page(struct inode *inode,
228 struct page *plaintext_page)
229{
230 struct fscrypt_ctx *ctx;
231 struct page *ciphertext_page = NULL;
232 int err;
233
234 BUG_ON(!PageLocked(plaintext_page));
235
236 ctx = fscrypt_get_ctx(inode);
237 if (IS_ERR(ctx))
238 return (struct page *)ctx;
239
240 /* The encryption operation will require a bounce page. */
241 ciphertext_page = alloc_bounce_page(ctx);
242 if (IS_ERR(ciphertext_page))
243 goto errout;
244
245 ctx->w.control_page = plaintext_page;
246 err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
247 plaintext_page, ciphertext_page);
248 if (err) {
249 ciphertext_page = ERR_PTR(err);
250 goto errout;
251 }
252 SetPagePrivate(ciphertext_page);
253 set_page_private(ciphertext_page, (unsigned long)ctx);
254 lock_page(ciphertext_page);
255 return ciphertext_page;
256
257errout:
258 fscrypt_release_ctx(ctx);
259 return ciphertext_page;
260}
261EXPORT_SYMBOL(fscrypt_encrypt_page);
262
263/**
264 * fscrypt_decrypt_page() - Decrypts a page in-place
265 * @page: The page to decrypt. Must be locked.
266 *
267 * Decrypts page in-place using the ctx encryption context.
268 *
269 * Called from the read completion callback.
270 *
271 * Return: Zero on success, non-zero otherwise.
272 */
273int fscrypt_decrypt_page(struct page *page)
274{
275 BUG_ON(!PageLocked(page));
276
277 return do_page_crypto(page->mapping->host,
278 FS_DECRYPT, page->index, page, page);
279}
280EXPORT_SYMBOL(fscrypt_decrypt_page);
281
282int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
283 sector_t pblk, unsigned int len)
284{
285 struct fscrypt_ctx *ctx;
286 struct page *ciphertext_page = NULL;
287 struct bio *bio;
288 int ret, err = 0;
289
290 BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
291
292 ctx = fscrypt_get_ctx(inode);
293 if (IS_ERR(ctx))
294 return PTR_ERR(ctx);
295
296 ciphertext_page = alloc_bounce_page(ctx);
297 if (IS_ERR(ciphertext_page)) {
298 err = PTR_ERR(ciphertext_page);
299 goto errout;
300 }
301
302 while (len--) {
303 err = do_page_crypto(inode, FS_ENCRYPT, lblk,
304 ZERO_PAGE(0), ciphertext_page);
305 if (err)
306 goto errout;
307
308 bio = bio_alloc(GFP_KERNEL, 1);
309 if (!bio) {
310 err = -ENOMEM;
311 goto errout;
312 }
313 bio->bi_bdev = inode->i_sb->s_bdev;
314 bio->bi_iter.bi_sector =
315 pblk << (inode->i_sb->s_blocksize_bits - 9);
316 ret = bio_add_page(bio, ciphertext_page,
317 inode->i_sb->s_blocksize, 0);
318 if (ret != inode->i_sb->s_blocksize) {
319 /* should never happen! */
320 WARN_ON(1);
321 bio_put(bio);
322 err = -EIO;
323 goto errout;
324 }
325 err = submit_bio_wait(WRITE, bio);
326 if ((err == 0) && bio->bi_error)
327 err = -EIO;
328 bio_put(bio);
329 if (err)
330 goto errout;
331 lblk++;
332 pblk++;
333 }
334 err = 0;
335errout:
336 fscrypt_release_ctx(ctx);
337 return err;
338}
339EXPORT_SYMBOL(fscrypt_zeroout_range);
340
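The bi_sector assignment above converts filesystem blocks to the bio layer's 512-byte sectors; shifting by (s_blocksize_bits - 9) is the same as multiplying by blocksize/512. Worked out in plain C:

#include <stdio.h>

int main(void)
{
	unsigned long long pblk = 12345;	/* filesystem block number */
	unsigned int blocksize_bits = 12;	/* 4096-byte blocks */

	/* bi_sector is in 512-byte units, hence the "- 9":
	 * pblk * (4096 / 512) == pblk << (12 - 9). */
	unsigned long long sector = pblk << (blocksize_bits - 9);

	printf("block %llu -> sector %llu\n", pblk, sector);
	return 0;
}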
341/*
342 * Validate dentries for encrypted directories to make sure we aren't
343 * potentially caching stale data after a key has been added or
344 * removed.
345 */
346static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
347{
348 struct inode *dir = d_inode(dentry->d_parent);
349 struct fscrypt_info *ci = dir->i_crypt_info;
350 int dir_has_key, cached_with_key;
351
352 if (!dir->i_sb->s_cop->is_encrypted(dir))
353 return 0;
354
355 if (ci && ci->ci_keyring_key &&
356 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
357 (1 << KEY_FLAG_REVOKED) |
358 (1 << KEY_FLAG_DEAD))))
359 ci = NULL;
360
361 /* this should eventually be a flag in d_flags */
362 spin_lock(&dentry->d_lock);
363 cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
364 spin_unlock(&dentry->d_lock);
365 dir_has_key = (ci != NULL);
366
367 /*
368 * If the dentry was cached without the key, and it is a
369 * negative dentry, it might be a valid name. We can't check
370 * if the key has since been made available due to locking
371 * reasons, so we fail the validation and let ext4_lookup() do
372 * this check.
373 *
374 * We also fail the validation if the dentry was created with
375 * the key present, but we no longer have the key, or vice versa.
376 */
377 if ((!cached_with_key && d_is_negative(dentry)) ||
378 (!cached_with_key && dir_has_key) ||
379 (cached_with_key && !dir_has_key))
380 return 0;
381 return 1;
382}
383
384const struct dentry_operations fscrypt_d_ops = {
385 .d_revalidate = fscrypt_d_revalidate,
386};
387EXPORT_SYMBOL(fscrypt_d_ops);
388
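The three conditions in fscrypt_d_revalidate() reduce to: invalidate negative dentries that were cached without the key, and invalidate whenever key presence has flipped since the dentry was cached. The same decision table as a pure function:

#include <stdbool.h>
#include <stdio.h>

/* Return 0 (revalidation fails) when the key situation may have
 * changed since the dentry was cached, 1 (still valid) otherwise. */
static int revalidate(bool cached_with_key, bool dir_has_key,
		      bool is_negative)
{
	if (!cached_with_key && is_negative)
		return 0;	/* name might decrypt validly now */
	if (cached_with_key != dir_has_key)
		return 0;	/* key appeared or disappeared */
	return 1;
}

int main(void)
{
	printf("%d %d %d\n",
	       revalidate(false, true, false),	/* key appeared: 0 */
	       revalidate(true, false, false),	/* key vanished: 0 */
	       revalidate(true, true, false));	/* unchanged:    1 */
	return 0;
}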
389/*
390 * Call fscrypt_decrypt_page on every single page, reusing the encryption
391 * context.
392 */
393static void completion_pages(struct work_struct *work)
394{
395 struct fscrypt_ctx *ctx =
396 container_of(work, struct fscrypt_ctx, r.work);
397 struct bio *bio = ctx->r.bio;
398 struct bio_vec *bv;
399 int i;
400
401 bio_for_each_segment_all(bv, bio, i) {
402 struct page *page = bv->bv_page;
403 int ret = fscrypt_decrypt_page(page);
404
405 if (ret) {
406 WARN_ON_ONCE(1);
407 SetPageError(page);
408 } else {
409 SetPageUptodate(page);
410 }
411 unlock_page(page);
412 }
413 fscrypt_release_ctx(ctx);
414 bio_put(bio);
415}
416
417void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
418{
419 INIT_WORK(&ctx->r.work, completion_pages);
420 ctx->r.bio = bio;
421 queue_work(fscrypt_read_workqueue, &ctx->r.work);
422}
423EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
424
425void fscrypt_pullback_bio_page(struct page **page, bool restore)
426{
427 struct fscrypt_ctx *ctx;
428 struct page *bounce_page;
429
430 /* The bounce data pages are unmapped. */
431 if ((*page)->mapping)
432 return;
433
434 /* The bounce data page is unmapped. */
435 bounce_page = *page;
436 ctx = (struct fscrypt_ctx *)page_private(bounce_page);
437
438 /* restore control page */
439 *page = ctx->w.control_page;
440
441 if (restore)
442 fscrypt_restore_control_page(bounce_page);
443}
444EXPORT_SYMBOL(fscrypt_pullback_bio_page);
445
446void fscrypt_restore_control_page(struct page *page)
447{
448 struct fscrypt_ctx *ctx;
449
450 ctx = (struct fscrypt_ctx *)page_private(page);
451 set_page_private(page, (unsigned long)NULL);
452 ClearPagePrivate(page);
453 unlock_page(page);
454 fscrypt_release_ctx(ctx);
455}
456EXPORT_SYMBOL(fscrypt_restore_control_page);
457
458static void fscrypt_destroy(void)
459{
460 struct fscrypt_ctx *pos, *n;
461
462 list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list)
463 kmem_cache_free(fscrypt_ctx_cachep, pos);
464 INIT_LIST_HEAD(&fscrypt_free_ctxs);
465 mempool_destroy(fscrypt_bounce_page_pool);
466 fscrypt_bounce_page_pool = NULL;
467}
468
469/**
470 * fscrypt_initialize() - allocate major buffers for fs encryption.
471 *
472 * We only call this when we start accessing encrypted files, since it
473 * results in memory getting allocated that wouldn't otherwise be used.
474 *
475 * Return: Zero on success, non-zero otherwise.
476 */
477int fscrypt_initialize(void)
478{
479 int i, res = -ENOMEM;
480
481 if (fscrypt_bounce_page_pool)
482 return 0;
483
484 mutex_lock(&fscrypt_init_mutex);
485 if (fscrypt_bounce_page_pool)
486 goto already_initialized;
487
488 for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
489 struct fscrypt_ctx *ctx;
490
491 ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
492 if (!ctx)
493 goto fail;
494 list_add(&ctx->free_list, &fscrypt_free_ctxs);
495 }
496
497 fscrypt_bounce_page_pool =
498 mempool_create_page_pool(num_prealloc_crypto_pages, 0);
499 if (!fscrypt_bounce_page_pool)
500 goto fail;
501
502already_initialized:
503 mutex_unlock(&fscrypt_init_mutex);
504 return 0;
505fail:
506 fscrypt_destroy();
507 mutex_unlock(&fscrypt_init_mutex);
508 return res;
509}
510EXPORT_SYMBOL(fscrypt_initialize);
511
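fscrypt_initialize() uses the check/lock/re-check idiom: an unlocked fast-path test of fscrypt_bounce_page_pool, then a second test under fscrypt_init_mutex so concurrent first users initialize exactly once. A userspace sketch with a pthread mutex (the unlocked flag read glosses over memory ordering, much as the kernel's unlocked pointer test does):

#include <pthread.h>
#include <stdbool.h>

static bool initialized;
static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;

static int do_expensive_setup(void) { return 0; }	/* stand-in */

static int lazy_init(void)
{
	int res;

	if (initialized)		/* fast path, no lock taken */
		return 0;

	pthread_mutex_lock(&init_mutex);
	if (initialized) {		/* re-check under the lock */
		pthread_mutex_unlock(&init_mutex);
		return 0;
	}
	res = do_expensive_setup();
	if (res == 0)
		initialized = true;
	pthread_mutex_unlock(&init_mutex);
	return res;
}

int main(void)
{
	return lazy_init();
}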
512/**
513 * fscrypt_init() - Set up for fs encryption.
514 */
515static int __init fscrypt_init(void)
516{
517 fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue",
518 WQ_HIGHPRI, 0);
519 if (!fscrypt_read_workqueue)
520 goto fail;
521
522 fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT);
523 if (!fscrypt_ctx_cachep)
524 goto fail_free_queue;
525
526 fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
527 if (!fscrypt_info_cachep)
528 goto fail_free_ctx;
529
530 return 0;
531
532fail_free_ctx:
533 kmem_cache_destroy(fscrypt_ctx_cachep);
534fail_free_queue:
535 destroy_workqueue(fscrypt_read_workqueue);
536fail:
537 return -ENOMEM;
538}
539module_init(fscrypt_init)
540
541/**
542 * fscrypt_exit() - Shutdown the fs encryption system
543 */
544static void __exit fscrypt_exit(void)
545{
546 fscrypt_destroy();
547
548 if (fscrypt_read_workqueue)
549 destroy_workqueue(fscrypt_read_workqueue);
550 kmem_cache_destroy(fscrypt_ctx_cachep);
551 kmem_cache_destroy(fscrypt_info_cachep);
552}
553module_exit(fscrypt_exit);
554
555MODULE_LICENSE("GPL");
diff --git a/fs/f2fs/crypto_fname.c b/fs/crypto/fname.c
index ab377d496a39..5d6d49113efa 100644
--- a/fs/f2fs/crypto_fname.c
+++ b/fs/crypto/fname.c
@@ -1,46 +1,32 @@
1/* 1/*
2 * linux/fs/f2fs/crypto_fname.c 2 * This contains functions for filename crypto management
3 *
4 * Copied from linux/fs/ext4/crypto.c
5 * 3 *
6 * Copyright (C) 2015, Google, Inc. 4 * Copyright (C) 2015, Google, Inc.
7 * Copyright (C) 2015, Motorola Mobility 5 * Copyright (C) 2015, Motorola Mobility
8 * 6 *
9 * This contains functions for filename crypto management in f2fs
10 *
11 * Written by Uday Savagaonkar, 2014. 7 * Written by Uday Savagaonkar, 2014.
12 * 8 * Modified by Jaegeuk Kim, 2015.
13 * Adjust f2fs dentry structure
14 * Jaegeuk Kim, 2015.
15 * 9 *
16 * This has not yet undergone a rigorous security audit. 10 * This has not yet undergone a rigorous security audit.
17 */ 11 */
18#include <crypto/hash.h> 12
19#include <crypto/sha.h>
20#include <keys/encrypted-type.h> 13#include <keys/encrypted-type.h>
21#include <keys/user-type.h> 14#include <keys/user-type.h>
22#include <linux/crypto.h>
23#include <linux/gfp.h>
24#include <linux/kernel.h>
25#include <linux/key.h>
26#include <linux/list.h>
27#include <linux/mempool.h>
28#include <linux/random.h>
29#include <linux/scatterlist.h> 15#include <linux/scatterlist.h>
30#include <linux/spinlock_types.h>
31#include <linux/f2fs_fs.h>
32#include <linux/ratelimit.h> 16#include <linux/ratelimit.h>
17#include <linux/fscrypto.h>
33 18
34#include "f2fs.h" 19static u32 size_round_up(size_t size, size_t blksize)
35#include "f2fs_crypto.h" 20{
36#include "xattr.h" 21 return ((size + blksize - 1) / blksize) * blksize;
22}
37 23
38/** 24/**
39 * f2fs_dir_crypt_complete() - 25 * dir_crypt_complete() -
40 */ 26 */
41static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) 27static void dir_crypt_complete(struct crypto_async_request *req, int res)
42{ 28{
43 struct f2fs_completion_result *ecr = req->data; 29 struct fscrypt_completion_result *ecr = req->data;
44 30
45 if (res == -EINPROGRESS) 31 if (res == -EINPROGRESS)
46 return; 32 return;
@@ -48,45 +34,35 @@ static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res)
48 complete(&ecr->completion); 34 complete(&ecr->completion);
49} 35}
50 36
51bool f2fs_valid_filenames_enc_mode(uint32_t mode)
52{
53 return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS);
54}
55
56static unsigned max_name_len(struct inode *inode)
57{
58 return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
59 F2FS_NAME_LEN;
60}
61
62/** 37/**
63 * f2fs_fname_encrypt() - 38 * fname_encrypt() -
64 * 39 *
65 * This function encrypts the input filename, and returns the length of the 40 * This function encrypts the input filename, and returns the length of the
66 * ciphertext. Errors are returned as negative numbers. We trust the caller to 41 * ciphertext. Errors are returned as negative numbers. We trust the caller to
67 * allocate sufficient memory for the oname string. 42 * allocate sufficient memory for the oname string.
68 */ 43 */
69static int f2fs_fname_encrypt(struct inode *inode, 44static int fname_encrypt(struct inode *inode,
70 const struct qstr *iname, struct f2fs_str *oname) 45 const struct qstr *iname, struct fscrypt_str *oname)
71{ 46{
72 u32 ciphertext_len; 47 u32 ciphertext_len;
73 struct ablkcipher_request *req = NULL; 48 struct skcipher_request *req = NULL;
74 DECLARE_F2FS_COMPLETION_RESULT(ecr); 49 DECLARE_FS_COMPLETION_RESULT(ecr);
75 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; 50 struct fscrypt_info *ci = inode->i_crypt_info;
76 struct crypto_ablkcipher *tfm = ci->ci_ctfm; 51 struct crypto_skcipher *tfm = ci->ci_ctfm;
77 int res = 0; 52 int res = 0;
78 char iv[F2FS_CRYPTO_BLOCK_SIZE]; 53 char iv[FS_CRYPTO_BLOCK_SIZE];
79 struct scatterlist src_sg, dst_sg; 54 struct scatterlist src_sg, dst_sg;
80 int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); 55 int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
81 char *workbuf, buf[32], *alloc_buf = NULL; 56 char *workbuf, buf[32], *alloc_buf = NULL;
82 unsigned lim = max_name_len(inode); 57 unsigned lim;
83 58
59 lim = inode->i_sb->s_cop->max_namelen(inode);
84 if (iname->len <= 0 || iname->len > lim) 60 if (iname->len <= 0 || iname->len > lim)
85 return -EIO; 61 return -EIO;
86 62
87 ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ? 63 ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ?
88 F2FS_CRYPTO_BLOCK_SIZE : iname->len; 64 FS_CRYPTO_BLOCK_SIZE : iname->len;
89 ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding); 65 ciphertext_len = size_round_up(ciphertext_len, padding);
90 ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len; 66 ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len;
91 67
92 if (ciphertext_len <= sizeof(buf)) { 68 if (ciphertext_len <= sizeof(buf)) {
@@ -99,16 +75,16 @@ static int f2fs_fname_encrypt(struct inode *inode,
99 } 75 }
100 76
101 /* Allocate request */ 77 /* Allocate request */
102 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 78 req = skcipher_request_alloc(tfm, GFP_NOFS);
103 if (!req) { 79 if (!req) {
104 printk_ratelimited(KERN_ERR 80 printk_ratelimited(KERN_ERR
105 "%s: crypto_request_alloc() failed\n", __func__); 81 "%s: crypto_request_alloc() failed\n", __func__);
106 kfree(alloc_buf); 82 kfree(alloc_buf);
107 return -ENOMEM; 83 return -ENOMEM;
108 } 84 }
109 ablkcipher_request_set_callback(req, 85 skcipher_request_set_callback(req,
110 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 86 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
111 f2fs_dir_crypt_complete, &ecr); 87 dir_crypt_complete, &ecr);
112 88
113 /* Copy the input */ 89 /* Copy the input */
114 memcpy(workbuf, iname->name, iname->len); 90 memcpy(workbuf, iname->name, iname->len);
@@ -116,79 +92,78 @@ static int f2fs_fname_encrypt(struct inode *inode,
116 memset(workbuf + iname->len, 0, ciphertext_len - iname->len); 92 memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
117 93
118 /* Initialize IV */ 94 /* Initialize IV */
119 memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); 95 memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
120 96
121 /* Create encryption request */ 97 /* Create encryption request */
122 sg_init_one(&src_sg, workbuf, ciphertext_len); 98 sg_init_one(&src_sg, workbuf, ciphertext_len);
123 sg_init_one(&dst_sg, oname->name, ciphertext_len); 99 sg_init_one(&dst_sg, oname->name, ciphertext_len);
124 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); 100 skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
125 res = crypto_ablkcipher_encrypt(req); 101 res = crypto_skcipher_encrypt(req);
126 if (res == -EINPROGRESS || res == -EBUSY) { 102 if (res == -EINPROGRESS || res == -EBUSY) {
127 BUG_ON(req->base.data != &ecr);
128 wait_for_completion(&ecr.completion); 103 wait_for_completion(&ecr.completion);
129 res = ecr.res; 104 res = ecr.res;
130 } 105 }
131 kfree(alloc_buf); 106 kfree(alloc_buf);
132 ablkcipher_request_free(req); 107 skcipher_request_free(req);
133 if (res < 0) { 108 if (res < 0)
134 printk_ratelimited(KERN_ERR 109 printk_ratelimited(KERN_ERR
135 "%s: Error (error code %d)\n", __func__, res); 110 "%s: Error (error code %d)\n", __func__, res);
136 } 111
137 oname->len = ciphertext_len; 112 oname->len = ciphertext_len;
138 return res; 113 return res;
139} 114}
140 115
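fname_encrypt() sizes the ciphertext by padding the name up to the policy's pad quantum: padding = 4 << (ci_flags & FS_POLICY_FLAGS_PAD_MASK) yields 4, 8, 16 or 32 bytes, with a floor of FS_CRYPTO_BLOCK_SIZE and a ceiling of the filesystem's name limit. A standalone restatement; the two macro values are assumptions matching the usual fscrypt definitions:

#include <stdio.h>

#define FS_CRYPTO_BLOCK_SIZE 16		/* assumed */
#define FS_POLICY_FLAGS_PAD_MASK 0x03	/* assumed */

static unsigned size_round_up(unsigned size, unsigned blksize)
{
	return ((size + blksize - 1) / blksize) * blksize;
}

int main(void)
{
	unsigned flags = 2;	/* policy pad flag: 4 << 2 = 16-byte pad */
	unsigned padding = 4 << (flags & FS_POLICY_FLAGS_PAD_MASK);
	unsigned name_len = 21, lim = 255;

	unsigned len = name_len < FS_CRYPTO_BLOCK_SIZE ?
		       FS_CRYPTO_BLOCK_SIZE : name_len;
	len = size_round_up(len, padding);	/* pad to the quantum */
	if (len > lim)
		len = lim;			/* cap at the name limit */

	printf("%u-byte name -> %u-byte ciphertext\n", name_len, len);
	return 0;
}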
141/* 116/*
142 * f2fs_fname_decrypt() 117 * fname_decrypt()
143 * This function decrypts the input filename, and returns 118 * This function decrypts the input filename, and returns
144 * the length of the plaintext. 119 * the length of the plaintext.
145 * Errors are returned as negative numbers. 120 * Errors are returned as negative numbers.
146 * We trust the caller to allocate sufficient memory for the oname string. 121 * We trust the caller to allocate sufficient memory for the oname string.
147 */ 122 */
148static int f2fs_fname_decrypt(struct inode *inode, 123static int fname_decrypt(struct inode *inode,
149 const struct f2fs_str *iname, struct f2fs_str *oname) 124 const struct fscrypt_str *iname,
125 struct fscrypt_str *oname)
150{ 126{
151 struct ablkcipher_request *req = NULL; 127 struct skcipher_request *req = NULL;
152 DECLARE_F2FS_COMPLETION_RESULT(ecr); 128 DECLARE_FS_COMPLETION_RESULT(ecr);
153 struct scatterlist src_sg, dst_sg; 129 struct scatterlist src_sg, dst_sg;
154 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; 130 struct fscrypt_info *ci = inode->i_crypt_info;
155 struct crypto_ablkcipher *tfm = ci->ci_ctfm; 131 struct crypto_skcipher *tfm = ci->ci_ctfm;
156 int res = 0; 132 int res = 0;
157 char iv[F2FS_CRYPTO_BLOCK_SIZE]; 133 char iv[FS_CRYPTO_BLOCK_SIZE];
158 unsigned lim = max_name_len(inode); 134 unsigned lim;
159 135
136 lim = inode->i_sb->s_cop->max_namelen(inode);
160 if (iname->len <= 0 || iname->len > lim) 137 if (iname->len <= 0 || iname->len > lim)
161 return -EIO; 138 return -EIO;
162 139
163 /* Allocate request */ 140 /* Allocate request */
164 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 141 req = skcipher_request_alloc(tfm, GFP_NOFS);
165 if (!req) { 142 if (!req) {
166 printk_ratelimited(KERN_ERR 143 printk_ratelimited(KERN_ERR
167 "%s: crypto_request_alloc() failed\n", __func__); 144 "%s: crypto_request_alloc() failed\n", __func__);
168 return -ENOMEM; 145 return -ENOMEM;
169 } 146 }
170 ablkcipher_request_set_callback(req, 147 skcipher_request_set_callback(req,
171 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 148 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
172 f2fs_dir_crypt_complete, &ecr); 149 dir_crypt_complete, &ecr);
173 150
174 /* Initialize IV */ 151 /* Initialize IV */
175 memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); 152 memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
176 153
177 /* Create decryption request */ 154 /* Create decryption request */
178 sg_init_one(&src_sg, iname->name, iname->len); 155 sg_init_one(&src_sg, iname->name, iname->len);
179 sg_init_one(&dst_sg, oname->name, oname->len); 156 sg_init_one(&dst_sg, oname->name, oname->len);
180 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); 157 skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
181 res = crypto_ablkcipher_decrypt(req); 158 res = crypto_skcipher_decrypt(req);
182 if (res == -EINPROGRESS || res == -EBUSY) { 159 if (res == -EINPROGRESS || res == -EBUSY) {
183 BUG_ON(req->base.data != &ecr);
184 wait_for_completion(&ecr.completion); 160 wait_for_completion(&ecr.completion);
185 res = ecr.res; 161 res = ecr.res;
186 } 162 }
187 ablkcipher_request_free(req); 163 skcipher_request_free(req);
188 if (res < 0) { 164 if (res < 0) {
189 printk_ratelimited(KERN_ERR 165 printk_ratelimited(KERN_ERR
190 "%s: Error in f2fs_fname_decrypt (error code %d)\n", 166 "%s: Error (error code %d)\n", __func__, res);
191 __func__, res);
192 return res; 167 return res;
193 } 168 }
194 169
@@ -200,7 +175,7 @@ static const char *lookup_table =
200 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; 175 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
201 176
202/** 177/**
203 * f2fs_fname_encode_digest() - 178 * digest_encode() -
204 * 179 *
205 * Encodes the input digest using characters from the set [A-Za-z0-9+,]. 180 * Encodes the input digest using characters from the set [A-Za-z0-9+,].
206 * The encoded string is roughly 4/3 times the size of the input string. 181 * The encoded string is roughly 4/3 times the size of the input string.
@@ -249,148 +224,152 @@ static int digest_decode(const char *src, int len, char *dst)
249 return cp - dst; 224 return cp - dst;
250} 225}
251 226
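digest_encode()'s body is not shown in this hunk; the sketch below is a plausible reconstruction of a base64-style packer over the lookup_table shown above, emitting one table character per 6 input bits (hence the roughly 4/3 expansion). Only the alphabet and the expansion ratio come from the patch; the exact bit order is an assumption:

#include <stdio.h>

static const char *lookup_table =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";

static int digest_encode(const char *src, int len, char *dst)
{
	int i = 0, bits = 0, ac = 0;	/* ac accumulates input bits */
	char *cp = dst;

	while (i < len) {
		ac += ((unsigned char)src[i]) << bits;
		bits += 8;
		do {			/* drain 6 bits per character */
			*cp++ = lookup_table[ac & 0x3f];
			ac >>= 6;
			bits -= 6;
		} while (bits >= 6);
		i++;
	}
	if (bits)			/* flush the partial tail */
		*cp++ = lookup_table[ac & 0x3f];
	return cp - dst;
}

int main(void)
{
	char out[64];
	int n = digest_encode("\x01\x02\x03", 3, out);

	printf("%.*s\n", n, out);	/* 3 bytes -> 4 characters */
	return 0;
}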
252/** 227u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
253 * f2fs_fname_crypto_round_up() -
254 *
255 * Return: The next multiple of block size
256 */
257u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize)
258{ 228{
259 return ((size + blksize - 1) / blksize) * blksize; 229 int padding = 32;
230 struct fscrypt_info *ci = inode->i_crypt_info;
231
232 if (ci)
233 padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
234 if (ilen < FS_CRYPTO_BLOCK_SIZE)
235 ilen = FS_CRYPTO_BLOCK_SIZE;
236 return size_round_up(ilen, padding);
260} 237}
238EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
261 239
262/** 240/**
263 * f2fs_fname_crypto_alloc_obuff() - 241 * fscrypt_fname_alloc_buffer() -
264 * 242 *
265 * Allocates an output buffer that is sufficient for the crypto operation 243 * Allocates an output buffer that is sufficient for the crypto operation
266 * specified by the context and the direction. 244 * specified by the context and the direction.
267 */ 245 */
268int f2fs_fname_crypto_alloc_buffer(struct inode *inode, 246int fscrypt_fname_alloc_buffer(struct inode *inode,
269 u32 ilen, struct f2fs_str *crypto_str) 247 u32 ilen, struct fscrypt_str *crypto_str)
270{ 248{
271 unsigned int olen; 249 unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
272 int padding = 16;
273 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
274 250
275 if (ci)
276 padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
277 if (padding < F2FS_CRYPTO_BLOCK_SIZE)
278 padding = F2FS_CRYPTO_BLOCK_SIZE;
279 olen = f2fs_fname_crypto_round_up(ilen, padding);
280 crypto_str->len = olen; 251 crypto_str->len = olen;
281 if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2) 252 if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
282 olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2; 253 olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
283 /* Allocated buffer can hold one more character to null-terminate the 254 /*
284 * string */ 255 * Allocated buffer can hold one more character to null-terminate the
256 * string
257 */
285 crypto_str->name = kmalloc(olen + 1, GFP_NOFS); 258 crypto_str->name = kmalloc(olen + 1, GFP_NOFS);
286 if (!(crypto_str->name)) 259 if (!(crypto_str->name))
287 return -ENOMEM; 260 return -ENOMEM;
288 return 0; 261 return 0;
289} 262}
263EXPORT_SYMBOL(fscrypt_fname_alloc_buffer);
290 264
291/** 265/**
292 * f2fs_fname_crypto_free_buffer() - 266 * fscrypt_fname_free_buffer() -
293 * 267 *
294 * Frees the buffer allocated for crypto operation. 268 * Frees the buffer allocated for crypto operation.
295 */ 269 */
296void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str) 270void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
297{ 271{
298 if (!crypto_str) 272 if (!crypto_str)
299 return; 273 return;
300 kfree(crypto_str->name); 274 kfree(crypto_str->name);
301 crypto_str->name = NULL; 275 crypto_str->name = NULL;
302} 276}
277EXPORT_SYMBOL(fscrypt_fname_free_buffer);
303 278
304/** 279/**
305 * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space 280 * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user
281 * space
306 */ 282 */
307int f2fs_fname_disk_to_usr(struct inode *inode, 283int fscrypt_fname_disk_to_usr(struct inode *inode,
308 f2fs_hash_t *hash, 284 u32 hash, u32 minor_hash,
309 const struct f2fs_str *iname, 285 const struct fscrypt_str *iname,
310 struct f2fs_str *oname) 286 struct fscrypt_str *oname)
311{ 287{
312 const struct qstr qname = FSTR_TO_QSTR(iname); 288 const struct qstr qname = FSTR_TO_QSTR(iname);
313 char buf[24]; 289 char buf[24];
314 int ret; 290 int ret;
315 291
316 if (is_dot_dotdot(&qname)) { 292 if (fscrypt_is_dot_dotdot(&qname)) {
317 oname->name[0] = '.'; 293 oname->name[0] = '.';
318 oname->name[iname->len - 1] = '.'; 294 oname->name[iname->len - 1] = '.';
319 oname->len = iname->len; 295 oname->len = iname->len;
320 return oname->len; 296 return oname->len;
321 } 297 }
322 298
323 if (F2FS_I(inode)->i_crypt_info) 299 if (iname->len < FS_CRYPTO_BLOCK_SIZE)
324 return f2fs_fname_decrypt(inode, iname, oname); 300 return -EUCLEAN;
325 301
326 if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) { 302 if (inode->i_crypt_info)
303 return fname_decrypt(inode, iname, oname);
304
305 if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) {
327 ret = digest_encode(iname->name, iname->len, oname->name); 306 ret = digest_encode(iname->name, iname->len, oname->name);
328 oname->len = ret; 307 oname->len = ret;
329 return ret; 308 return ret;
330 } 309 }
331 if (hash) { 310 if (hash) {
332 memcpy(buf, hash, 4); 311 memcpy(buf, &hash, 4);
333 memset(buf + 4, 0, 4); 312 memcpy(buf + 4, &minor_hash, 4);
334 } else 313 } else {
335 memset(buf, 0, 8); 314 memset(buf, 0, 8);
315 }
336 memcpy(buf + 8, iname->name + iname->len - 16, 16); 316 memcpy(buf + 8, iname->name + iname->len - 16, 16);
337 oname->name[0] = '_'; 317 oname->name[0] = '_';
338 ret = digest_encode(buf, 24, oname->name + 1); 318 ret = digest_encode(buf, 24, oname->name + 1);
339 oname->len = ret + 1; 319 oname->len = ret + 1;
340 return ret + 1; 320 return ret + 1;
341} 321}
322EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
342 323
343/** 324/**
344 * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space 325 * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk
326 * space
345 */ 327 */
346int f2fs_fname_usr_to_disk(struct inode *inode, 328int fscrypt_fname_usr_to_disk(struct inode *inode,
347 const struct qstr *iname, 329 const struct qstr *iname,
348 struct f2fs_str *oname) 330 struct fscrypt_str *oname)
349{ 331{
350 int res; 332 if (fscrypt_is_dot_dotdot(iname)) {
351 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
352
353 if (is_dot_dotdot(iname)) {
354 oname->name[0] = '.'; 333 oname->name[0] = '.';
355 oname->name[iname->len - 1] = '.'; 334 oname->name[iname->len - 1] = '.';
356 oname->len = iname->len; 335 oname->len = iname->len;
357 return oname->len; 336 return oname->len;
358 } 337 }
359 338 if (inode->i_crypt_info)
360 if (ci) { 339 return fname_encrypt(inode, iname, oname);
361 res = f2fs_fname_encrypt(inode, iname, oname); 340 /*
362 return res; 341 * Without a proper key, a user is not allowed to modify the filenames
363 }
364 /* Without a proper key, a user is not allowed to modify the filenames
365 * in a directory. Consequently, a user space name cannot be mapped to 342 * in a directory. Consequently, a user space name cannot be mapped to
366 * a disk-space name */ 343 * a disk-space name
344 */
367 return -EACCES; 345 return -EACCES;
368} 346}
347EXPORT_SYMBOL(fscrypt_fname_usr_to_disk);
369 348
370int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, 349int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
371 int lookup, struct f2fs_filename *fname) 350 int lookup, struct fscrypt_name *fname)
372{ 351{
373 struct f2fs_crypt_info *ci;
374 int ret = 0, bigname = 0; 352 int ret = 0, bigname = 0;
375 353
376 memset(fname, 0, sizeof(struct f2fs_filename)); 354 memset(fname, 0, sizeof(struct fscrypt_name));
377 fname->usr_fname = iname; 355 fname->usr_fname = iname;
378 356
379 if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) { 357 if (!dir->i_sb->s_cop->is_encrypted(dir) ||
358 fscrypt_is_dot_dotdot(iname)) {
380 fname->disk_name.name = (unsigned char *)iname->name; 359 fname->disk_name.name = (unsigned char *)iname->name;
381 fname->disk_name.len = iname->len; 360 fname->disk_name.len = iname->len;
382 return 0; 361 return 0;
383 } 362 }
384 ret = f2fs_get_encryption_info(dir); 363 ret = get_crypt_info(dir);
385 if (ret) 364 if (ret && ret != -EOPNOTSUPP)
386 return ret; 365 return ret;
387 ci = F2FS_I(dir)->i_crypt_info; 366
388 if (ci) { 367 if (dir->i_crypt_info) {
389 ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len, 368 ret = fscrypt_fname_alloc_buffer(dir, iname->len,
390 &fname->crypto_buf); 369 &fname->crypto_buf);
391 if (ret < 0) 370 if (ret < 0)
392 return ret; 371 return ret;
393 ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf); 372 ret = fname_encrypt(dir, iname, &fname->crypto_buf);
394 if (ret < 0) 373 if (ret < 0)
395 goto errout; 374 goto errout;
396 fname->disk_name.name = fname->crypto_buf.name; 375 fname->disk_name.name = fname->crypto_buf.name;
@@ -400,18 +379,19 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
400 if (!lookup) 379 if (!lookup)
401 return -EACCES; 380 return -EACCES;
402 381
403 /* We don't have the key and we are doing a lookup; decode the 382 /*
383 * We don't have the key and we are doing a lookup; decode the
404 * user-supplied name 384 * user-supplied name
405 */ 385 */
406 if (iname->name[0] == '_') 386 if (iname->name[0] == '_')
407 bigname = 1; 387 bigname = 1;
408 if ((bigname && (iname->len != 33)) || 388 if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43)))
409 (!bigname && (iname->len > 43)))
410 return -ENOENT; 389 return -ENOENT;
411 390
412 fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); 391 fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
413 if (fname->crypto_buf.name == NULL) 392 if (fname->crypto_buf.name == NULL)
414 return -ENOMEM; 393 return -ENOMEM;
394
415 ret = digest_decode(iname->name + bigname, iname->len - bigname, 395 ret = digest_decode(iname->name + bigname, iname->len - bigname,
416 fname->crypto_buf.name); 396 fname->crypto_buf.name);
417 if (ret < 0) { 397 if (ret < 0) {
@@ -421,20 +401,24 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
421 fname->crypto_buf.len = ret; 401 fname->crypto_buf.len = ret;
422 if (bigname) { 402 if (bigname) {
423 memcpy(&fname->hash, fname->crypto_buf.name, 4); 403 memcpy(&fname->hash, fname->crypto_buf.name, 4);
404 memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4);
424 } else { 405 } else {
425 fname->disk_name.name = fname->crypto_buf.name; 406 fname->disk_name.name = fname->crypto_buf.name;
426 fname->disk_name.len = fname->crypto_buf.len; 407 fname->disk_name.len = fname->crypto_buf.len;
427 } 408 }
428 return 0; 409 return 0;
410
429errout: 411errout:
430 f2fs_fname_crypto_free_buffer(&fname->crypto_buf); 412 fscrypt_fname_free_buffer(&fname->crypto_buf);
431 return ret; 413 return ret;
432} 414}
415EXPORT_SYMBOL(fscrypt_setup_filename);
433 416
434void f2fs_fname_free_filename(struct f2fs_filename *fname) 417void fscrypt_free_filename(struct fscrypt_name *fname)
435{ 418{
436 kfree(fname->crypto_buf.name); 419 kfree(fname->crypto_buf.name);
437 fname->crypto_buf.name = NULL; 420 fname->crypto_buf.name = NULL;
438 fname->usr_fname = NULL; 421 fname->usr_fname = NULL;
439 fname->disk_name.name = NULL; 422 fname->disk_name.name = NULL;
440} 423}
424EXPORT_SYMBOL(fscrypt_free_filename);
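The 33/43 length checks in fscrypt_setup_filename() fall straight out of the encoding: a 6-bits-per-character scheme turns the 24-byte hash blob into 32 characters (33 with the leading '_'), and a digest of up to 32 bytes into at most 43 characters. A quick arithmetic check:

#include <stdio.h>

/* One output character per 6 input bits, rounded up. */
static int encoded_len(int bytes)
{
	return (bytes * 8 + 5) / 6;
}

int main(void)
{
	printf("24-byte hash blob -> %d chars (+1 for '_')\n",
	       encoded_len(24));			/* 32, total 33 */
	printf("32-byte digest    -> %d chars\n",
	       encoded_len(32));			/* 43 */
	return 0;
}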
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
new file mode 100644
index 000000000000..06f5aa478bf2
--- /dev/null
+++ b/fs/crypto/keyinfo.c
@@ -0,0 +1,272 @@
1/*
2 * key management facility for FS encryption support.
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains encryption key functions.
7 *
8 * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
9 */
10
11#include <keys/encrypted-type.h>
12#include <keys/user-type.h>
13#include <linux/random.h>
14#include <linux/scatterlist.h>
15#include <uapi/linux/keyctl.h>
16#include <linux/fscrypto.h>
17
18static void derive_crypt_complete(struct crypto_async_request *req, int rc)
19{
20 struct fscrypt_completion_result *ecr = req->data;
21
22 if (rc == -EINPROGRESS)
23 return;
24
25 ecr->res = rc;
26 complete(&ecr->completion);
27}
28
29/**
30 * derive_key_aes() - Derive a key using AES-128-ECB
31 * @deriving_key: Encryption key used for derivation.
32 * @source_key: Source key to which to apply derivation.
33 * @derived_key: Derived key.
34 *
35 * Return: Zero on success; non-zero otherwise.
36 */
37static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
38 u8 source_key[FS_AES_256_XTS_KEY_SIZE],
39 u8 derived_key[FS_AES_256_XTS_KEY_SIZE])
40{
41 int res = 0;
42 struct skcipher_request *req = NULL;
43 DECLARE_FS_COMPLETION_RESULT(ecr);
44 struct scatterlist src_sg, dst_sg;
45 struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
46
47 if (IS_ERR(tfm)) {
48 res = PTR_ERR(tfm);
49 tfm = NULL;
50 goto out;
51 }
52 crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
53 req = skcipher_request_alloc(tfm, GFP_NOFS);
54 if (!req) {
55 res = -ENOMEM;
56 goto out;
57 }
58 skcipher_request_set_callback(req,
59 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
60 derive_crypt_complete, &ecr);
61 res = crypto_skcipher_setkey(tfm, deriving_key,
62 FS_AES_128_ECB_KEY_SIZE);
63 if (res < 0)
64 goto out;
65
66 sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE);
67 sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE);
68 skcipher_request_set_crypt(req, &src_sg, &dst_sg,
69 FS_AES_256_XTS_KEY_SIZE, NULL);
70 res = crypto_skcipher_encrypt(req);
71 if (res == -EINPROGRESS || res == -EBUSY) {
72 wait_for_completion(&ecr.completion);
73 res = ecr.res;
74 }
75out:
76 skcipher_request_free(req);
77 crypto_free_skcipher(tfm);
78 return res;
79}
80
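derive_key_aes() produces the per-inode key by encrypting the 512-bit master key under AES-128-ECB, keyed with the inode's 16-byte nonce. The same derivation restated in userspace with OpenSSL's EVP interface (OpenSSL is an assumption of this sketch; build with -lcrypto):

#include <openssl/evp.h>
#include <stdio.h>

#define NONCE_SIZE 16	/* FS_AES_128_ECB_KEY_SIZE == nonce size */
#define XTS_KEY_SIZE 64	/* FS_AES_256_XTS_KEY_SIZE */

static int derive_key_aes(const unsigned char nonce[NONCE_SIZE],
			  const unsigned char master[XTS_KEY_SIZE],
			  unsigned char derived[XTS_KEY_SIZE])
{
	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
	int outl = 0, tmpl = 0, ok = 0;

	if (!ctx)
		return -1;
	/* ECB keyed with the nonce; padding off, input is a multiple
	 * of the AES block size. */
	if (EVP_EncryptInit_ex(ctx, EVP_aes_128_ecb(), NULL, nonce, NULL) &&
	    EVP_CIPHER_CTX_set_padding(ctx, 0) &&
	    EVP_EncryptUpdate(ctx, derived, &outl, master, XTS_KEY_SIZE) &&
	    EVP_EncryptFinal_ex(ctx, derived + outl, &tmpl))
		ok = 1;
	EVP_CIPHER_CTX_free(ctx);
	return ok ? 0 : -1;
}

int main(void)
{
	unsigned char nonce[NONCE_SIZE] = { 0 };
	unsigned char master[XTS_KEY_SIZE] = { 0 };
	unsigned char derived[XTS_KEY_SIZE];

	if (derive_key_aes(nonce, master, derived))
		return 1;
	for (int i = 0; i < 8; i++)
		printf("%02x", derived[i]);
	printf("...\n");
	return 0;
}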
81static void put_crypt_info(struct fscrypt_info *ci)
82{
83 if (!ci)
84 return;
85
86 key_put(ci->ci_keyring_key);
87 crypto_free_skcipher(ci->ci_ctfm);
88 kmem_cache_free(fscrypt_info_cachep, ci);
89}
90
91int get_crypt_info(struct inode *inode)
92{
93 struct fscrypt_info *crypt_info;
94 u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
95 (FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
96 struct key *keyring_key = NULL;
97 struct fscrypt_key *master_key;
98 struct fscrypt_context ctx;
99 const struct user_key_payload *ukp;
100 struct crypto_skcipher *ctfm;
101 const char *cipher_str;
102 u8 raw_key[FS_MAX_KEY_SIZE];
103 u8 mode;
104 int res;
105
106 res = fscrypt_initialize();
107 if (res)
108 return res;
109
110 if (!inode->i_sb->s_cop->get_context)
111 return -EOPNOTSUPP;
112retry:
113 crypt_info = ACCESS_ONCE(inode->i_crypt_info);
114 if (crypt_info) {
115 if (!crypt_info->ci_keyring_key ||
116 key_validate(crypt_info->ci_keyring_key) == 0)
117 return 0;
118 fscrypt_put_encryption_info(inode, crypt_info);
119 goto retry;
120 }
121
122 res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
123 if (res < 0) {
124 if (!fscrypt_dummy_context_enabled(inode))
125 return res;
126 ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
127 ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
128 ctx.flags = 0;
129 } else if (res != sizeof(ctx)) {
130 return -EINVAL;
131 }
132 res = 0;
133
134 crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
135 if (!crypt_info)
136 return -ENOMEM;
137
138 crypt_info->ci_flags = ctx.flags;
139 crypt_info->ci_data_mode = ctx.contents_encryption_mode;
140 crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
141 crypt_info->ci_ctfm = NULL;
142 crypt_info->ci_keyring_key = NULL;
143 memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
144 sizeof(crypt_info->ci_master_key));
145 if (S_ISREG(inode->i_mode))
146 mode = crypt_info->ci_data_mode;
147 else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
148 mode = crypt_info->ci_filename_mode;
149 else
150 BUG();
151
152 switch (mode) {
153 case FS_ENCRYPTION_MODE_AES_256_XTS:
154 cipher_str = "xts(aes)";
155 break;
156 case FS_ENCRYPTION_MODE_AES_256_CTS:
157 cipher_str = "cts(cbc(aes))";
158 break;
159 default:
160 printk_once(KERN_WARNING
161 "%s: unsupported key mode %d (ino %u)\n",
162 __func__, mode, (unsigned) inode->i_ino);
163 res = -ENOKEY;
164 goto out;
165 }
166 if (fscrypt_dummy_context_enabled(inode)) {
167 memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
168 goto got_key;
169 }
170 memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX,
171 FS_KEY_DESC_PREFIX_SIZE);
172 sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE,
173 "%*phN", FS_KEY_DESCRIPTOR_SIZE,
174 ctx.master_key_descriptor);
175 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
176 (2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0';
177 keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
178 if (IS_ERR(keyring_key)) {
179 res = PTR_ERR(keyring_key);
180 keyring_key = NULL;
181 goto out;
182 }
183 crypt_info->ci_keyring_key = keyring_key;
184 if (keyring_key->type != &key_type_logon) {
185 printk_once(KERN_WARNING
186 "%s: key type must be logon\n", __func__);
187 res = -ENOKEY;
188 goto out;
189 }
190 down_read(&keyring_key->sem);
191 ukp = user_key_payload(keyring_key);
192 if (ukp->datalen != sizeof(struct fscrypt_key)) {
193 res = -EINVAL;
194 up_read(&keyring_key->sem);
195 goto out;
196 }
197 master_key = (struct fscrypt_key *)ukp->data;
198 BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
199
200 if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
201 printk_once(KERN_WARNING
202 "%s: key size incorrect: %d\n",
203 __func__, master_key->size);
204 res = -ENOKEY;
205 up_read(&keyring_key->sem);
206 goto out;
207 }
208 res = derive_key_aes(ctx.nonce, master_key->raw, raw_key);
209 up_read(&keyring_key->sem);
210 if (res)
211 goto out;
212got_key:
213 ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
214 if (!ctfm || IS_ERR(ctfm)) {
215 res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
216 printk(KERN_DEBUG
217 "%s: error %d (inode %u) allocating crypto tfm\n",
218 __func__, res, (unsigned) inode->i_ino);
219 goto out;
220 }
221 crypt_info->ci_ctfm = ctfm;
222 crypto_skcipher_clear_flags(ctfm, ~0);
223 crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
224 res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode));
225 if (res)
226 goto out;
227
228 memzero_explicit(raw_key, sizeof(raw_key));
229 if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) {
230 put_crypt_info(crypt_info);
231 goto retry;
232 }
233 return 0;
234
235out:
236 if (res == -ENOKEY)
237 res = 0;
238 put_crypt_info(crypt_info);
239 memzero_explicit(raw_key, sizeof(raw_key));
240 return res;
241}
242
243void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
244{
245 struct fscrypt_info *prev;
246
247 if (ci == NULL)
248 ci = ACCESS_ONCE(inode->i_crypt_info);
249 if (ci == NULL)
250 return;
251
252 prev = cmpxchg(&inode->i_crypt_info, ci, NULL);
253 if (prev != ci)
254 return;
255
256 put_crypt_info(ci);
257}
258EXPORT_SYMBOL(fscrypt_put_encryption_info);
259
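i_crypt_info is published and torn down locklessly with cmpxchg(): the loser of a setup race frees its duplicate, and only the thread that swings the pointer to NULL frees the object. The same protocol sketched with C11 atomics:

#include <stdatomic.h>
#include <stdlib.h>

struct crypt_info { int dummy; };

static _Atomic(struct crypt_info *) i_crypt_info;

static int publish(struct crypt_info *ci)
{
	struct crypt_info *expected = NULL;

	/* Mirrors cmpxchg(&inode->i_crypt_info, NULL, crypt_info). */
	if (!atomic_compare_exchange_strong(&i_crypt_info, &expected, ci)) {
		free(ci);	/* somebody else won the race */
		return 0;
	}
	return 1;
}

static void put(void)
{
	struct crypt_info *ci = atomic_load(&i_crypt_info);

	/* Only the thread whose swap to NULL succeeds may free. */
	if (ci && atomic_compare_exchange_strong(&i_crypt_info, &ci, NULL))
		free(ci);
}

int main(void)
{
	struct crypt_info *ci = calloc(1, sizeof(*ci));

	if (ci)
		publish(ci);	/* frees ci itself if it loses */
	put();
	return 0;
}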
260int fscrypt_get_encryption_info(struct inode *inode)
261{
262 struct fscrypt_info *ci = inode->i_crypt_info;
263
264 if (!ci ||
265 (ci->ci_keyring_key &&
266 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
267 (1 << KEY_FLAG_REVOKED) |
268 (1 << KEY_FLAG_DEAD)))))
269 return get_crypt_info(inode);
270 return 0;
271}
272EXPORT_SYMBOL(fscrypt_get_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
new file mode 100644
index 000000000000..0f9961eede1e
--- /dev/null
+++ b/fs/crypto/policy.c
@@ -0,0 +1,229 @@
1/*
2 * Encryption policy functions for per-file encryption support.
3 *
4 * Copyright (C) 2015, Google, Inc.
5 * Copyright (C) 2015, Motorola Mobility.
6 *
7 * Written by Michael Halcrow, 2015.
8 * Modified by Jaegeuk Kim, 2015.
9 */
10
11#include <linux/random.h>
12#include <linux/string.h>
13#include <linux/fscrypto.h>
14
15static int inode_has_encryption_context(struct inode *inode)
16{
17 if (!inode->i_sb->s_cop->get_context)
18 return 0;
19 return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0);
20}
21
22/*
23 * check whether the policy is consistent with the encryption context
24 * for the inode
25 */
26static int is_encryption_context_consistent_with_policy(struct inode *inode,
27 const struct fscrypt_policy *policy)
28{
29 struct fscrypt_context ctx;
30 int res;
31
32 if (!inode->i_sb->s_cop->get_context)
33 return 0;
34
35 res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
36 if (res != sizeof(ctx))
37 return 0;
38
39 return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
40 FS_KEY_DESCRIPTOR_SIZE) == 0 &&
41 (ctx.flags == policy->flags) &&
42 (ctx.contents_encryption_mode ==
43 policy->contents_encryption_mode) &&
44 (ctx.filenames_encryption_mode ==
45 policy->filenames_encryption_mode));
46}
47
48static int create_encryption_context_from_policy(struct inode *inode,
49 const struct fscrypt_policy *policy)
50{
51 struct fscrypt_context ctx;
52 int res;
53
54 if (!inode->i_sb->s_cop->set_context)
55 return -EOPNOTSUPP;
56
57 if (inode->i_sb->s_cop->prepare_context) {
58 res = inode->i_sb->s_cop->prepare_context(inode);
59 if (res)
60 return res;
61 }
62
63 ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
64 memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
65 FS_KEY_DESCRIPTOR_SIZE);
66
67 if (!fscrypt_valid_contents_enc_mode(
68 policy->contents_encryption_mode)) {
69 printk(KERN_WARNING
70 "%s: Invalid contents encryption mode %d\n", __func__,
71 policy->contents_encryption_mode);
72 return -EINVAL;
73 }
74
75 if (!fscrypt_valid_filenames_enc_mode(
76 policy->filenames_encryption_mode)) {
77 printk(KERN_WARNING
78 "%s: Invalid filenames encryption mode %d\n", __func__,
79 policy->filenames_encryption_mode);
80 return -EINVAL;
81 }
82
83 if (policy->flags & ~FS_POLICY_FLAGS_VALID)
84 return -EINVAL;
85
86 ctx.contents_encryption_mode = policy->contents_encryption_mode;
87 ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
88 ctx.flags = policy->flags;
89 BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE);
90 get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
91
92 return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
93}
94
95int fscrypt_process_policy(struct inode *inode,
96 const struct fscrypt_policy *policy)
97{
98 if (policy->version != 0)
99 return -EINVAL;
100
101 if (!inode_has_encryption_context(inode)) {
102 if (!inode->i_sb->s_cop->empty_dir)
103 return -EOPNOTSUPP;
104 if (!inode->i_sb->s_cop->empty_dir(inode))
105 return -ENOTEMPTY;
106 return create_encryption_context_from_policy(inode, policy);
107 }
108
109 if (is_encryption_context_consistent_with_policy(inode, policy))
110 return 0;
111
112 printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
113 __func__);
114 return -EINVAL;
115}
116EXPORT_SYMBOL(fscrypt_process_policy);
117
118int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
119{
120 struct fscrypt_context ctx;
121 int res;
122
123 if (!inode->i_sb->s_cop->get_context ||
124 !inode->i_sb->s_cop->is_encrypted(inode))
125 return -ENODATA;
126
127 res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
128 if (res != sizeof(ctx))
129 return -ENODATA;
130 if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
131 return -EINVAL;
132
133 policy->version = 0;
134 policy->contents_encryption_mode = ctx.contents_encryption_mode;
135 policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
136 policy->flags = ctx.flags;
137 memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
138 FS_KEY_DESCRIPTOR_SIZE);
139 return 0;
140}
141EXPORT_SYMBOL(fscrypt_get_policy);
142
143int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
144{
145 struct fscrypt_info *parent_ci, *child_ci;
146 int res;
147
148 if ((parent == NULL) || (child == NULL)) {
149 printk(KERN_ERR "parent %p child %p\n", parent, child);
150 BUG_ON(1);
151 }
152
153 /* no restrictions if the parent directory is not encrypted */
154 if (!parent->i_sb->s_cop->is_encrypted(parent))
155 return 1;
156 /* if the child directory is not encrypted, this is always a problem */
157 if (!parent->i_sb->s_cop->is_encrypted(child))
158 return 0;
159 res = fscrypt_get_encryption_info(parent);
160 if (res)
161 return 0;
162 res = fscrypt_get_encryption_info(child);
163 if (res)
164 return 0;
165 parent_ci = parent->i_crypt_info;
166 child_ci = child->i_crypt_info;
167 if (!parent_ci && !child_ci)
168 return 1;
169 if (!parent_ci || !child_ci)
170 return 0;
171
172 return (memcmp(parent_ci->ci_master_key,
173 child_ci->ci_master_key,
174 FS_KEY_DESCRIPTOR_SIZE) == 0 &&
175 (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
176 (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
177 (parent_ci->ci_flags == child_ci->ci_flags));
178}
179EXPORT_SYMBOL(fscrypt_has_permitted_context);
180
181/**
182 * fscrypt_inherit_context() - Sets a child context from its parent
183 * @parent: Parent inode from which the context is inherited.
184 * @child: Child inode that inherits the context from @parent.
185 * @fs_data: private data given by FS.
186 * @preload: preload child i_crypt_info
187 *
188 * Return: Zero on success, non-zero otherwise
189 */
190int fscrypt_inherit_context(struct inode *parent, struct inode *child,
191 void *fs_data, bool preload)
192{
193 struct fscrypt_context ctx;
194 struct fscrypt_info *ci;
195 int res;
196
197 if (!parent->i_sb->s_cop->set_context)
198 return -EOPNOTSUPP;
199
200 res = fscrypt_get_encryption_info(parent);
201 if (res < 0)
202 return res;
203
204 ci = parent->i_crypt_info;
205 if (ci == NULL)
206 return -ENOKEY;
207
208 ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
209 if (fscrypt_dummy_context_enabled(parent)) {
210 ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
211 ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
212 ctx.flags = 0;
213 memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
214 res = 0;
215 } else {
216 ctx.contents_encryption_mode = ci->ci_data_mode;
217 ctx.filenames_encryption_mode = ci->ci_filename_mode;
218 ctx.flags = ci->ci_flags;
219 memcpy(ctx.master_key_descriptor, ci->ci_master_key,
220 FS_KEY_DESCRIPTOR_SIZE);
221 }
222 get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
223 res = parent->i_sb->s_cop->set_context(child, &ctx,
224 sizeof(ctx), fs_data);
225 if (res)
226 return res;
227 return preload ? fscrypt_get_encryption_info(child): 0;
228}
229EXPORT_SYMBOL(fscrypt_inherit_context);
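Filesystems wire fscrypt_process_policy()/fscrypt_get_policy() up behind per-filesystem ioctls. A hedged userspace sketch of driving them follows; it assumes the FS_IOC_SET_ENCRYPTION_POLICY ioctl and struct fscrypt_policy definitions from <linux/fs.h> (on trees of this vintage the ext4-prefixed equivalents may be what is actually exported), and the mode values 1 (AES-256-XTS) and 4 (AES-256-CTS) used elsewhere in this series:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* assumed: FS_IOC_SET_ENCRYPTION_POLICY, struct fscrypt_policy */

/* Apply a v0 policy to an (empty) directory. The kernel side is
 * fscrypt_process_policy() above, so expect -ENOTEMPTY for a
 * non-empty unencrypted directory and -EINVAL on a mismatch. */
int set_dir_policy(const char *dir, const unsigned char key_desc[8])
{
	struct fscrypt_policy p;
	int fd, rc;

	fd = open(dir, O_RDONLY | O_DIRECTORY);
	if (fd < 0)
		return -1;
	memset(&p, 0, sizeof(p));
	p.version = 0;				/* the only accepted version */
	p.contents_encryption_mode = 1;		/* assumed: AES-256-XTS */
	p.filenames_encryption_mode = 4;	/* assumed: AES-256-CTS */
	p.flags = 0;
	memcpy(p.master_key_descriptor, key_desc, 8);
	rc = ioctl(fd, FS_IOC_SET_ENCRYPTION_POLICY, &p);
	if (rc)
		perror("FS_IOC_SET_ENCRYPTION_POLICY");
	close(fd);
	return rc;
}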
diff --git a/fs/dax.c b/fs/dax.c
index bbb2ad783770..90322eb7498c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -286,8 +286,13 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
286 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) 286 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
287 inode_unlock(inode); 287 inode_unlock(inode);
288 288
289 if ((retval > 0) && end_io) 289 if (end_io) {
290 end_io(iocb, pos, retval, bh.b_private); 290 int err;
291
292 err = end_io(iocb, pos, retval, bh.b_private);
293 if (err)
294 retval = err;
295 }
291 296
292 if (!(flags & DIO_SKIP_DIO_COUNT)) 297 if (!(flags & DIO_SKIP_DIO_COUNT))
293 inode_dio_end(inode); 298 inode_dio_end(inode);
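The contract change in this hunk: ->end_io now returns an int, is invoked even for short or failed I/O, and may override the result. A hedged sketch of what a conforming callback looks like; the foo_* names are hypothetical, not from this series:

static int foo_convert_unwritten(struct inode *inode, loff_t offset,
				 ssize_t size);	/* hypothetical helper */

static int foo_dio_end_io(struct kiocb *iocb, loff_t offset,
			  ssize_t size, void *private)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (size <= 0)
		return 0;	/* I/O failed or was short; nothing to convert */

	/* e.g. convert unwritten extents covering [offset, offset + size) */
	return foo_convert_unwritten(inode, offset, size);
}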
diff --git a/fs/dcache.c b/fs/dcache.c
index 2398f9f94337..32ceae3e6112 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1745,13 +1745,12 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1745 unsigned add_flags = d_flags_for_inode(inode); 1745 unsigned add_flags = d_flags_for_inode(inode);
1746 1746
1747 spin_lock(&dentry->d_lock); 1747 spin_lock(&dentry->d_lock);
1748 if (inode) 1748 hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
1749 hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
1750 raw_write_seqcount_begin(&dentry->d_seq); 1749 raw_write_seqcount_begin(&dentry->d_seq);
1751 __d_set_inode_and_type(dentry, inode, add_flags); 1750 __d_set_inode_and_type(dentry, inode, add_flags);
1752 raw_write_seqcount_end(&dentry->d_seq); 1751 raw_write_seqcount_end(&dentry->d_seq);
1752 __fsnotify_d_instantiate(dentry);
1753 spin_unlock(&dentry->d_lock); 1753 spin_unlock(&dentry->d_lock);
1754 fsnotify_d_instantiate(dentry, inode);
1755} 1754}
1756 1755
1757/** 1756/**
@@ -1772,91 +1771,16 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1772void d_instantiate(struct dentry *entry, struct inode * inode) 1771void d_instantiate(struct dentry *entry, struct inode * inode)
1773{ 1772{
1774 BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); 1773 BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
1775 if (inode) 1774 if (inode) {
1776 spin_lock(&inode->i_lock); 1775 spin_lock(&inode->i_lock);
1777 __d_instantiate(entry, inode); 1776 __d_instantiate(entry, inode);
1778 if (inode)
1779 spin_unlock(&inode->i_lock); 1777 spin_unlock(&inode->i_lock);
1778 }
1780 security_d_instantiate(entry, inode); 1779 security_d_instantiate(entry, inode);
1781} 1780}
1782EXPORT_SYMBOL(d_instantiate); 1781EXPORT_SYMBOL(d_instantiate);
1783 1782
1784/** 1783/**
1785 * d_instantiate_unique - instantiate a non-aliased dentry
1786 * @entry: dentry to instantiate
1787 * @inode: inode to attach to this dentry
1788 *
1789 * Fill in inode information in the entry. On success, it returns NULL.
1790 * If an unhashed alias of "entry" already exists, then we return the
1791 * aliased dentry instead and drop one reference to inode.
1792 *
1793 * Note that in order to avoid conflicts with rename() etc, the caller
1794 * had better be holding the parent directory semaphore.
1795 *
1796 * This also assumes that the inode count has been incremented
1797 * (or otherwise set) by the caller to indicate that it is now
1798 * in use by the dcache.
1799 */
1800static struct dentry *__d_instantiate_unique(struct dentry *entry,
1801 struct inode *inode)
1802{
1803 struct dentry *alias;
1804 int len = entry->d_name.len;
1805 const char *name = entry->d_name.name;
1806 unsigned int hash = entry->d_name.hash;
1807
1808 if (!inode) {
1809 __d_instantiate(entry, NULL);
1810 return NULL;
1811 }
1812
1813 hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
1814 /*
1815 * Don't need alias->d_lock here, because aliases with
1816 * d_parent == entry->d_parent are not subject to name or
1817 * parent changes, because the parent inode i_mutex is held.
1818 */
1819 if (alias->d_name.hash != hash)
1820 continue;
1821 if (alias->d_parent != entry->d_parent)
1822 continue;
1823 if (alias->d_name.len != len)
1824 continue;
1825 if (dentry_cmp(alias, name, len))
1826 continue;
1827 __dget(alias);
1828 return alias;
1829 }
1830
1831 __d_instantiate(entry, inode);
1832 return NULL;
1833}
1834
1835struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
1836{
1837 struct dentry *result;
1838
1839 BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
1840
1841 if (inode)
1842 spin_lock(&inode->i_lock);
1843 result = __d_instantiate_unique(entry, inode);
1844 if (inode)
1845 spin_unlock(&inode->i_lock);
1846
1847 if (!result) {
1848 security_d_instantiate(entry, inode);
1849 return NULL;
1850 }
1851
1852 BUG_ON(!d_unhashed(result));
1853 iput(inode);
1854 return result;
1855}
1856
1857EXPORT_SYMBOL(d_instantiate_unique);
1858
1859/**
1860 * d_instantiate_no_diralias - instantiate a non-aliased dentry 1784 * d_instantiate_no_diralias - instantiate a non-aliased dentry
1861 * @entry: dentry to complete 1785 * @entry: dentry to complete
1862 * @inode: inode to attach to this dentry 1786 * @inode: inode to attach to this dentry
@@ -2436,6 +2360,86 @@ void d_rehash(struct dentry * entry)
2436} 2360}
2437EXPORT_SYMBOL(d_rehash); 2361EXPORT_SYMBOL(d_rehash);
2438 2362
2363
2364/* inode->i_lock held if inode is non-NULL */
2365
2366static inline void __d_add(struct dentry *dentry, struct inode *inode)
2367{
2368 if (inode) {
2369 __d_instantiate(dentry, inode);
2370 spin_unlock(&inode->i_lock);
2371 }
2372 security_d_instantiate(dentry, inode);
2373 d_rehash(dentry);
2374}
2375
2376/**
2377 * d_add - add dentry to hash queues
2378 * @entry: dentry to add
2379 * @inode: The inode to attach to this dentry
2380 *
2381 * This adds the entry to the hash queues and initializes @inode.
2382 * The entry was actually filled in earlier during d_alloc().
2383 */
2384
2385void d_add(struct dentry *entry, struct inode *inode)
2386{
2387 if (inode)
2388 spin_lock(&inode->i_lock);
2389 __d_add(entry, inode);
2390}
2391EXPORT_SYMBOL(d_add);
2392
2393/**
2394 * d_exact_alias - find and hash an exact unhashed alias
2395 * @entry: dentry to add
2396 * @inode: The inode to go with this dentry
2397 *
2398 * If an unhashed dentry with the same name/parent and desired
2399 * inode already exists, hash and return it. Otherwise, return
2400 * NULL.
2401 *
2402 * Parent directory should be locked.
2403 */
2404struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
2405{
2406 struct dentry *alias;
2407 int len = entry->d_name.len;
2408 const char *name = entry->d_name.name;
2409 unsigned int hash = entry->d_name.hash;
2410
2411 spin_lock(&inode->i_lock);
2412 hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
2413 /*
2414 * Don't need alias->d_lock here, because aliases with
2415 * d_parent == entry->d_parent are not subject to name or
2416 * parent changes, because the parent inode i_mutex is held.
2417 */
2418 if (alias->d_name.hash != hash)
2419 continue;
2420 if (alias->d_parent != entry->d_parent)
2421 continue;
2422 if (alias->d_name.len != len)
2423 continue;
2424 if (dentry_cmp(alias, name, len))
2425 continue;
2426 spin_lock(&alias->d_lock);
2427 if (!d_unhashed(alias)) {
2428 spin_unlock(&alias->d_lock);
2429 alias = NULL;
2430 } else {
2431 __dget_dlock(alias);
2432 _d_rehash(alias);
2433 spin_unlock(&alias->d_lock);
2434 }
2435 spin_unlock(&inode->i_lock);
2436 return alias;
2437 }
2438 spin_unlock(&inode->i_lock);
2439 return NULL;
2440}
2441EXPORT_SYMBOL(d_exact_alias);
2442
2439/** 2443/**
2440 * dentry_update_name_case - update case insensitive dentry with a new name 2444 * dentry_update_name_case - update case insensitive dentry with a new name
2441 * @dentry: dentry to be updated 2445 * @dentry: dentry to be updated
@@ -2772,10 +2776,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
2772 2776
2773 BUG_ON(!d_unhashed(dentry)); 2777 BUG_ON(!d_unhashed(dentry));
2774 2778
2775 if (!inode) { 2779 if (!inode)
2776 __d_instantiate(dentry, NULL);
2777 goto out; 2780 goto out;
2778 } 2781
2779 spin_lock(&inode->i_lock); 2782 spin_lock(&inode->i_lock);
2780 if (S_ISDIR(inode->i_mode)) { 2783 if (S_ISDIR(inode->i_mode)) {
2781 struct dentry *new = __d_find_any_alias(inode); 2784 struct dentry *new = __d_find_any_alias(inode);
@@ -2809,12 +2812,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
2809 return new; 2812 return new;
2810 } 2813 }
2811 } 2814 }
2812 /* already taking inode->i_lock, so d_add() by hand */
2813 __d_instantiate(dentry, inode);
2814 spin_unlock(&inode->i_lock);
2815out: 2815out:
2816 security_d_instantiate(dentry, inode); 2816 __d_add(dentry, inode);
2817 d_rehash(dentry);
2818 return NULL; 2817 return NULL;
2819} 2818}
2820EXPORT_SYMBOL(d_splice_alias); 2819EXPORT_SYMBOL(d_splice_alias);
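These dcache changes converge the instantiate paths on __d_add(). The typical consumer is a filesystem ->lookup handing d_splice_alias() a possibly-NULL inode; a minimal sketch (foo_iget() is hypothetical):

static struct inode *foo_iget(struct inode *dir, const struct qstr *name);

static struct dentry *foo_lookup(struct inode *dir, struct dentry *dentry,
				 unsigned int flags)
{
	struct inode *inode = foo_iget(dir, &dentry->d_name);

	if (IS_ERR(inode))
		return ERR_CAST(inode);
	/* NULL => negative dentry; d_splice_alias() copes with both and
	 * now funnels through __d_add() above. */
	return d_splice_alias(inode, dentry);
}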
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d6a9012d42ad..476f1ecbd1f0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -253,8 +253,13 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
253 if (ret == 0) 253 if (ret == 0)
254 ret = transferred; 254 ret = transferred;
255 255
256 if (dio->end_io && dio->result) 256 if (dio->end_io) {
257 dio->end_io(dio->iocb, offset, transferred, dio->private); 257 int err;
258
259 err = dio->end_io(dio->iocb, offset, ret, dio->private);
260 if (err)
261 ret = err;
262 }
258 263
259 if (!(dio->flags & DIO_SKIP_DIO_COUNT)) 264 if (!(dio->flags & DIO_SKIP_DIO_COUNT))
260 inode_dio_end(dio->inode); 265 inode_dio_end(dio->inode);
@@ -445,7 +450,8 @@ static struct bio *dio_await_one(struct dio *dio)
445 __set_current_state(TASK_UNINTERRUPTIBLE); 450 __set_current_state(TASK_UNINTERRUPTIBLE);
446 dio->waiter = current; 451 dio->waiter = current;
447 spin_unlock_irqrestore(&dio->bio_lock, flags); 452 spin_unlock_irqrestore(&dio->bio_lock, flags);
448 if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) 453 if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
454 !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
449 io_schedule(); 455 io_schedule();
450 /* wake up sets us TASK_RUNNING */ 456 /* wake up sets us TASK_RUNNING */
451 spin_lock_irqsave(&dio->bio_lock, flags); 457 spin_lock_irqsave(&dio->bio_lock, flags);
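As in the DAX hunk earlier, ->end_io is now called whenever it is set, receives the overall result rather than just the transferred byte count, and can replace it with an error; the second hunk additionally restricts blk_poll() to iocbs that asked for polling via IOCB_HIPRI, letting everyone else sleep in io_schedule(). The assumed shape of the completion hook:

/* assumed typedef shape; a non-zero return overrides the I/O result */
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
			   ssize_t bytes, void *private);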
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 8e294fbbac39..1669f6291c95 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -343,24 +343,20 @@ static struct config_group *make_cluster(struct config_group *g,
343 struct dlm_cluster *cl = NULL; 343 struct dlm_cluster *cl = NULL;
344 struct dlm_spaces *sps = NULL; 344 struct dlm_spaces *sps = NULL;
345 struct dlm_comms *cms = NULL; 345 struct dlm_comms *cms = NULL;
346 void *gps = NULL;
347 346
348 cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); 347 cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
349 gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
350 sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); 348 sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
351 cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); 349 cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
352 350
353 if (!cl || !gps || !sps || !cms) 351 if (!cl || !sps || !cms)
354 goto fail; 352 goto fail;
355 353
356 config_group_init_type_name(&cl->group, name, &cluster_type); 354 config_group_init_type_name(&cl->group, name, &cluster_type);
357 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); 355 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
358 config_group_init_type_name(&cms->cs_group, "comms", &comms_type); 356 config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
359 357
360 cl->group.default_groups = gps; 358 configfs_add_default_group(&sps->ss_group, &cl->group);
361 cl->group.default_groups[0] = &sps->ss_group; 359 configfs_add_default_group(&cms->cs_group, &cl->group);
362 cl->group.default_groups[1] = &cms->cs_group;
363 cl->group.default_groups[2] = NULL;
364 360
365 cl->cl_tcp_port = dlm_config.ci_tcp_port; 361 cl->cl_tcp_port = dlm_config.ci_tcp_port;
366 cl->cl_buffer_size = dlm_config.ci_buffer_size; 362 cl->cl_buffer_size = dlm_config.ci_buffer_size;
@@ -383,7 +379,6 @@ static struct config_group *make_cluster(struct config_group *g,
383 379
384 fail: 380 fail:
385 kfree(cl); 381 kfree(cl);
386 kfree(gps);
387 kfree(sps); 382 kfree(sps);
388 kfree(cms); 383 kfree(cms);
389 return ERR_PTR(-ENOMEM); 384 return ERR_PTR(-ENOMEM);
@@ -392,14 +387,8 @@ static struct config_group *make_cluster(struct config_group *g,
392static void drop_cluster(struct config_group *g, struct config_item *i) 387static void drop_cluster(struct config_group *g, struct config_item *i)
393{ 388{
394 struct dlm_cluster *cl = config_item_to_cluster(i); 389 struct dlm_cluster *cl = config_item_to_cluster(i);
395 struct config_item *tmp;
396 int j;
397 390
398 for (j = 0; cl->group.default_groups[j]; j++) { 391 configfs_remove_default_groups(&cl->group);
399 tmp = &cl->group.default_groups[j]->cg_item;
400 cl->group.default_groups[j] = NULL;
401 config_item_put(tmp);
402 }
403 392
404 space_list = NULL; 393 space_list = NULL;
405 comm_list = NULL; 394 comm_list = NULL;
@@ -410,7 +399,6 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
410static void release_cluster(struct config_item *i) 399static void release_cluster(struct config_item *i)
411{ 400{
412 struct dlm_cluster *cl = config_item_to_cluster(i); 401 struct dlm_cluster *cl = config_item_to_cluster(i);
413 kfree(cl->group.default_groups);
414 kfree(cl); 402 kfree(cl);
415} 403}
416 404
@@ -418,21 +406,17 @@ static struct config_group *make_space(struct config_group *g, const char *name)
418{ 406{
419 struct dlm_space *sp = NULL; 407 struct dlm_space *sp = NULL;
420 struct dlm_nodes *nds = NULL; 408 struct dlm_nodes *nds = NULL;
421 void *gps = NULL;
422 409
423 sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); 410 sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
424 gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
425 nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); 411 nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
426 412
427 if (!sp || !gps || !nds) 413 if (!sp || !nds)
428 goto fail; 414 goto fail;
429 415
430 config_group_init_type_name(&sp->group, name, &space_type); 416 config_group_init_type_name(&sp->group, name, &space_type);
431 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
432 417
433 sp->group.default_groups = gps; 418 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
434 sp->group.default_groups[0] = &nds->ns_group; 419 configfs_add_default_group(&nds->ns_group, &sp->group);
435 sp->group.default_groups[1] = NULL;
436 420
437 INIT_LIST_HEAD(&sp->members); 421 INIT_LIST_HEAD(&sp->members);
438 mutex_init(&sp->members_lock); 422 mutex_init(&sp->members_lock);
@@ -441,7 +425,6 @@ static struct config_group *make_space(struct config_group *g, const char *name)
441 425
442 fail: 426 fail:
443 kfree(sp); 427 kfree(sp);
444 kfree(gps);
445 kfree(nds); 428 kfree(nds);
446 return ERR_PTR(-ENOMEM); 429 return ERR_PTR(-ENOMEM);
447} 430}
@@ -449,24 +432,16 @@ static struct config_group *make_space(struct config_group *g, const char *name)
449static void drop_space(struct config_group *g, struct config_item *i) 432static void drop_space(struct config_group *g, struct config_item *i)
450{ 433{
451 struct dlm_space *sp = config_item_to_space(i); 434 struct dlm_space *sp = config_item_to_space(i);
452 struct config_item *tmp;
453 int j;
454 435
455 /* assert list_empty(&sp->members) */ 436 /* assert list_empty(&sp->members) */
456 437
457 for (j = 0; sp->group.default_groups[j]; j++) { 438 configfs_remove_default_groups(&sp->group);
458 tmp = &sp->group.default_groups[j]->cg_item;
459 sp->group.default_groups[j] = NULL;
460 config_item_put(tmp);
461 }
462
463 config_item_put(i); 439 config_item_put(i);
464} 440}
465 441
466static void release_space(struct config_item *i) 442static void release_space(struct config_item *i)
467{ 443{
468 struct dlm_space *sp = config_item_to_space(i); 444 struct dlm_space *sp = config_item_to_space(i);
469 kfree(sp->group.default_groups);
470 kfree(sp); 445 kfree(sp);
471} 446}
472 447
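The conversion above replaces dlm's hand-rolled, NULL-terminated default_groups[] arrays with the configfs helpers. A generic sketch of the idiom, assuming only the two helpers visible in the hunks; the foo_* names are hypothetical and the config_item_type bodies are elided:

static struct config_item_type foo_type;	/* ct_ops etc. elided */
static struct config_item_type foo_child_type;

struct foo_cluster {
	struct config_group group;
	struct config_group child;
};

static struct config_group *foo_make_group(struct config_group *parent,
					   const char *name)
{
	struct foo_cluster *fc = kzalloc(sizeof(*fc), GFP_NOFS);

	if (!fc)
		return ERR_PTR(-ENOMEM);
	config_group_init_type_name(&fc->group, name, &foo_type);
	config_group_init_type_name(&fc->child, "child", &foo_child_type);
	/* replaces the hand-rolled default_groups[] array */
	configfs_add_default_group(&fc->child, &fc->group);
	return &fc->group;
}

static void foo_drop_group(struct config_group *parent,
			   struct config_item *item)
{
	/* drops the reference on every default group in one call */
	configfs_remove_default_groups(to_config_group(item));
	config_item_put(item);
}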
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3a37bd3f9637..00640e70ed7a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -124,7 +124,10 @@ struct connection {
124 struct connection *othercon; 124 struct connection *othercon;
125 struct work_struct rwork; /* Receive workqueue */ 125 struct work_struct rwork; /* Receive workqueue */
126 struct work_struct swork; /* Send workqueue */ 126 struct work_struct swork; /* Send workqueue */
127 void (*orig_error_report)(struct sock *sk); 127 void (*orig_error_report)(struct sock *);
128 void (*orig_data_ready)(struct sock *);
129 void (*orig_state_change)(struct sock *);
130 void (*orig_write_space)(struct sock *);
128}; 131};
129#define sock2con(x) ((struct connection *)(x)->sk_user_data) 132#define sock2con(x) ((struct connection *)(x)->sk_user_data)
130 133
@@ -467,16 +470,24 @@ int dlm_lowcomms_connect_node(int nodeid)
467 470
468static void lowcomms_error_report(struct sock *sk) 471static void lowcomms_error_report(struct sock *sk)
469{ 472{
470 struct connection *con = sock2con(sk); 473 struct connection *con;
471 struct sockaddr_storage saddr; 474 struct sockaddr_storage saddr;
475 int buflen;
476 void (*orig_report)(struct sock *) = NULL;
472 477
473 if (nodeid_to_addr(con->nodeid, &saddr, NULL, false)) { 478 read_lock_bh(&sk->sk_callback_lock);
479 con = sock2con(sk);
480 if (con == NULL)
481 goto out;
482
483 orig_report = con->orig_error_report;
484 if (con->sock == NULL ||
485 kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) {
474 printk_ratelimited(KERN_ERR "dlm: node %d: socket error " 486 printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
475 "sending to node %d, port %d, " 487 "sending to node %d, port %d, "
476 "sk_err=%d/%d\n", dlm_our_nodeid(), 488 "sk_err=%d/%d\n", dlm_our_nodeid(),
477 con->nodeid, dlm_config.ci_tcp_port, 489 con->nodeid, dlm_config.ci_tcp_port,
478 sk->sk_err, sk->sk_err_soft); 490 sk->sk_err, sk->sk_err_soft);
479 return;
480 } else if (saddr.ss_family == AF_INET) { 491 } else if (saddr.ss_family == AF_INET) {
481 struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr; 492 struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr;
482 493
@@ -499,22 +510,54 @@ static void lowcomms_error_report(struct sock *sk)
499 dlm_config.ci_tcp_port, sk->sk_err, 510 dlm_config.ci_tcp_port, sk->sk_err,
500 sk->sk_err_soft); 511 sk->sk_err_soft);
501 } 512 }
502 con->orig_error_report(sk); 513out:
514 read_unlock_bh(&sk->sk_callback_lock);
515 if (orig_report)
516 orig_report(sk);
517}
518
519/* Note: sk_callback_lock must be locked before calling this function. */
520static void save_callbacks(struct connection *con, struct sock *sk)
521{
522 lock_sock(sk);
523 con->orig_data_ready = sk->sk_data_ready;
524 con->orig_state_change = sk->sk_state_change;
525 con->orig_write_space = sk->sk_write_space;
526 con->orig_error_report = sk->sk_error_report;
527 release_sock(sk);
528}
529
530static void restore_callbacks(struct connection *con, struct sock *sk)
531{
532 write_lock_bh(&sk->sk_callback_lock);
533 lock_sock(sk);
534 sk->sk_user_data = NULL;
535 sk->sk_data_ready = con->orig_data_ready;
536 sk->sk_state_change = con->orig_state_change;
537 sk->sk_write_space = con->orig_write_space;
538 sk->sk_error_report = con->orig_error_report;
539 release_sock(sk);
540 write_unlock_bh(&sk->sk_callback_lock);
503} 541}
504 542
505/* Make a socket active */ 543/* Make a socket active */
506static void add_sock(struct socket *sock, struct connection *con) 544static void add_sock(struct socket *sock, struct connection *con)
507{ 545{
546 struct sock *sk = sock->sk;
547
548 write_lock_bh(&sk->sk_callback_lock);
508 con->sock = sock; 549 con->sock = sock;
509 550
551 sk->sk_user_data = con;
552 if (!test_bit(CF_IS_OTHERCON, &con->flags))
553 save_callbacks(con, sk);
510 /* Install a data_ready callback */ 554 /* Install a data_ready callback */
511 con->sock->sk->sk_data_ready = lowcomms_data_ready; 555 sk->sk_data_ready = lowcomms_data_ready;
512 con->sock->sk->sk_write_space = lowcomms_write_space; 556 sk->sk_write_space = lowcomms_write_space;
513 con->sock->sk->sk_state_change = lowcomms_state_change; 557 sk->sk_state_change = lowcomms_state_change;
514 con->sock->sk->sk_user_data = con; 558 sk->sk_allocation = GFP_NOFS;
515 con->sock->sk->sk_allocation = GFP_NOFS; 559 sk->sk_error_report = lowcomms_error_report;
516 con->orig_error_report = con->sock->sk->sk_error_report; 560 write_unlock_bh(&sk->sk_callback_lock);
517 con->sock->sk->sk_error_report = lowcomms_error_report;
518} 561}
519 562
520/* Add the port number to an IPv6 or 4 sockaddr and return the address 563/* Add the port number to an IPv6 or 4 sockaddr and return the address
@@ -549,6 +592,8 @@ static void close_connection(struct connection *con, bool and_other,
549 592
550 mutex_lock(&con->sock_mutex); 593 mutex_lock(&con->sock_mutex);
551 if (con->sock) { 594 if (con->sock) {
595 if (!test_bit(CF_IS_OTHERCON, &con->flags))
596 restore_callbacks(con, con->sock->sk);
552 sock_release(con->sock); 597 sock_release(con->sock);
553 con->sock = NULL; 598 con->sock = NULL;
554 } 599 }
@@ -1190,6 +1235,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
1190 if (result < 0) { 1235 if (result < 0) {
1191 log_print("Failed to set SO_REUSEADDR on socket: %d", result); 1236 log_print("Failed to set SO_REUSEADDR on socket: %d", result);
1192 } 1237 }
1238 sock->sk->sk_user_data = con;
1239
1193 con->rx_action = tcp_accept_from_sock; 1240 con->rx_action = tcp_accept_from_sock;
1194 con->connect_action = tcp_connect_to_sock; 1241 con->connect_action = tcp_connect_to_sock;
1195 1242
@@ -1271,6 +1318,7 @@ static int sctp_listen_for_all(void)
1271 if (result < 0) 1318 if (result < 0)
1272 log_print("Could not set SCTP NODELAY error %d\n", result); 1319 log_print("Could not set SCTP NODELAY error %d\n", result);
1273 1320
1321 write_lock_bh(&sock->sk->sk_callback_lock);
1274 /* Init con struct */ 1322 /* Init con struct */
1275 sock->sk->sk_user_data = con; 1323 sock->sk->sk_user_data = con;
1276 con->sock = sock; 1324 con->sock = sock;
@@ -1278,6 +1326,8 @@ static int sctp_listen_for_all(void)
1278 con->rx_action = sctp_accept_from_sock; 1326 con->rx_action = sctp_accept_from_sock;
1279 con->connect_action = sctp_connect_to_sock; 1327 con->connect_action = sctp_connect_to_sock;
1280 1328
1329 write_unlock_bh(&sock->sk->sk_callback_lock);
1330
1281 /* Bind to all addresses. */ 1331 /* Bind to all addresses. */
1282 if (sctp_bind_addrs(con, dlm_config.ci_tcp_port)) 1332 if (sctp_bind_addrs(con, dlm_config.ci_tcp_port))
1283 goto create_delsock; 1333 goto create_delsock;
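Distilled from the lowcomms changes: hook a socket's callbacks under sk_callback_lock, remember the originals, and restore them symmetrically at teardown so a late softirq never sees a half-torn-down socket. A hedged generic sketch, assuming <net/sock.h>; struct foo_conn and foo_data_ready() are hypothetical:

static void foo_data_ready(struct sock *sk);	/* our replacement handler */

struct foo_conn {
	void (*orig_data_ready)(struct sock *sk);
};

static void foo_hook(struct foo_conn *c, struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	c->orig_data_ready = sk->sk_data_ready;		/* save original */
	sk->sk_user_data = c;
	sk->sk_data_ready = foo_data_ready;		/* install ours */
	write_unlock_bh(&sk->sk_callback_lock);
}

static void foo_unhook(struct foo_conn *c, struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_user_data = NULL;
	sk->sk_data_ready = c->orig_data_ready;		/* restore */
	write_unlock_bh(&sk->sk_callback_lock);
}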
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 80d6901493cf..64026e53722a 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -23,6 +23,8 @@
23 * 02111-1307, USA. 23 * 02111-1307, USA.
24 */ 24 */
25 25
26#include <crypto/hash.h>
27#include <crypto/skcipher.h>
26#include <linux/fs.h> 28#include <linux/fs.h>
27#include <linux/mount.h> 29#include <linux/mount.h>
28#include <linux/pagemap.h> 30#include <linux/pagemap.h>
@@ -30,7 +32,6 @@
30#include <linux/compiler.h> 32#include <linux/compiler.h>
31#include <linux/key.h> 33#include <linux/key.h>
32#include <linux/namei.h> 34#include <linux/namei.h>
33#include <linux/crypto.h>
34#include <linux/file.h> 35#include <linux/file.h>
35#include <linux/scatterlist.h> 36#include <linux/scatterlist.h>
36#include <linux/slab.h> 37#include <linux/slab.h>
@@ -74,6 +75,19 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size)
74 } 75 }
75} 76}
76 77
78static int ecryptfs_hash_digest(struct crypto_shash *tfm,
79 char *src, int len, char *dst)
80{
81 SHASH_DESC_ON_STACK(desc, tfm);
82 int err;
83
84 desc->tfm = tfm;
85 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
86 err = crypto_shash_digest(desc, src, len, dst);
87 shash_desc_zero(desc);
88 return err;
89}
90
77/** 91/**
78 * ecryptfs_calculate_md5 - calculates the md5 of @src 92 * ecryptfs_calculate_md5 - calculates the md5 of @src
79 * @dst: Pointer to 16 bytes of allocated memory 93 * @dst: Pointer to 16 bytes of allocated memory
@@ -88,45 +102,26 @@ static int ecryptfs_calculate_md5(char *dst,
88 struct ecryptfs_crypt_stat *crypt_stat, 102 struct ecryptfs_crypt_stat *crypt_stat,
89 char *src, int len) 103 char *src, int len)
90{ 104{
91 struct scatterlist sg; 105 struct crypto_shash *tfm;
92 struct hash_desc desc = {
93 .tfm = crypt_stat->hash_tfm,
94 .flags = CRYPTO_TFM_REQ_MAY_SLEEP
95 };
96 int rc = 0; 106 int rc = 0;
97 107
98 mutex_lock(&crypt_stat->cs_hash_tfm_mutex); 108 mutex_lock(&crypt_stat->cs_hash_tfm_mutex);
99 sg_init_one(&sg, (u8 *)src, len); 109 tfm = crypt_stat->hash_tfm;
100 if (!desc.tfm) { 110 if (!tfm) {
101 desc.tfm = crypto_alloc_hash(ECRYPTFS_DEFAULT_HASH, 0, 111 tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0);
102 CRYPTO_ALG_ASYNC); 112 if (IS_ERR(tfm)) {
103 if (IS_ERR(desc.tfm)) { 113 rc = PTR_ERR(tfm);
104 rc = PTR_ERR(desc.tfm);
105 ecryptfs_printk(KERN_ERR, "Error attempting to " 114 ecryptfs_printk(KERN_ERR, "Error attempting to "
106 "allocate crypto context; rc = [%d]\n", 115 "allocate crypto context; rc = [%d]\n",
107 rc); 116 rc);
108 goto out; 117 goto out;
109 } 118 }
110 crypt_stat->hash_tfm = desc.tfm; 119 crypt_stat->hash_tfm = tfm;
111 }
112 rc = crypto_hash_init(&desc);
113 if (rc) {
114 printk(KERN_ERR
115 "%s: Error initializing crypto hash; rc = [%d]\n",
116 __func__, rc);
117 goto out;
118 } 120 }
119 rc = crypto_hash_update(&desc, &sg, len); 121 rc = ecryptfs_hash_digest(tfm, src, len, dst);
120 if (rc) { 122 if (rc) {
121 printk(KERN_ERR 123 printk(KERN_ERR
122 "%s: Error updating crypto hash; rc = [%d]\n", 124 "%s: Error computing crypto hash; rc = [%d]\n",
123 __func__, rc);
124 goto out;
125 }
126 rc = crypto_hash_final(&desc, dst);
127 if (rc) {
128 printk(KERN_ERR
129 "%s: Error finalizing crypto hash; rc = [%d]\n",
130 __func__, rc); 125 __func__, rc);
131 goto out; 126 goto out;
132 } 127 }
@@ -234,10 +229,8 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
234{ 229{
235 struct ecryptfs_key_sig *key_sig, *key_sig_tmp; 230 struct ecryptfs_key_sig *key_sig, *key_sig_tmp;
236 231
237 if (crypt_stat->tfm) 232 crypto_free_skcipher(crypt_stat->tfm);
238 crypto_free_ablkcipher(crypt_stat->tfm); 233 crypto_free_shash(crypt_stat->hash_tfm);
239 if (crypt_stat->hash_tfm)
240 crypto_free_hash(crypt_stat->hash_tfm);
241 list_for_each_entry_safe(key_sig, key_sig_tmp, 234 list_for_each_entry_safe(key_sig, key_sig_tmp,
242 &crypt_stat->keysig_list, crypt_stat_list) { 235 &crypt_stat->keysig_list, crypt_stat_list) {
243 list_del(&key_sig->crypt_stat_list); 236 list_del(&key_sig->crypt_stat_list);
@@ -342,7 +335,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
342 struct scatterlist *src_sg, int size, 335 struct scatterlist *src_sg, int size,
343 unsigned char *iv, int op) 336 unsigned char *iv, int op)
344{ 337{
345 struct ablkcipher_request *req = NULL; 338 struct skcipher_request *req = NULL;
346 struct extent_crypt_result ecr; 339 struct extent_crypt_result ecr;
347 int rc = 0; 340 int rc = 0;
348 341
@@ -358,20 +351,20 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
358 init_completion(&ecr.completion); 351 init_completion(&ecr.completion);
359 352
360 mutex_lock(&crypt_stat->cs_tfm_mutex); 353 mutex_lock(&crypt_stat->cs_tfm_mutex);
361 req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS); 354 req = skcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
362 if (!req) { 355 if (!req) {
363 mutex_unlock(&crypt_stat->cs_tfm_mutex); 356 mutex_unlock(&crypt_stat->cs_tfm_mutex);
364 rc = -ENOMEM; 357 rc = -ENOMEM;
365 goto out; 358 goto out;
366 } 359 }
367 360
368 ablkcipher_request_set_callback(req, 361 skcipher_request_set_callback(req,
369 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 362 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
370 extent_crypt_complete, &ecr); 363 extent_crypt_complete, &ecr);
371 /* Consider doing this once, when the file is opened */ 364 /* Consider doing this once, when the file is opened */
372 if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { 365 if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
373 rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key, 366 rc = crypto_skcipher_setkey(crypt_stat->tfm, crypt_stat->key,
374 crypt_stat->key_size); 367 crypt_stat->key_size);
375 if (rc) { 368 if (rc) {
376 ecryptfs_printk(KERN_ERR, 369 ecryptfs_printk(KERN_ERR,
377 "Error setting key; rc = [%d]\n", 370 "Error setting key; rc = [%d]\n",
@@ -383,9 +376,9 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
383 crypt_stat->flags |= ECRYPTFS_KEY_SET; 376 crypt_stat->flags |= ECRYPTFS_KEY_SET;
384 } 377 }
385 mutex_unlock(&crypt_stat->cs_tfm_mutex); 378 mutex_unlock(&crypt_stat->cs_tfm_mutex);
386 ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv); 379 skcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
387 rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) : 380 rc = op == ENCRYPT ? crypto_skcipher_encrypt(req) :
388 crypto_ablkcipher_decrypt(req); 381 crypto_skcipher_decrypt(req);
389 if (rc == -EINPROGRESS || rc == -EBUSY) { 382 if (rc == -EINPROGRESS || rc == -EBUSY) {
390 struct extent_crypt_result *ecr = req->base.data; 383 struct extent_crypt_result *ecr = req->base.data;
391 384
@@ -394,7 +387,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
394 reinit_completion(&ecr->completion); 387 reinit_completion(&ecr->completion);
395 } 388 }
396out: 389out:
397 ablkcipher_request_free(req); 390 skcipher_request_free(req);
398 return rc; 391 return rc;
399} 392}
400 393
@@ -622,7 +615,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
622 crypt_stat->cipher, "cbc"); 615 crypt_stat->cipher, "cbc");
623 if (rc) 616 if (rc)
624 goto out_unlock; 617 goto out_unlock;
625 crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0); 618 crypt_stat->tfm = crypto_alloc_skcipher(full_alg_name, 0, 0);
626 if (IS_ERR(crypt_stat->tfm)) { 619 if (IS_ERR(crypt_stat->tfm)) {
627 rc = PTR_ERR(crypt_stat->tfm); 620 rc = PTR_ERR(crypt_stat->tfm);
628 crypt_stat->tfm = NULL; 621 crypt_stat->tfm = NULL;
@@ -631,7 +624,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
631 full_alg_name); 624 full_alg_name);
632 goto out_free; 625 goto out_free;
633 } 626 }
634 crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); 627 crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
635 rc = 0; 628 rc = 0;
636out_free: 629out_free:
637 kfree(full_alg_name); 630 kfree(full_alg_name);
@@ -1499,16 +1492,14 @@ out:
1499 */ 1492 */
1500static int 1493static int
1501ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, 1494ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
1502 struct ecryptfs_crypt_stat *crypt_stat,
1503 struct ecryptfs_mount_crypt_stat *mount_crypt_stat) 1495 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
1504{ 1496{
1505 int rc = 0; 1497 int rc = 0;
1506 1498
1507 filename->encrypted_filename = NULL; 1499 filename->encrypted_filename = NULL;
1508 filename->encrypted_filename_size = 0; 1500 filename->encrypted_filename_size = 0;
1509 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) 1501 if (mount_crypt_stat && (mount_crypt_stat->flags
1510 || (mount_crypt_stat && (mount_crypt_stat->flags 1502 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) {
1511 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
1512 size_t packet_size; 1503 size_t packet_size;
1513 size_t remaining_bytes; 1504 size_t remaining_bytes;
1514 1505
@@ -1591,7 +1582,7 @@ out:
1591 * event, regardless of whether this function succeeds for fails. 1582 * event, regardless of whether this function succeeds for fails.
1592 */ 1583 */
1593static int 1584static int
1594ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, 1585ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm,
1595 char *cipher_name, size_t *key_size) 1586 char *cipher_name, size_t *key_size)
1596{ 1587{
1597 char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; 1588 char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
@@ -1609,21 +1600,18 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1609 "ecb"); 1600 "ecb");
1610 if (rc) 1601 if (rc)
1611 goto out; 1602 goto out;
1612 *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); 1603 *key_tfm = crypto_alloc_skcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
1613 if (IS_ERR(*key_tfm)) { 1604 if (IS_ERR(*key_tfm)) {
1614 rc = PTR_ERR(*key_tfm); 1605 rc = PTR_ERR(*key_tfm);
1615 printk(KERN_ERR "Unable to allocate crypto cipher with name " 1606 printk(KERN_ERR "Unable to allocate crypto cipher with name "
1616 "[%s]; rc = [%d]\n", full_alg_name, rc); 1607 "[%s]; rc = [%d]\n", full_alg_name, rc);
1617 goto out; 1608 goto out;
1618 } 1609 }
1619 crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); 1610 crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
1620 if (*key_size == 0) { 1611 if (*key_size == 0)
1621 struct blkcipher_alg *alg = crypto_blkcipher_alg(*key_tfm); 1612 *key_size = crypto_skcipher_default_keysize(*key_tfm);
1622
1623 *key_size = alg->max_keysize;
1624 }
1625 get_random_bytes(dummy_key, *key_size); 1613 get_random_bytes(dummy_key, *key_size);
1626 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); 1614 rc = crypto_skcipher_setkey(*key_tfm, dummy_key, *key_size);
1627 if (rc) { 1615 if (rc) {
1628 printk(KERN_ERR "Error attempting to set key of size [%zd] for " 1616 printk(KERN_ERR "Error attempting to set key of size [%zd] for "
1629 "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name, 1617 "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name,
@@ -1660,8 +1648,7 @@ int ecryptfs_destroy_crypto(void)
1660 list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list, 1648 list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list,
1661 key_tfm_list) { 1649 key_tfm_list) {
1662 list_del(&key_tfm->key_tfm_list); 1650 list_del(&key_tfm->key_tfm_list);
1663 if (key_tfm->key_tfm) 1651 crypto_free_skcipher(key_tfm->key_tfm);
1664 crypto_free_blkcipher(key_tfm->key_tfm);
1665 kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm); 1652 kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm);
1666 } 1653 }
1667 mutex_unlock(&key_tfm_list_mutex); 1654 mutex_unlock(&key_tfm_list_mutex);
@@ -1747,7 +1734,7 @@ int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm)
1747 * Searches for cached item first, and creates new if not found. 1734 * Searches for cached item first, and creates new if not found.
1748 * Returns 0 on success, non-zero if adding new cipher failed 1735 * Returns 0 on success, non-zero if adding new cipher failed
1749 */ 1736 */
1750int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, 1737int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm,
1751 struct mutex **tfm_mutex, 1738 struct mutex **tfm_mutex,
1752 char *cipher_name) 1739 char *cipher_name)
1753{ 1740{
@@ -1944,7 +1931,6 @@ out:
1944int ecryptfs_encrypt_and_encode_filename( 1931int ecryptfs_encrypt_and_encode_filename(
1945 char **encoded_name, 1932 char **encoded_name,
1946 size_t *encoded_name_size, 1933 size_t *encoded_name_size,
1947 struct ecryptfs_crypt_stat *crypt_stat,
1948 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 1934 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
1949 const char *name, size_t name_size) 1935 const char *name, size_t name_size)
1950{ 1936{
@@ -1953,9 +1939,8 @@ int ecryptfs_encrypt_and_encode_filename(
1953 1939
1954 (*encoded_name) = NULL; 1940 (*encoded_name) = NULL;
1955 (*encoded_name_size) = 0; 1941 (*encoded_name_size) = 0;
1956 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) 1942 if (mount_crypt_stat && (mount_crypt_stat->flags
1957 || (mount_crypt_stat && (mount_crypt_stat->flags 1943 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
1958 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
1959 struct ecryptfs_filename *filename; 1944 struct ecryptfs_filename *filename;
1960 1945
1961 filename = kzalloc(sizeof(*filename), GFP_KERNEL); 1946 filename = kzalloc(sizeof(*filename), GFP_KERNEL);
@@ -1968,8 +1953,7 @@ int ecryptfs_encrypt_and_encode_filename(
1968 } 1953 }
1969 filename->filename = (char *)name; 1954 filename->filename = (char *)name;
1970 filename->filename_size = name_size; 1955 filename->filename_size = name_size;
1971 rc = ecryptfs_encrypt_filename(filename, crypt_stat, 1956 rc = ecryptfs_encrypt_filename(filename, mount_crypt_stat);
1972 mount_crypt_stat);
1973 if (rc) { 1957 if (rc) {
1974 printk(KERN_ERR "%s: Error attempting to encrypt " 1958 printk(KERN_ERR "%s: Error attempting to encrypt "
1975 "filename; rc = [%d]\n", __func__, rc); 1959 "filename; rc = [%d]\n", __func__, rc);
@@ -1980,11 +1964,9 @@ int ecryptfs_encrypt_and_encode_filename(
1980 NULL, &encoded_name_no_prefix_size, 1964 NULL, &encoded_name_no_prefix_size,
1981 filename->encrypted_filename, 1965 filename->encrypted_filename,
1982 filename->encrypted_filename_size); 1966 filename->encrypted_filename_size);
1983 if ((crypt_stat && (crypt_stat->flags 1967 if (mount_crypt_stat
1984 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
1985 || (mount_crypt_stat
1986 && (mount_crypt_stat->flags 1968 && (mount_crypt_stat->flags
1987 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) 1969 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))
1988 (*encoded_name_size) = 1970 (*encoded_name_size) =
1989 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 1971 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
1990 + encoded_name_no_prefix_size); 1972 + encoded_name_no_prefix_size);
@@ -2002,11 +1984,9 @@ int ecryptfs_encrypt_and_encode_filename(
2002 kfree(filename); 1984 kfree(filename);
2003 goto out; 1985 goto out;
2004 } 1986 }
2005 if ((crypt_stat && (crypt_stat->flags 1987 if (mount_crypt_stat
2006 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
2007 || (mount_crypt_stat
2008 && (mount_crypt_stat->flags 1988 && (mount_crypt_stat->flags
2009 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { 1989 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) {
2010 memcpy((*encoded_name), 1990 memcpy((*encoded_name),
2011 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, 1991 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2012 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); 1992 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
@@ -2120,7 +2100,7 @@ out:
2120int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, 2100int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
2121 struct ecryptfs_mount_crypt_stat *mount_crypt_stat) 2101 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
2122{ 2102{
2123 struct blkcipher_desc desc; 2103 struct crypto_skcipher *tfm;
2124 struct mutex *tfm_mutex; 2104 struct mutex *tfm_mutex;
2125 size_t cipher_blocksize; 2105 size_t cipher_blocksize;
2126 int rc; 2106 int rc;
@@ -2130,7 +2110,7 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
2130 return 0; 2110 return 0;
2131 } 2111 }
2132 2112
2133 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, 2113 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
2134 mount_crypt_stat->global_default_fn_cipher_name); 2114 mount_crypt_stat->global_default_fn_cipher_name);
2135 if (unlikely(rc)) { 2115 if (unlikely(rc)) {
2136 (*namelen) = 0; 2116 (*namelen) = 0;
@@ -2138,7 +2118,7 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
2138 } 2118 }
2139 2119
2140 mutex_lock(tfm_mutex); 2120 mutex_lock(tfm_mutex);
2141 cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm); 2121 cipher_blocksize = crypto_skcipher_blocksize(tfm);
2142 mutex_unlock(tfm_mutex); 2122 mutex_unlock(tfm_mutex);
2143 2123
2144 /* Return an exact amount for the common cases */ 2124 /* Return an exact amount for the common cases */
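The whole ecryptfs conversion above follows one pattern: blkcipher/ablkcipher users become skcipher users, and hash_desc users become shash users. A self-contained sketch of the synchronous skcipher form, assuming <crypto/skcipher.h> and <linux/scatterlist.h>; passing CRYPTO_ALG_ASYNC as the mask requests a tfm whose encrypt call completes inline, which is why no completion plumbing is needed here (foo_encrypt_buf() is hypothetical):

static int foo_encrypt_buf(u8 *buf, unsigned int len,
			   const u8 *key, unsigned int keylen, u8 iv[16])
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	int rc;

	/* mask CRYPTO_ALG_ASYNC => only synchronous implementations */
	tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);
	rc = crypto_skcipher_setkey(tfm, key, keylen);
	if (rc)
		goto out_free_tfm;
	req = skcipher_request_alloc(tfm, GFP_NOFS);
	if (!req) {
		rc = -ENOMEM;
		goto out_free_tfm;
	}
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      NULL, NULL);
	sg_init_one(&sg, buf, len);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);
	rc = crypto_skcipher_encrypt(req);	/* returns inline for !ASYNC tfm */
	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return rc;
}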
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 7b39260c7bba..d123fbaa28e0 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -28,6 +28,7 @@
28#ifndef ECRYPTFS_KERNEL_H 28#ifndef ECRYPTFS_KERNEL_H
29#define ECRYPTFS_KERNEL_H 29#define ECRYPTFS_KERNEL_H
30 30
31#include <crypto/skcipher.h>
31#include <keys/user-type.h> 32#include <keys/user-type.h>
32#include <keys/encrypted-type.h> 33#include <keys/encrypted-type.h>
33#include <linux/fs.h> 34#include <linux/fs.h>
@@ -38,7 +39,6 @@
38#include <linux/nsproxy.h> 39#include <linux/nsproxy.h>
39#include <linux/backing-dev.h> 40#include <linux/backing-dev.h>
40#include <linux/ecryptfs.h> 41#include <linux/ecryptfs.h>
41#include <linux/crypto.h>
42 42
43#define ECRYPTFS_DEFAULT_IV_BYTES 16 43#define ECRYPTFS_DEFAULT_IV_BYTES 16
44#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 44#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
@@ -233,9 +233,9 @@ struct ecryptfs_crypt_stat {
233 size_t extent_shift; 233 size_t extent_shift;
234 unsigned int extent_mask; 234 unsigned int extent_mask;
235 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 235 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
236 struct crypto_ablkcipher *tfm; 236 struct crypto_skcipher *tfm;
237 struct crypto_hash *hash_tfm; /* Crypto context for generating 237 struct crypto_shash *hash_tfm; /* Crypto context for generating
238 * the initialization vectors */ 238 * the initialization vectors */
239 unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; 239 unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
240 unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; 240 unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
241 unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; 241 unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
@@ -309,7 +309,7 @@ struct ecryptfs_global_auth_tok {
309 * keeps a list of crypto API contexts around to use when needed. 309 * keeps a list of crypto API contexts around to use when needed.
310 */ 310 */
311struct ecryptfs_key_tfm { 311struct ecryptfs_key_tfm {
312 struct crypto_blkcipher *key_tfm; 312 struct crypto_skcipher *key_tfm;
313 size_t key_size; 313 size_t key_size;
314 struct mutex key_tfm_mutex; 314 struct mutex key_tfm_mutex;
315 struct list_head key_tfm_list; 315 struct list_head key_tfm_list;
@@ -569,7 +569,6 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
569int ecryptfs_encrypt_and_encode_filename( 569int ecryptfs_encrypt_and_encode_filename(
570 char **encoded_name, 570 char **encoded_name,
571 size_t *encoded_name_size, 571 size_t *encoded_name_size,
572 struct ecryptfs_crypt_stat *crypt_stat,
573 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 572 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
574 const char *name, size_t name_size); 573 const char *name, size_t name_size);
575struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); 574struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
@@ -659,7 +658,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
659int ecryptfs_init_crypto(void); 658int ecryptfs_init_crypto(void);
660int ecryptfs_destroy_crypto(void); 659int ecryptfs_destroy_crypto(void);
661int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm); 660int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm);
662int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, 661int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm,
663 struct mutex **tfm_mutex, 662 struct mutex **tfm_mutex,
664 char *cipher_name); 663 char *cipher_name);
665int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, 664int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4e685ac1024d..121114e9a464 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -29,7 +29,6 @@
29#include <linux/dcache.h> 29#include <linux/dcache.h>
30#include <linux/namei.h> 30#include <linux/namei.h>
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 32#include <linux/fs_stack.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/xattr.h> 34#include <linux/xattr.h>
@@ -397,11 +396,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
397 int rc = 0; 396 int rc = 0;
398 397
399 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 398 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
400 inode_lock(d_inode(lower_dir_dentry)); 399 lower_dentry = lookup_one_len_unlocked(ecryptfs_dentry->d_name.name,
401 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
402 lower_dir_dentry, 400 lower_dir_dentry,
403 ecryptfs_dentry->d_name.len); 401 ecryptfs_dentry->d_name.len);
404 inode_unlock(d_inode(lower_dir_dentry));
405 if (IS_ERR(lower_dentry)) { 402 if (IS_ERR(lower_dentry)) {
406 rc = PTR_ERR(lower_dentry); 403 rc = PTR_ERR(lower_dentry);
407 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 404 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -419,18 +416,16 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
419 dput(lower_dentry); 416 dput(lower_dentry);
420 rc = ecryptfs_encrypt_and_encode_filename( 417 rc = ecryptfs_encrypt_and_encode_filename(
421 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, 418 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
422 NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, 419 mount_crypt_stat, ecryptfs_dentry->d_name.name,
423 ecryptfs_dentry->d_name.len); 420 ecryptfs_dentry->d_name.len);
424 if (rc) { 421 if (rc) {
425 printk(KERN_ERR "%s: Error attempting to encrypt and encode " 422 printk(KERN_ERR "%s: Error attempting to encrypt and encode "
426 "filename; rc = [%d]\n", __func__, rc); 423 "filename; rc = [%d]\n", __func__, rc);
427 goto out; 424 goto out;
428 } 425 }
429 inode_lock(d_inode(lower_dir_dentry)); 426 lower_dentry = lookup_one_len_unlocked(encrypted_and_encoded_name,
430 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
431 lower_dir_dentry, 427 lower_dir_dentry,
432 encrypted_and_encoded_name_size); 428 encrypted_and_encoded_name_size);
433 inode_unlock(d_inode(lower_dir_dentry));
434 if (IS_ERR(lower_dentry)) { 429 if (IS_ERR(lower_dentry)) {
435 rc = PTR_ERR(lower_dentry); 430 rc = PTR_ERR(lower_dentry);
436 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 431 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -502,7 +497,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
502 dir->i_sb)->mount_crypt_stat; 497 dir->i_sb)->mount_crypt_stat;
503 rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, 498 rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
504 &encoded_symlen, 499 &encoded_symlen,
505 NULL,
506 mount_crypt_stat, symname, 500 mount_crypt_stat, symname,
507 strlen(symname)); 501 strlen(symname));
508 if (rc) 502 if (rc)
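Both lookups above now use lookup_one_len_unlocked(), which handles the parent-directory locking internally instead of requiring the caller to take and drop the inode lock around lookup_one_len(). A small sketch of the replacement idiom (foo_find_lower() is hypothetical):

static struct dentry *foo_find_lower(struct dentry *lower_dir,
				     const struct qstr *name)
{
	struct dentry *d;

	/* replaces: inode_lock(d_inode(lower_dir));
	 *           d = lookup_one_len(name->name, lower_dir, name->len);
	 *           inode_unlock(d_inode(lower_dir)); */
	d = lookup_one_len_unlocked(name->name, lower_dir, name->len);
	if (IS_ERR(d))
		pr_debug("lower lookup failed: %ld\n", PTR_ERR(d));
	return d;
}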
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 6bd67e2011f0..9893d1538122 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -25,11 +25,12 @@
25 * 02111-1307, USA. 25 * 02111-1307, USA.
26 */ 26 */
27 27
28#include <crypto/hash.h>
29#include <crypto/skcipher.h>
28#include <linux/string.h> 30#include <linux/string.h>
29#include <linux/pagemap.h> 31#include <linux/pagemap.h>
30#include <linux/key.h> 32#include <linux/key.h>
31#include <linux/random.h> 33#include <linux/random.h>
32#include <linux/crypto.h>
33#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
34#include <linux/slab.h> 35#include <linux/slab.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
@@ -601,12 +602,13 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
601 struct ecryptfs_auth_tok *auth_tok; 602 struct ecryptfs_auth_tok *auth_tok;
602 struct scatterlist src_sg[2]; 603 struct scatterlist src_sg[2];
603 struct scatterlist dst_sg[2]; 604 struct scatterlist dst_sg[2];
604 struct blkcipher_desc desc; 605 struct crypto_skcipher *skcipher_tfm;
606 struct skcipher_request *skcipher_req;
605 char iv[ECRYPTFS_MAX_IV_BYTES]; 607 char iv[ECRYPTFS_MAX_IV_BYTES];
606 char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; 608 char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
607 char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; 609 char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
608 struct hash_desc hash_desc; 610 struct crypto_shash *hash_tfm;
609 struct scatterlist hash_sg; 611 struct shash_desc *hash_desc;
610}; 612};
611 613
612/** 614/**
@@ -629,14 +631,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
629 struct key *auth_tok_key = NULL; 631 struct key *auth_tok_key = NULL;
630 int rc = 0; 632 int rc = 0;
631 633
632 s = kmalloc(sizeof(*s), GFP_KERNEL); 634 s = kzalloc(sizeof(*s), GFP_KERNEL);
633 if (!s) { 635 if (!s) {
634 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 636 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
635 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 637 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
636 rc = -ENOMEM; 638 return -ENOMEM;
637 goto out;
638 } 639 }
639 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
640 (*packet_size) = 0; 640 (*packet_size) = 0;
641 rc = ecryptfs_find_auth_tok_for_sig( 641 rc = ecryptfs_find_auth_tok_for_sig(
642 &auth_tok_key, 642 &auth_tok_key,
@@ -649,7 +649,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
649 goto out; 649 goto out;
650 } 650 }
651 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( 651 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
652 &s->desc.tfm, 652 &s->skcipher_tfm,
653 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); 653 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
654 if (unlikely(rc)) { 654 if (unlikely(rc)) {
655 printk(KERN_ERR "Internal error whilst attempting to get " 655 printk(KERN_ERR "Internal error whilst attempting to get "
@@ -658,7 +658,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
658 goto out; 658 goto out;
659 } 659 }
660 mutex_lock(s->tfm_mutex); 660 mutex_lock(s->tfm_mutex);
661 s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); 661 s->block_size = crypto_skcipher_blocksize(s->skcipher_tfm);
662 /* Plus one for the \0 separator between the random prefix 662 /* Plus one for the \0 separator between the random prefix
663 * and the plaintext filename */ 663 * and the plaintext filename */
664 s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); 664 s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
@@ -691,6 +691,19 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
691 rc = -EINVAL; 691 rc = -EINVAL;
692 goto out_unlock; 692 goto out_unlock;
693 } 693 }
694
695 s->skcipher_req = skcipher_request_alloc(s->skcipher_tfm, GFP_KERNEL);
696 if (!s->skcipher_req) {
697 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
698 "skcipher_request_alloc for %s\n", __func__,
699 crypto_skcipher_driver_name(s->skcipher_tfm));
700 rc = -ENOMEM;
701 goto out_unlock;
702 }
703
704 skcipher_request_set_callback(s->skcipher_req,
705 CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
706
694 s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, 707 s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
695 GFP_KERNEL); 708 GFP_KERNEL);
696 if (!s->block_aligned_filename) { 709 if (!s->block_aligned_filename) {
@@ -700,7 +713,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
700 rc = -ENOMEM; 713 rc = -ENOMEM;
701 goto out_unlock; 714 goto out_unlock;
702 } 715 }
703 s->i = 0;
704 dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; 716 dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
705 rc = ecryptfs_write_packet_length(&dest[s->i], 717 rc = ecryptfs_write_packet_length(&dest[s->i],
706 (ECRYPTFS_SIG_SIZE 718 (ECRYPTFS_SIG_SIZE
@@ -738,40 +750,36 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
738 "password tokens\n", __func__); 750 "password tokens\n", __func__);
739 goto out_free_unlock; 751 goto out_free_unlock;
740 } 752 }
741 sg_init_one( 753 s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0);
742 &s->hash_sg, 754 if (IS_ERR(s->hash_tfm)) {
743 (u8 *)s->auth_tok->token.password.session_key_encryption_key, 755 rc = PTR_ERR(s->hash_tfm);
744 s->auth_tok->token.password.session_key_encryption_key_bytes);
745 s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
746 s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
747 CRYPTO_ALG_ASYNC);
748 if (IS_ERR(s->hash_desc.tfm)) {
749 rc = PTR_ERR(s->hash_desc.tfm);
750 printk(KERN_ERR "%s: Error attempting to " 756 printk(KERN_ERR "%s: Error attempting to "
751 "allocate hash crypto context; rc = [%d]\n", 757 "allocate hash crypto context; rc = [%d]\n",
752 __func__, rc); 758 __func__, rc);
753 goto out_free_unlock; 759 goto out_free_unlock;
754 } 760 }
755 rc = crypto_hash_init(&s->hash_desc); 761
756 if (rc) { 762 s->hash_desc = kmalloc(sizeof(*s->hash_desc) +
757 printk(KERN_ERR 763 crypto_shash_descsize(s->hash_tfm), GFP_KERNEL);
758 "%s: Error initializing crypto hash; rc = [%d]\n", 764 if (!s->hash_desc) {
759 __func__, rc); 765 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
760 goto out_release_free_unlock; 766 "kmalloc [%zd] bytes\n", __func__,
761 } 767 sizeof(*s->hash_desc) +
762 rc = crypto_hash_update( 768 crypto_shash_descsize(s->hash_tfm));
763 &s->hash_desc, &s->hash_sg, 769 rc = -ENOMEM;
764 s->auth_tok->token.password.session_key_encryption_key_bytes);
765 if (rc) {
766 printk(KERN_ERR
767 "%s: Error updating crypto hash; rc = [%d]\n",
768 __func__, rc);
769 goto out_release_free_unlock; 770 goto out_release_free_unlock;
770 } 771 }
771 rc = crypto_hash_final(&s->hash_desc, s->hash); 772
773 s->hash_desc->tfm = s->hash_tfm;
774 s->hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
775
776 rc = crypto_shash_digest(s->hash_desc,
777 (u8 *)s->auth_tok->token.password.session_key_encryption_key,
778 s->auth_tok->token.password.session_key_encryption_key_bytes,
779 s->hash);
772 if (rc) { 780 if (rc) {
773 printk(KERN_ERR 781 printk(KERN_ERR
774 "%s: Error finalizing crypto hash; rc = [%d]\n", 782 "%s: Error computing crypto hash; rc = [%d]\n",
775 __func__, rc); 783 __func__, rc);
776 goto out_release_free_unlock; 784 goto out_release_free_unlock;
777 } 785 }
@@ -780,27 +788,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
780 s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; 788 s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
781 if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) 789 if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
782 == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { 790 == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
783 sg_init_one(&s->hash_sg, (u8 *)s->hash, 791 rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash,
784 ECRYPTFS_TAG_70_DIGEST_SIZE); 792 ECRYPTFS_TAG_70_DIGEST_SIZE,
785 rc = crypto_hash_init(&s->hash_desc); 793 s->tmp_hash);
786 if (rc) {
787 printk(KERN_ERR
788 "%s: Error initializing crypto hash; "
789 "rc = [%d]\n", __func__, rc);
790 goto out_release_free_unlock;
791 }
792 rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
793 ECRYPTFS_TAG_70_DIGEST_SIZE);
794 if (rc) { 794 if (rc) {
795 printk(KERN_ERR 795 printk(KERN_ERR
796 "%s: Error updating crypto hash; " 796 "%s: Error computing crypto hash; "
797 "rc = [%d]\n", __func__, rc);
798 goto out_release_free_unlock;
799 }
800 rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
801 if (rc) {
802 printk(KERN_ERR
803 "%s: Error finalizing crypto hash; "
804 "rc = [%d]\n", __func__, rc); 797 "rc = [%d]\n", __func__, rc);
805 goto out_release_free_unlock; 798 goto out_release_free_unlock;
806 } 799 }
@@ -834,10 +827,8 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
834 * of the IV here, so we just use 0's for the IV. Note the 827 * of the IV here, so we just use 0's for the IV. Note the
835 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 828 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
836 * >= ECRYPTFS_MAX_IV_BYTES. */ 829 * >= ECRYPTFS_MAX_IV_BYTES. */
837 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); 830 rc = crypto_skcipher_setkey(
838 s->desc.info = s->iv; 831 s->skcipher_tfm,
839 rc = crypto_blkcipher_setkey(
840 s->desc.tfm,
841 s->auth_tok->token.password.session_key_encryption_key, 832 s->auth_tok->token.password.session_key_encryption_key,
842 mount_crypt_stat->global_default_fn_cipher_key_bytes); 833 mount_crypt_stat->global_default_fn_cipher_key_bytes);
843 if (rc < 0) { 834 if (rc < 0) {
@@ -850,8 +841,9 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
850 mount_crypt_stat->global_default_fn_cipher_key_bytes); 841 mount_crypt_stat->global_default_fn_cipher_key_bytes);
851 goto out_release_free_unlock; 842 goto out_release_free_unlock;
852 } 843 }
853 rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg, 844 skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
854 s->block_aligned_filename_size); 845 s->block_aligned_filename_size, s->iv);
846 rc = crypto_skcipher_encrypt(s->skcipher_req);
855 if (rc) { 847 if (rc) {
856 printk(KERN_ERR "%s: Error attempting to encrypt filename; " 848 printk(KERN_ERR "%s: Error attempting to encrypt filename; "
857 "rc = [%d]\n", __func__, rc); 849 "rc = [%d]\n", __func__, rc);
@@ -861,7 +853,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
861 (*packet_size) = s->i; 853 (*packet_size) = s->i;
862 (*remaining_bytes) -= (*packet_size); 854 (*remaining_bytes) -= (*packet_size);
863out_release_free_unlock: 855out_release_free_unlock:
864 crypto_free_hash(s->hash_desc.tfm); 856 crypto_free_shash(s->hash_tfm);
865out_free_unlock: 857out_free_unlock:
866 kzfree(s->block_aligned_filename); 858 kzfree(s->block_aligned_filename);
867out_unlock: 859out_unlock:
@@ -871,6 +863,8 @@ out:
871 up_write(&(auth_tok_key->sem)); 863 up_write(&(auth_tok_key->sem));
872 key_put(auth_tok_key); 864 key_put(auth_tok_key);
873 } 865 }
866 skcipher_request_free(s->skcipher_req);
867 kzfree(s->hash_desc);
874 kfree(s); 868 kfree(s);
875 return rc; 869 return rc;
876} 870}
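The hunks above collapse eCryptfs's old crypto_hash_init()/crypto_hash_update()/crypto_hash_final() sequence into a single crypto_shash_digest() call on a heap-allocated descriptor. A minimal sketch of that one-shot shash pattern, assuming a kernel context; the function name is hypothetical and "md5" merely stands in for ECRYPTFS_TAG_70_DIGEST:

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int sketch_one_shot_digest(const u8 *in, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int rc;

	tfm = crypto_alloc_shash("md5", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* The descriptor carries per-tfm state; size it accordingly. */
	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm),
		       GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;
	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;

	rc = crypto_shash_digest(desc, in, len, out);	/* init+update+final */

	kzfree(desc);		/* descriptor may hold key-derived state */
	crypto_free_shash(tfm);
	return rc;
}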
@@ -888,7 +882,8 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
888 struct ecryptfs_auth_tok *auth_tok; 882 struct ecryptfs_auth_tok *auth_tok;
889 struct scatterlist src_sg[2]; 883 struct scatterlist src_sg[2];
890 struct scatterlist dst_sg[2]; 884 struct scatterlist dst_sg[2];
891 struct blkcipher_desc desc; 885 struct crypto_skcipher *skcipher_tfm;
886 struct skcipher_request *skcipher_req;
892 char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; 887 char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
893 char iv[ECRYPTFS_MAX_IV_BYTES]; 888 char iv[ECRYPTFS_MAX_IV_BYTES];
894 char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; 889 char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
@@ -922,14 +917,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
922 (*packet_size) = 0; 917 (*packet_size) = 0;
923 (*filename_size) = 0; 918 (*filename_size) = 0;
924 (*filename) = NULL; 919 (*filename) = NULL;
925 s = kmalloc(sizeof(*s), GFP_KERNEL); 920 s = kzalloc(sizeof(*s), GFP_KERNEL);
926 if (!s) { 921 if (!s) {
927 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 922 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
928 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 923 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
929 rc = -ENOMEM; 924 return -ENOMEM;
930 goto out;
931 } 925 }
932 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
933 if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) { 926 if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
934 printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " 927 printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
935 "at least [%d]\n", __func__, max_packet_size, 928 "at least [%d]\n", __func__, max_packet_size,
@@ -992,7 +985,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
992 rc); 985 rc);
993 goto out; 986 goto out;
994 } 987 }
995 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, 988 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->skcipher_tfm,
996 &s->tfm_mutex, 989 &s->tfm_mutex,
997 s->cipher_string); 990 s->cipher_string);
998 if (unlikely(rc)) { 991 if (unlikely(rc)) {
@@ -1030,12 +1023,23 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
1030 __func__, rc, s->block_aligned_filename_size); 1023 __func__, rc, s->block_aligned_filename_size);
1031 goto out_free_unlock; 1024 goto out_free_unlock;
1032 } 1025 }
1026
1027 s->skcipher_req = skcipher_request_alloc(s->skcipher_tfm, GFP_KERNEL);
1028 if (!s->skcipher_req) {
1029 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
1030 "skcipher_request_alloc for %s\n", __func__,
1031 crypto_skcipher_driver_name(s->skcipher_tfm));
1032 rc = -ENOMEM;
1033 goto out_free_unlock;
1034 }
1035
1036 skcipher_request_set_callback(s->skcipher_req,
1037 CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
1038
1033 /* The characters in the first block effectively do the job of 1039 /* The characters in the first block effectively do the job of
1034 * the IV here, so we just use 0's for the IV. Note the 1040 * the IV here, so we just use 0's for the IV. Note the
1035 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 1041 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
1036 * >= ECRYPTFS_MAX_IV_BYTES. */ 1042 * >= ECRYPTFS_MAX_IV_BYTES. */
1037 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
1038 s->desc.info = s->iv;
1039 /* TODO: Support other key modules than passphrase for 1043 /* TODO: Support other key modules than passphrase for
1040 * filename encryption */ 1044 * filename encryption */
1041 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { 1045 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
@@ -1044,8 +1048,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
1044 "password tokens\n", __func__); 1048 "password tokens\n", __func__);
1045 goto out_free_unlock; 1049 goto out_free_unlock;
1046 } 1050 }
1047 rc = crypto_blkcipher_setkey( 1051 rc = crypto_skcipher_setkey(
1048 s->desc.tfm, 1052 s->skcipher_tfm,
1049 s->auth_tok->token.password.session_key_encryption_key, 1053 s->auth_tok->token.password.session_key_encryption_key,
1050 mount_crypt_stat->global_default_fn_cipher_key_bytes); 1054 mount_crypt_stat->global_default_fn_cipher_key_bytes);
1051 if (rc < 0) { 1055 if (rc < 0) {
@@ -1058,14 +1062,14 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
1058 mount_crypt_stat->global_default_fn_cipher_key_bytes); 1062 mount_crypt_stat->global_default_fn_cipher_key_bytes);
1059 goto out_free_unlock; 1063 goto out_free_unlock;
1060 } 1064 }
1061 rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg, 1065 skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
1062 s->block_aligned_filename_size); 1066 s->block_aligned_filename_size, s->iv);
1067 rc = crypto_skcipher_decrypt(s->skcipher_req);
1063 if (rc) { 1068 if (rc) {
1064 printk(KERN_ERR "%s: Error attempting to decrypt filename; " 1069 printk(KERN_ERR "%s: Error attempting to decrypt filename; "
1065 "rc = [%d]\n", __func__, rc); 1070 "rc = [%d]\n", __func__, rc);
1066 goto out_free_unlock; 1071 goto out_free_unlock;
1067 } 1072 }
1068 s->i = 0;
1069 while (s->decrypted_filename[s->i] != '\0' 1073 while (s->decrypted_filename[s->i] != '\0'
1070 && s->i < s->block_aligned_filename_size) 1074 && s->i < s->block_aligned_filename_size)
1071 s->i++; 1075 s->i++;
@@ -1108,6 +1112,7 @@ out:
1108 up_write(&(auth_tok_key->sem)); 1112 up_write(&(auth_tok_key->sem));
1109 key_put(auth_tok_key); 1113 key_put(auth_tok_key);
1110 } 1114 }
1115 skcipher_request_free(s->skcipher_req);
1111 kfree(s); 1116 kfree(s);
1112 return rc; 1117 return rc;
1113} 1118}
@@ -1667,9 +1672,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1667 struct scatterlist dst_sg[2]; 1672 struct scatterlist dst_sg[2];
1668 struct scatterlist src_sg[2]; 1673 struct scatterlist src_sg[2];
1669 struct mutex *tfm_mutex; 1674 struct mutex *tfm_mutex;
1670 struct blkcipher_desc desc = { 1675 struct crypto_skcipher *tfm;
1671 .flags = CRYPTO_TFM_REQ_MAY_SLEEP 1676 struct skcipher_request *req = NULL;
1672 };
1673 int rc = 0; 1677 int rc = 0;
1674 1678
1675 if (unlikely(ecryptfs_verbosity > 0)) { 1679 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1680,7 +1684,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1680 auth_tok->token.password.session_key_encryption_key, 1684 auth_tok->token.password.session_key_encryption_key,
1681 auth_tok->token.password.session_key_encryption_key_bytes); 1685 auth_tok->token.password.session_key_encryption_key_bytes);
1682 } 1686 }
1683 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, 1687 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
1684 crypt_stat->cipher); 1688 crypt_stat->cipher);
1685 if (unlikely(rc)) { 1689 if (unlikely(rc)) {
1686 printk(KERN_ERR "Internal error whilst attempting to get " 1690 printk(KERN_ERR "Internal error whilst attempting to get "
@@ -1711,8 +1715,20 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1711 goto out; 1715 goto out;
1712 } 1716 }
1713 mutex_lock(tfm_mutex); 1717 mutex_lock(tfm_mutex);
1714 rc = crypto_blkcipher_setkey( 1718 req = skcipher_request_alloc(tfm, GFP_KERNEL);
1715 desc.tfm, auth_tok->token.password.session_key_encryption_key, 1719 if (!req) {
1720 mutex_unlock(tfm_mutex);
1721 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
1722 "skcipher_request_alloc for %s\n", __func__,
1723 crypto_skcipher_driver_name(tfm));
1724 rc = -ENOMEM;
1725 goto out;
1726 }
1727
1728 skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
1729 NULL, NULL);
1730 rc = crypto_skcipher_setkey(
1731 tfm, auth_tok->token.password.session_key_encryption_key,
1716 crypt_stat->key_size); 1732 crypt_stat->key_size);
1717 if (unlikely(rc < 0)) { 1733 if (unlikely(rc < 0)) {
1718 mutex_unlock(tfm_mutex); 1734 mutex_unlock(tfm_mutex);
@@ -1720,8 +1736,10 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1720 rc = -EINVAL; 1736 rc = -EINVAL;
1721 goto out; 1737 goto out;
1722 } 1738 }
1723 rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg, 1739 skcipher_request_set_crypt(req, src_sg, dst_sg,
1724 auth_tok->session_key.encrypted_key_size); 1740 auth_tok->session_key.encrypted_key_size,
1741 NULL);
1742 rc = crypto_skcipher_decrypt(req);
1725 mutex_unlock(tfm_mutex); 1743 mutex_unlock(tfm_mutex);
1726 if (unlikely(rc)) { 1744 if (unlikely(rc)) {
1727 printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc); 1745 printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc);
@@ -1738,6 +1756,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1738 crypt_stat->key_size); 1756 crypt_stat->key_size);
1739 } 1757 }
1740out: 1758out:
1759 skcipher_request_free(req);
1741 return rc; 1760 return rc;
1742} 1761}
1743 1762
@@ -2191,16 +2210,14 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2191 size_t max_packet_size; 2210 size_t max_packet_size;
2192 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 2211 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2193 crypt_stat->mount_crypt_stat; 2212 crypt_stat->mount_crypt_stat;
2194 struct blkcipher_desc desc = { 2213 struct crypto_skcipher *tfm;
2195 .tfm = NULL, 2214 struct skcipher_request *req;
2196 .flags = CRYPTO_TFM_REQ_MAY_SLEEP
2197 };
2198 int rc = 0; 2215 int rc = 0;
2199 2216
2200 (*packet_size) = 0; 2217 (*packet_size) = 0;
2201 ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature, 2218 ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature,
2202 ECRYPTFS_SIG_SIZE); 2219 ECRYPTFS_SIG_SIZE);
2203 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, 2220 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
2204 crypt_stat->cipher); 2221 crypt_stat->cipher);
2205 if (unlikely(rc)) { 2222 if (unlikely(rc)) {
2206 printk(KERN_ERR "Internal error whilst attempting to get " 2223 printk(KERN_ERR "Internal error whilst attempting to get "
@@ -2209,12 +2226,11 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2209 goto out; 2226 goto out;
2210 } 2227 }
2211 if (mount_crypt_stat->global_default_cipher_key_size == 0) { 2228 if (mount_crypt_stat->global_default_cipher_key_size == 0) {
2212 struct blkcipher_alg *alg = crypto_blkcipher_alg(desc.tfm);
2213
2214 printk(KERN_WARNING "No key size specified at mount; " 2229 printk(KERN_WARNING "No key size specified at mount; "
2215 "defaulting to [%d]\n", alg->max_keysize); 2230 "defaulting to [%d]\n",
2231 crypto_skcipher_default_keysize(tfm));
2216 mount_crypt_stat->global_default_cipher_key_size = 2232 mount_crypt_stat->global_default_cipher_key_size =
2217 alg->max_keysize; 2233 crypto_skcipher_default_keysize(tfm);
2218 } 2234 }
2219 if (crypt_stat->key_size == 0) 2235 if (crypt_stat->key_size == 0)
2220 crypt_stat->key_size = 2236 crypt_stat->key_size =
@@ -2284,20 +2300,36 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2284 goto out; 2300 goto out;
2285 } 2301 }
2286 mutex_lock(tfm_mutex); 2302 mutex_lock(tfm_mutex);
2287 rc = crypto_blkcipher_setkey(desc.tfm, session_key_encryption_key, 2303 rc = crypto_skcipher_setkey(tfm, session_key_encryption_key,
2288 crypt_stat->key_size); 2304 crypt_stat->key_size);
2289 if (rc < 0) { 2305 if (rc < 0) {
2290 mutex_unlock(tfm_mutex); 2306 mutex_unlock(tfm_mutex);
2291 ecryptfs_printk(KERN_ERR, "Error setting key for crypto " 2307 ecryptfs_printk(KERN_ERR, "Error setting key for crypto "
2292 "context; rc = [%d]\n", rc); 2308 "context; rc = [%d]\n", rc);
2293 goto out; 2309 goto out;
2294 } 2310 }
2311
2312 req = skcipher_request_alloc(tfm, GFP_KERNEL);
2313 if (!req) {
2314 mutex_unlock(tfm_mutex);
2315 ecryptfs_printk(KERN_ERR, "Out of kernel memory whilst "
2316 "attempting to skcipher_request_alloc for "
2317 "%s\n", crypto_skcipher_driver_name(tfm));
2318 rc = -ENOMEM;
2319 goto out;
2320 }
2321
2322 skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
2323 NULL, NULL);
2324
2295 rc = 0; 2325 rc = 0;
2296 ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n", 2326 ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
2297 crypt_stat->key_size); 2327 crypt_stat->key_size);
2298 rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, 2328 skcipher_request_set_crypt(req, src_sg, dst_sg,
2299 (*key_rec).enc_key_size); 2329 (*key_rec).enc_key_size, NULL);
2330 rc = crypto_skcipher_encrypt(req);
2300 mutex_unlock(tfm_mutex); 2331 mutex_unlock(tfm_mutex);
2332 skcipher_request_free(req);
2301 if (rc) { 2333 if (rc) {
2302 printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc); 2334 printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc);
2303 goto out; 2335 goto out;
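Every keystore call site converted above follows the same synchronous skcipher recipe: allocate a request against the shared tfm, mark it CRYPTO_TFM_REQ_MAY_SLEEP with no completion callback (eCryptfs assumes it gets a synchronous tfm, so -EINPROGRESS never surfaces), bind scatterlists and IV, run the operation, then free the request. A condensed sketch of that pattern with hypothetical names, doing one in-place encryption:

#include <crypto/skcipher.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

/* Assumes a synchronous tfm and a block-aligned 'len', as eCryptfs does. */
static int sketch_encrypt_in_place(struct crypto_skcipher *tfm,
				   u8 *buf, unsigned int len, u8 *iv)
{
	struct skcipher_request *req;
	struct scatterlist sg;
	int rc;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	/* No callback: completion is expected to be synchronous. */
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      NULL, NULL);
	sg_init_one(&sg, buf, len);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);

	rc = crypto_skcipher_encrypt(req);
	skcipher_request_free(req);
	return rc;
}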
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e25b6b06bacf..8b0b4a73116d 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -29,7 +29,6 @@
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/namei.h> 30#include <linux/namei.h>
31#include <linux/skbuff.h> 31#include <linux/skbuff.h>
32#include <linux/crypto.h>
33#include <linux/mount.h> 32#include <linux/mount.h>
34#include <linux/pagemap.h> 33#include <linux/pagemap.h>
35#include <linux/key.h> 34#include <linux/key.h>
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index c6ced4cbf0cf..1f5865263b3e 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -30,7 +30,6 @@
30#include <linux/page-flags.h> 30#include <linux/page-flags.h>
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 33#include <linux/scatterlist.h>
35#include <linux/slab.h> 34#include <linux/slab.h>
36#include <asm/unaligned.h> 35#include <asm/unaligned.h>
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index afa1b81c3418..77a486d3a51b 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -29,7 +29,6 @@
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/seq_file.h> 30#include <linux/seq_file.h>
31#include <linux/file.h> 31#include <linux/file.h>
32#include <linux/crypto.h>
33#include <linux/statfs.h> 32#include <linux/statfs.h>
34#include <linux/magic.h> 33#include <linux/magic.h>
35#include "ecryptfs_kernel.h" 34#include "ecryptfs_kernel.h"
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ed70cf9fdc7b..1231cd1999d8 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -121,8 +121,46 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
121 u64 count; 121 u64 count;
122 122
123 poll_wait(file, &ctx->wqh, wait); 123 poll_wait(file, &ctx->wqh, wait);
124 smp_rmb(); 124
125 count = ctx->count; 125 /*
126 * All writes to ctx->count occur within ctx->wqh.lock. This read
127 * can be done outside ctx->wqh.lock because we know that poll_wait
128 * takes that lock (through add_wait_queue) if our caller will sleep.
129 *
130 * The read _can_ therefore seep into add_wait_queue's critical
131 * section, but cannot move above it! add_wait_queue's spin_lock acts
132 * as an acquire barrier and ensures that the read be ordered properly
133 * against the writes. The following CAN happen and is safe:
134 *
135 * poll write
136 * ----------------- ------------
137 * lock ctx->wqh.lock (in poll_wait)
138 * count = ctx->count
139 * __add_wait_queue
140 * unlock ctx->wqh.lock
141 * lock ctx->wqh.lock
142 * ctx->count += n
143 * if (waitqueue_active)
144 * wake_up_locked_poll
145 * unlock ctx->wqh.lock
146 * eventfd_poll returns 0
147 *
148 * but the following, which would miss a wakeup, cannot happen:
149 *
150 * poll write
151 * ----------------- ------------
152 * count = ctx->count (INVALID!)
153 * lock ctx->wqh.lock
154 * ctx->count += n
155 * **waitqueue_active is false**
156 * **no wake_up_locked_poll!**
157 * unlock ctx->wqh.lock
158 * lock ctx->wqh.lock (in poll_wait)
159 * __add_wait_queue
160 * unlock ctx->wqh.lock
161 * eventfd_poll returns 0
162 */
163 count = READ_ONCE(ctx->count);
126 164
127 if (count > 0) 165 if (count > 0)
128 events |= POLLIN; 166 events |= POLLIN;
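For reference, the writer this comment pairs against only ever touches ctx->count inside ctx->wqh.lock, which is what makes the lockless READ_ONCE() in eventfd_poll() safe. A sketch of that writer side, assuming struct eventfd_ctx's fields as used in this file and eliding overflow clamping:

#include <linux/poll.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

static void sketch_eventfd_signal(struct eventfd_ctx *ctx, u64 n)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	ctx->count += n;			/* publish under the lock */
	if (waitqueue_active(&ctx->wqh))	/* poller already queued? */
		wake_up_locked_poll(&ctx->wqh, POLLIN);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
}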
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cde60741cad2..8a74a2a52e0f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1616,7 +1616,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1616{ 1616{
1617 int res = 0, eavail, timed_out = 0; 1617 int res = 0, eavail, timed_out = 0;
1618 unsigned long flags; 1618 unsigned long flags;
1619 long slack = 0; 1619 u64 slack = 0;
1620 wait_queue_t wait; 1620 wait_queue_t wait;
1621 ktime_t expires, *to = NULL; 1621 ktime_t expires, *to = NULL;
1622 1622
diff --git a/fs/exec.c b/fs/exec.c
index dcd4ac7d3f1e..c4010b8207a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,6 +56,7 @@
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h> 57#include <linux/oom.h>
58#include <linux/compat.h> 58#include <linux/compat.h>
59#include <linux/vmalloc.h>
59 60
60#include <asm/uaccess.h> 61#include <asm/uaccess.h>
61#include <asm/mmu_context.h> 62#include <asm/mmu_context.h>
@@ -198,8 +199,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
198 return NULL; 199 return NULL;
199 } 200 }
200#endif 201#endif
201 ret = get_user_pages(current, bprm->mm, pos, 202 /*
202 1, write, 1, &page, NULL); 203 * We are doing an exec(). 'current' is the process
204 * doing the exec and bprm->mm is the new process's mm.
205 */
206 ret = get_user_pages_remote(current, bprm->mm, pos, 1, write,
207 1, &page, NULL);
203 if (ret <= 0) 208 if (ret <= 0)
204 return NULL; 209 return NULL;
205 210
@@ -831,6 +836,97 @@ int kernel_read(struct file *file, loff_t offset,
831 836
832EXPORT_SYMBOL(kernel_read); 837EXPORT_SYMBOL(kernel_read);
833 838
839int kernel_read_file(struct file *file, void **buf, loff_t *size,
840 loff_t max_size, enum kernel_read_file_id id)
841{
842 loff_t i_size, pos;
843 ssize_t bytes = 0;
844 int ret;
845
846 if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
847 return -EINVAL;
848
849 ret = security_kernel_read_file(file, id);
850 if (ret)
851 return ret;
852
853 i_size = i_size_read(file_inode(file));
854 if (max_size > 0 && i_size > max_size)
855 return -EFBIG;
856 if (i_size <= 0)
857 return -EINVAL;
858
859 *buf = vmalloc(i_size);
860 if (!*buf)
861 return -ENOMEM;
862
863 pos = 0;
864 while (pos < i_size) {
865 bytes = kernel_read(file, pos, (char *)(*buf) + pos,
866 i_size - pos);
867 if (bytes < 0) {
868 ret = bytes;
869 goto out;
870 }
871
872 if (bytes == 0)
873 break;
874 pos += bytes;
875 }
876
877 if (pos != i_size) {
878 ret = -EIO;
879 goto out;
880 }
881
882 ret = security_kernel_post_read_file(file, *buf, i_size, id);
883 if (!ret)
884 *size = pos;
885
886out:
887 if (ret < 0) {
888 vfree(*buf);
889 *buf = NULL;
890 }
891 return ret;
892}
893EXPORT_SYMBOL_GPL(kernel_read_file);
894
895int kernel_read_file_from_path(char *path, void **buf, loff_t *size,
896 loff_t max_size, enum kernel_read_file_id id)
897{
898 struct file *file;
899 int ret;
900
901 if (!path || !*path)
902 return -EINVAL;
903
904 file = filp_open(path, O_RDONLY, 0);
905 if (IS_ERR(file))
906 return PTR_ERR(file);
907
908 ret = kernel_read_file(file, buf, size, max_size, id);
909 fput(file);
910 return ret;
911}
912EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
913
914int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
915 enum kernel_read_file_id id)
916{
917 struct fd f = fdget(fd);
918 int ret = -EBADF;
919
920 if (!f.file)
921 goto out;
922
923 ret = kernel_read_file(f.file, buf, size, max_size, id);
924out:
925 fdput(f);
926 return ret;
927}
928EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
929
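A hypothetical consumer of the new helpers, reading a bounded blob by path. On success the helper leaves a vmalloc()ed buffer in *buf, which the caller owns and must vfree(); both LSM read hooks have already vetted the file:

#include <linux/fs.h>
#include <linux/vmalloc.h>

static int sketch_load_blob(const char *path)
{
	void *data = NULL;
	loff_t size = 0;
	int rc;

	/* Cap the read at 1 MiB; larger files fail with -EFBIG. */
	rc = kernel_read_file_from_path((char *)path, &data, &size,
					1024 * 1024, READING_POLICY);
	if (rc)
		return rc;

	/* ... consume 'size' bytes at 'data' ... */

	vfree(data);
	return 0;
}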
834ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) 930ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
835{ 931{
836 ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); 932 ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 4c69c94cafd8..170939f379d7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -61,6 +61,8 @@ struct ext2_block_alloc_info {
61#define rsv_start rsv_window._rsv_start 61#define rsv_start rsv_window._rsv_start
62#define rsv_end rsv_window._rsv_end 62#define rsv_end rsv_window._rsv_end
63 63
64struct mb_cache;
65
64/* 66/*
65 * second extended-fs super-block data in memory 67 * second extended-fs super-block data in memory
66 */ 68 */
@@ -111,6 +113,7 @@ struct ext2_sb_info {
111 * of the mount options. 113 * of the mount options.
112 */ 114 */
113 spinlock_t s_lock; 115 spinlock_t s_lock;
116 struct mb_cache *s_mb_cache;
114}; 117};
115 118
116static inline spinlock_t * 119static inline spinlock_t *
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 2a188413a2b0..b78caf25f746 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb)
131 131
132 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 132 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
133 133
134 ext2_xattr_put_super(sb); 134 if (sbi->s_mb_cache) {
135 ext2_xattr_destroy_cache(sbi->s_mb_cache);
136 sbi->s_mb_cache = NULL;
137 }
135 if (!(sb->s_flags & MS_RDONLY)) { 138 if (!(sb->s_flags & MS_RDONLY)) {
136 struct ext2_super_block *es = sbi->s_es; 139 struct ext2_super_block *es = sbi->s_es;
137 140
@@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1104 ext2_msg(sb, KERN_ERR, "error: insufficient memory"); 1107 ext2_msg(sb, KERN_ERR, "error: insufficient memory");
1105 goto failed_mount3; 1108 goto failed_mount3;
1106 } 1109 }
1110
1111#ifdef CONFIG_EXT2_FS_XATTR
1112 sbi->s_mb_cache = ext2_xattr_create_cache();
1113 if (!sbi->s_mb_cache) {
1114 ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache");
1115 goto failed_mount3;
1116 }
1117#endif
1107 /* 1118 /*
1108 * set up enough so that it can read an inode 1119 * set up enough so that it can read an inode
1109 */ 1120 */
@@ -1149,6 +1160,8 @@ cantfind_ext2:
1149 sb->s_id); 1160 sb->s_id);
1150 goto failed_mount; 1161 goto failed_mount;
1151failed_mount3: 1162failed_mount3:
1163 if (sbi->s_mb_cache)
1164 ext2_xattr_destroy_cache(sbi->s_mb_cache);
1152 percpu_counter_destroy(&sbi->s_freeblocks_counter); 1165 percpu_counter_destroy(&sbi->s_freeblocks_counter);
1153 percpu_counter_destroy(&sbi->s_freeinodes_counter); 1166 percpu_counter_destroy(&sbi->s_freeinodes_counter);
1154 percpu_counter_destroy(&sbi->s_dirs_counter); 1167 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2");
1555 1568
1556static int __init init_ext2_fs(void) 1569static int __init init_ext2_fs(void)
1557{ 1570{
1558 int err = init_ext2_xattr(); 1571 int err;
1559 if (err) 1572
1560 return err;
1561 err = init_inodecache(); 1573 err = init_inodecache();
1562 if (err) 1574 if (err)
1563 goto out1; 1575 return err;
1564 err = register_filesystem(&ext2_fs_type); 1576 err = register_filesystem(&ext2_fs_type);
1565 if (err) 1577 if (err)
1566 goto out; 1578 goto out;
1567 return 0; 1579 return 0;
1568out: 1580out:
1569 destroy_inodecache(); 1581 destroy_inodecache();
1570out1:
1571 exit_ext2_xattr();
1572 return err; 1582 return err;
1573} 1583}
1574 1584
@@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void)
1576{ 1586{
1577 unregister_filesystem(&ext2_fs_type); 1587 unregister_filesystem(&ext2_fs_type);
1578 destroy_inodecache(); 1588 destroy_inodecache();
1579 exit_ext2_xattr();
1580} 1589}
1581 1590
1582MODULE_AUTHOR("Remy Card and others"); 1591MODULE_AUTHOR("Remy Card and others");
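The net effect of the super.c hunks is that the xattr block cache becomes per-superblock state: created in ext2_fill_super(), torn down in ext2_put_super(), with the module-wide init_ext2_xattr()/exit_ext2_xattr() pair and the mb_cache_shrink() call at unmount gone. A sketch of the resulting lifecycle, using the helper names from the hunks above:

#include "ext2.h"
#include "xattr.h"

static int sketch_attach_xattr_cache(struct ext2_sb_info *sbi)
{
	sbi->s_mb_cache = ext2_xattr_create_cache();	/* mb_cache_create(10) */
	return sbi->s_mb_cache ? 0 : -ENOMEM;
}

static void sketch_detach_xattr_cache(struct ext2_sb_info *sbi)
{
	/* Entries die with their cache; no global shrink pass needed. */
	if (sbi->s_mb_cache) {
		ext2_xattr_destroy_cache(sbi->s_mb_cache);
		sbi->s_mb_cache = NULL;
	}
}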
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f57a7aba32eb..1a5e3bff0b63 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -90,14 +90,12 @@
90static int ext2_xattr_set2(struct inode *, struct buffer_head *, 90static int ext2_xattr_set2(struct inode *, struct buffer_head *,
91 struct ext2_xattr_header *); 91 struct ext2_xattr_header *);
92 92
93static int ext2_xattr_cache_insert(struct buffer_head *); 93static int ext2_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
94static struct buffer_head *ext2_xattr_cache_find(struct inode *, 94static struct buffer_head *ext2_xattr_cache_find(struct inode *,
95 struct ext2_xattr_header *); 95 struct ext2_xattr_header *);
96static void ext2_xattr_rehash(struct ext2_xattr_header *, 96static void ext2_xattr_rehash(struct ext2_xattr_header *,
97 struct ext2_xattr_entry *); 97 struct ext2_xattr_entry *);
98 98
99static struct mb_cache *ext2_xattr_cache;
100
101static const struct xattr_handler *ext2_xattr_handler_map[] = { 99static const struct xattr_handler *ext2_xattr_handler_map[] = {
102 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, 100 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
103#ifdef CONFIG_EXT2_FS_POSIX_ACL 101#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -152,6 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
152 size_t name_len, size; 150 size_t name_len, size;
153 char *end; 151 char *end;
154 int error; 152 int error;
153 struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
155 154
156 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", 155 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
157 name_index, name, buffer, (long)buffer_size); 156 name_index, name, buffer, (long)buffer_size);
@@ -196,7 +195,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
196 goto found; 195 goto found;
197 entry = next; 196 entry = next;
198 } 197 }
199 if (ext2_xattr_cache_insert(bh)) 198 if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
200 ea_idebug(inode, "cache insert failed"); 199 ea_idebug(inode, "cache insert failed");
201 error = -ENODATA; 200 error = -ENODATA;
202 goto cleanup; 201 goto cleanup;
@@ -209,7 +208,7 @@ found:
209 le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) 208 le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
210 goto bad_block; 209 goto bad_block;
211 210
212 if (ext2_xattr_cache_insert(bh)) 211 if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
213 ea_idebug(inode, "cache insert failed"); 212 ea_idebug(inode, "cache insert failed");
214 if (buffer) { 213 if (buffer) {
215 error = -ERANGE; 214 error = -ERANGE;
@@ -247,6 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
247 char *end; 246 char *end;
248 size_t rest = buffer_size; 247 size_t rest = buffer_size;
249 int error; 248 int error;
249 struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
250 250
251 ea_idebug(inode, "buffer=%p, buffer_size=%ld", 251 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
252 buffer, (long)buffer_size); 252 buffer, (long)buffer_size);
@@ -281,7 +281,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
281 goto bad_block; 281 goto bad_block;
282 entry = next; 282 entry = next;
283 } 283 }
284 if (ext2_xattr_cache_insert(bh)) 284 if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
285 ea_idebug(inode, "cache insert failed"); 285 ea_idebug(inode, "cache insert failed");
286 286
287 /* list the attribute names */ 287 /* list the attribute names */
@@ -483,22 +483,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
483 /* Here we know that we can set the new attribute. */ 483 /* Here we know that we can set the new attribute. */
484 484
485 if (header) { 485 if (header) {
486 struct mb_cache_entry *ce;
487
488 /* assert(header == HDR(bh)); */ 486 /* assert(header == HDR(bh)); */
489 ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev,
490 bh->b_blocknr);
491 lock_buffer(bh); 487 lock_buffer(bh);
492 if (header->h_refcount == cpu_to_le32(1)) { 488 if (header->h_refcount == cpu_to_le32(1)) {
489 __u32 hash = le32_to_cpu(header->h_hash);
490
493 ea_bdebug(bh, "modifying in-place"); 491 ea_bdebug(bh, "modifying in-place");
494 if (ce) 492 /*
495 mb_cache_entry_free(ce); 493 * This must happen under buffer lock for
494 * ext2_xattr_set2() to reliably detect modified block
495 */
496 mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
497 hash, bh->b_blocknr);
498
496 /* keep the buffer locked while modifying it. */ 499 /* keep the buffer locked while modifying it. */
497 } else { 500 } else {
498 int offset; 501 int offset;
499 502
500 if (ce)
501 mb_cache_entry_release(ce);
502 unlock_buffer(bh); 503 unlock_buffer(bh);
503 ea_bdebug(bh, "cloning"); 504 ea_bdebug(bh, "cloning");
504 header = kmalloc(bh->b_size, GFP_KERNEL); 505 header = kmalloc(bh->b_size, GFP_KERNEL);
@@ -626,6 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
626 struct super_block *sb = inode->i_sb; 627 struct super_block *sb = inode->i_sb;
627 struct buffer_head *new_bh = NULL; 628 struct buffer_head *new_bh = NULL;
628 int error; 629 int error;
630 struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache;
629 631
630 if (header) { 632 if (header) {
631 new_bh = ext2_xattr_cache_find(inode, header); 633 new_bh = ext2_xattr_cache_find(inode, header);
@@ -653,7 +655,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
653 don't need to change the reference count. */ 655 don't need to change the reference count. */
654 new_bh = old_bh; 656 new_bh = old_bh;
655 get_bh(new_bh); 657 get_bh(new_bh);
656 ext2_xattr_cache_insert(new_bh); 658 ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
657 } else { 659 } else {
658 /* We need to allocate a new block */ 660 /* We need to allocate a new block */
659 ext2_fsblk_t goal = ext2_group_first_block_no(sb, 661 ext2_fsblk_t goal = ext2_group_first_block_no(sb,
@@ -674,7 +676,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
674 memcpy(new_bh->b_data, header, new_bh->b_size); 676 memcpy(new_bh->b_data, header, new_bh->b_size);
675 set_buffer_uptodate(new_bh); 677 set_buffer_uptodate(new_bh);
676 unlock_buffer(new_bh); 678 unlock_buffer(new_bh);
677 ext2_xattr_cache_insert(new_bh); 679 ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
678 680
679 ext2_xattr_update_super_block(sb); 681 ext2_xattr_update_super_block(sb);
680 } 682 }
@@ -707,19 +709,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
707 709
708 error = 0; 710 error = 0;
709 if (old_bh && old_bh != new_bh) { 711 if (old_bh && old_bh != new_bh) {
710 struct mb_cache_entry *ce;
711
712 /* 712 /*
713 * If there was an old block and we are no longer using it, 713 * If there was an old block and we are no longer using it,
714 * release the old block. 714 * release the old block.
715 */ 715 */
716 ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev,
717 old_bh->b_blocknr);
718 lock_buffer(old_bh); 716 lock_buffer(old_bh);
719 if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { 717 if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
718 __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash);
719
720 /*
721 * This must happen under buffer lock for
722 * ext2_xattr_set2() to reliably detect freed block
723 */
724 mb_cache_entry_delete_block(ext2_mb_cache,
725 hash, old_bh->b_blocknr);
720 /* Free the old block. */ 726 /* Free the old block. */
721 if (ce)
722 mb_cache_entry_free(ce);
723 ea_bdebug(old_bh, "freeing"); 727 ea_bdebug(old_bh, "freeing");
724 ext2_free_blocks(inode, old_bh->b_blocknr, 1); 728 ext2_free_blocks(inode, old_bh->b_blocknr, 1);
725 mark_inode_dirty(inode); 729 mark_inode_dirty(inode);
@@ -730,8 +734,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
730 } else { 734 } else {
731 /* Decrement the refcount only. */ 735 /* Decrement the refcount only. */
732 le32_add_cpu(&HDR(old_bh)->h_refcount, -1); 736 le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
733 if (ce)
734 mb_cache_entry_release(ce);
735 dquot_free_block_nodirty(inode, 1); 737 dquot_free_block_nodirty(inode, 1);
736 mark_inode_dirty(inode); 738 mark_inode_dirty(inode);
737 mark_buffer_dirty(old_bh); 739 mark_buffer_dirty(old_bh);
@@ -757,7 +759,6 @@ void
757ext2_xattr_delete_inode(struct inode *inode) 759ext2_xattr_delete_inode(struct inode *inode)
758{ 760{
759 struct buffer_head *bh = NULL; 761 struct buffer_head *bh = NULL;
760 struct mb_cache_entry *ce;
761 762
762 down_write(&EXT2_I(inode)->xattr_sem); 763 down_write(&EXT2_I(inode)->xattr_sem);
763 if (!EXT2_I(inode)->i_file_acl) 764 if (!EXT2_I(inode)->i_file_acl)
@@ -777,19 +778,22 @@ ext2_xattr_delete_inode(struct inode *inode)
777 EXT2_I(inode)->i_file_acl); 778 EXT2_I(inode)->i_file_acl);
778 goto cleanup; 779 goto cleanup;
779 } 780 }
780 ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr);
781 lock_buffer(bh); 781 lock_buffer(bh);
782 if (HDR(bh)->h_refcount == cpu_to_le32(1)) { 782 if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
783 if (ce) 783 __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
784 mb_cache_entry_free(ce); 784
785 /*
786 * This must happen under buffer lock for ext2_xattr_set2() to
787 * reliably detect freed block
788 */
789 mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
790 hash, bh->b_blocknr);
785 ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); 791 ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
786 get_bh(bh); 792 get_bh(bh);
787 bforget(bh); 793 bforget(bh);
788 unlock_buffer(bh); 794 unlock_buffer(bh);
789 } else { 795 } else {
790 le32_add_cpu(&HDR(bh)->h_refcount, -1); 796 le32_add_cpu(&HDR(bh)->h_refcount, -1);
791 if (ce)
792 mb_cache_entry_release(ce);
793 ea_bdebug(bh, "refcount now=%d", 797 ea_bdebug(bh, "refcount now=%d",
794 le32_to_cpu(HDR(bh)->h_refcount)); 798 le32_to_cpu(HDR(bh)->h_refcount));
795 unlock_buffer(bh); 799 unlock_buffer(bh);
@@ -806,18 +810,6 @@ cleanup:
806} 810}
807 811
808/* 812/*
809 * ext2_xattr_put_super()
810 *
811 * This is called when a file system is unmounted.
812 */
813void
814ext2_xattr_put_super(struct super_block *sb)
815{
816 mb_cache_shrink(sb->s_bdev);
817}
818
819
820/*
821 * ext2_xattr_cache_insert() 813 * ext2_xattr_cache_insert()
822 * 814 *
823 * Create a new entry in the extended attribute cache, and insert 815 * Create a new entry in the extended attribute cache, and insert
@@ -826,28 +818,20 @@ ext2_xattr_put_super(struct super_block *sb)
826 * Returns 0, or a negative error number on failure. 818 * Returns 0, or a negative error number on failure.
827 */ 819 */
828static int 820static int
829ext2_xattr_cache_insert(struct buffer_head *bh) 821ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
830{ 822{
831 __u32 hash = le32_to_cpu(HDR(bh)->h_hash); 823 __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
832 struct mb_cache_entry *ce;
833 int error; 824 int error;
834 825
835 ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); 826 error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
836 if (!ce)
837 return -ENOMEM;
838 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
839 if (error) { 827 if (error) {
840 mb_cache_entry_free(ce);
841 if (error == -EBUSY) { 828 if (error == -EBUSY) {
842 ea_bdebug(bh, "already in cache (%d cache entries)", 829 ea_bdebug(bh, "already in cache (%d cache entries)",
843 atomic_read(&ext2_xattr_cache->c_entry_count)); 830 atomic_read(&ext2_xattr_cache->c_entry_count));
844 error = 0; 831 error = 0;
845 } 832 }
846 } else { 833 } else
847 ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, 834 ea_bdebug(bh, "inserting [%x]", (int)hash);
848 atomic_read(&ext2_xattr_cache->c_entry_count));
849 mb_cache_entry_release(ce);
850 }
851 return error; 835 return error;
852} 836}
853 837
@@ -904,22 +888,16 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
904{ 888{
905 __u32 hash = le32_to_cpu(header->h_hash); 889 __u32 hash = le32_to_cpu(header->h_hash);
906 struct mb_cache_entry *ce; 890 struct mb_cache_entry *ce;
891 struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
907 892
908 if (!header->h_hash) 893 if (!header->h_hash)
909 return NULL; /* never share */ 894 return NULL; /* never share */
910 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 895 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
911again: 896again:
912 ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev, 897 ce = mb_cache_entry_find_first(ext2_mb_cache, hash);
913 hash);
914 while (ce) { 898 while (ce) {
915 struct buffer_head *bh; 899 struct buffer_head *bh;
916 900
917 if (IS_ERR(ce)) {
918 if (PTR_ERR(ce) == -EAGAIN)
919 goto again;
920 break;
921 }
922
923 bh = sb_bread(inode->i_sb, ce->e_block); 901 bh = sb_bread(inode->i_sb, ce->e_block);
924 if (!bh) { 902 if (!bh) {
925 ext2_error(inode->i_sb, "ext2_xattr_cache_find", 903 ext2_error(inode->i_sb, "ext2_xattr_cache_find",
@@ -927,7 +905,21 @@ again:
927 inode->i_ino, (unsigned long) ce->e_block); 905 inode->i_ino, (unsigned long) ce->e_block);
928 } else { 906 } else {
929 lock_buffer(bh); 907 lock_buffer(bh);
930 if (le32_to_cpu(HDR(bh)->h_refcount) > 908 /*
909 * We have to be careful about races with freeing or
910 * rehashing of xattr block. Once we hold buffer lock
911 * xattr block's state is stable so we can check
912 * whether the block got freed / rehashed or not.
913 * Since we unhash mbcache entry under buffer lock when
914 * freeing / rehashing xattr block, checking whether
915 * entry is still hashed is reliable.
916 */
917 if (hlist_bl_unhashed(&ce->e_hash_list)) {
918 mb_cache_entry_put(ext2_mb_cache, ce);
919 unlock_buffer(bh);
920 brelse(bh);
921 goto again;
922 } else if (le32_to_cpu(HDR(bh)->h_refcount) >
931 EXT2_XATTR_REFCOUNT_MAX) { 923 EXT2_XATTR_REFCOUNT_MAX) {
932 ea_idebug(inode, "block %ld refcount %d>%d", 924 ea_idebug(inode, "block %ld refcount %d>%d",
933 (unsigned long) ce->e_block, 925 (unsigned long) ce->e_block,
@@ -936,13 +928,14 @@ again:
936 } else if (!ext2_xattr_cmp(header, HDR(bh))) { 928 } else if (!ext2_xattr_cmp(header, HDR(bh))) {
937 ea_bdebug(bh, "b_count=%d", 929 ea_bdebug(bh, "b_count=%d",
938 atomic_read(&(bh->b_count))); 930 atomic_read(&(bh->b_count)));
939 mb_cache_entry_release(ce); 931 mb_cache_entry_touch(ext2_mb_cache, ce);
932 mb_cache_entry_put(ext2_mb_cache, ce);
940 return bh; 933 return bh;
941 } 934 }
942 unlock_buffer(bh); 935 unlock_buffer(bh);
943 brelse(bh); 936 brelse(bh);
944 } 937 }
945 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); 938 ce = mb_cache_entry_find_next(ext2_mb_cache, ce);
946 } 939 }
947 return NULL; 940 return NULL;
948} 941}
@@ -1015,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
1015 1008
1016#undef BLOCK_HASH_SHIFT 1009#undef BLOCK_HASH_SHIFT
1017 1010
1018int __init 1011#define HASH_BUCKET_BITS 10
1019init_ext2_xattr(void) 1012
1013struct mb_cache *ext2_xattr_create_cache(void)
1020{ 1014{
1021 ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); 1015 return mb_cache_create(HASH_BUCKET_BITS);
1022 if (!ext2_xattr_cache)
1023 return -ENOMEM;
1024 return 0;
1025} 1016}
1026 1017
1027void 1018void ext2_xattr_destroy_cache(struct mb_cache *cache)
1028exit_ext2_xattr(void)
1029{ 1019{
1030 mb_cache_destroy(ext2_xattr_cache); 1020 if (cache)
1021 mb_cache_destroy(cache);
1031} 1022}
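The recurring pattern in these xattr.c hunks: an mbcache entry is unhashed under the xattr block's buffer lock whenever the block is freed or modified in place, so ext2_xattr_cache_find() can treat "entry still hashed" as "block still valid" and retry once hlist_bl_unhashed() fires. A sketch of the release side, assuming this file's HDR() accessor and header layout, with the actual block freeing elided:

#include <linux/buffer_head.h>
#include <linux/mbcache.h>

static void sketch_release_xattr_block(struct mb_cache *cache,
				       struct buffer_head *bh)
{
	lock_buffer(bh);
	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
		__u32 hash = le32_to_cpu(HDR(bh)->h_hash);

		/* Unhash under the buffer lock so a racing cache_find()
		 * observes hlist_bl_unhashed() and retries the lookup. */
		mb_cache_entry_delete_block(cache, hash, bh->b_blocknr);
		/* ... free the on-disk block ... */
	} else {
		le32_add_cpu(&HDR(bh)->h_refcount, -1);	/* just drop a ref */
	}
	unlock_buffer(bh);
}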
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 60edf298644e..6f82ab1b00ca 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -53,6 +53,8 @@ struct ext2_xattr_entry {
53#define EXT2_XATTR_SIZE(size) \ 53#define EXT2_XATTR_SIZE(size) \
54 (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) 54 (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
55 55
56struct mb_cache;
57
56# ifdef CONFIG_EXT2_FS_XATTR 58# ifdef CONFIG_EXT2_FS_XATTR
57 59
58extern const struct xattr_handler ext2_xattr_user_handler; 60extern const struct xattr_handler ext2_xattr_user_handler;
@@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
65extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); 67extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
66 68
67extern void ext2_xattr_delete_inode(struct inode *); 69extern void ext2_xattr_delete_inode(struct inode *);
68extern void ext2_xattr_put_super(struct super_block *);
69 70
70extern int init_ext2_xattr(void); 71extern struct mb_cache *ext2_xattr_create_cache(void);
71extern void exit_ext2_xattr(void); 72extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
72 73
73extern const struct xattr_handler *ext2_xattr_handlers[]; 74extern const struct xattr_handler *ext2_xattr_handlers[];
74 75
@@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode)
93{ 94{
94} 95}
95 96
96static inline void 97static inline void ext2_xattr_destroy_cache(struct mb_cache *cache)
97ext2_xattr_put_super(struct super_block *sb)
98{
99}
100
101static inline int
102init_ext2_xattr(void)
103{
104 return 0;
105}
106
107static inline void
108exit_ext2_xattr(void)
109{ 98{
110} 99}
111 100
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 38f7562489bb..edc053a81914 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -18,11 +18,9 @@
18 * Special Publication 800-38E and IEEE P1619/D16. 18 * Special Publication 800-38E and IEEE P1619/D16.
19 */ 19 */
20 20
21#include <crypto/hash.h> 21#include <crypto/skcipher.h>
22#include <crypto/sha.h>
23#include <keys/user-type.h> 22#include <keys/user-type.h>
24#include <keys/encrypted-type.h> 23#include <keys/encrypted-type.h>
25#include <linux/crypto.h>
26#include <linux/ecryptfs.h> 24#include <linux/ecryptfs.h>
27#include <linux/gfp.h> 25#include <linux/gfp.h>
28#include <linux/kernel.h> 26#include <linux/kernel.h>
@@ -261,21 +259,21 @@ static int ext4_page_crypto(struct inode *inode,
261 259
262{ 260{
263 u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; 261 u8 xts_tweak[EXT4_XTS_TWEAK_SIZE];
264 struct ablkcipher_request *req = NULL; 262 struct skcipher_request *req = NULL;
265 DECLARE_EXT4_COMPLETION_RESULT(ecr); 263 DECLARE_EXT4_COMPLETION_RESULT(ecr);
266 struct scatterlist dst, src; 264 struct scatterlist dst, src;
267 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 265 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
268 struct crypto_ablkcipher *tfm = ci->ci_ctfm; 266 struct crypto_skcipher *tfm = ci->ci_ctfm;
269 int res = 0; 267 int res = 0;
270 268
271 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 269 req = skcipher_request_alloc(tfm, GFP_NOFS);
272 if (!req) { 270 if (!req) {
273 printk_ratelimited(KERN_ERR 271 printk_ratelimited(KERN_ERR
274 "%s: crypto_request_alloc() failed\n", 272 "%s: crypto_request_alloc() failed\n",
275 __func__); 273 __func__);
276 return -ENOMEM; 274 return -ENOMEM;
277 } 275 }
278 ablkcipher_request_set_callback( 276 skcipher_request_set_callback(
279 req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 277 req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
280 ext4_crypt_complete, &ecr); 278 ext4_crypt_complete, &ecr);
281 279
@@ -288,21 +286,21 @@ static int ext4_page_crypto(struct inode *inode,
288 sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); 286 sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
289 sg_init_table(&src, 1); 287 sg_init_table(&src, 1);
290 sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); 288 sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
291 ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, 289 skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
292 xts_tweak); 290 xts_tweak);
293 if (rw == EXT4_DECRYPT) 291 if (rw == EXT4_DECRYPT)
294 res = crypto_ablkcipher_decrypt(req); 292 res = crypto_skcipher_decrypt(req);
295 else 293 else
296 res = crypto_ablkcipher_encrypt(req); 294 res = crypto_skcipher_encrypt(req);
297 if (res == -EINPROGRESS || res == -EBUSY) { 295 if (res == -EINPROGRESS || res == -EBUSY) {
298 wait_for_completion(&ecr.completion); 296 wait_for_completion(&ecr.completion);
299 res = ecr.res; 297 res = ecr.res;
300 } 298 }
301 ablkcipher_request_free(req); 299 skcipher_request_free(req);
302 if (res) { 300 if (res) {
303 printk_ratelimited( 301 printk_ratelimited(
304 KERN_ERR 302 KERN_ERR
305 "%s: crypto_ablkcipher_encrypt() returned %d\n", 303 "%s: crypto_skcipher_encrypt() returned %d\n",
306 __func__, res); 304 __func__, res);
307 return res; 305 return res;
308 } 306 }
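Because a skcipher request may be serviced asynchronously, ext4 keeps its completion-based wait: the callback records the final status and signals completion, while -EINPROGRESS notifications for backlogged requests are ignored. A sketch of that plumbing (names hypothetical; it mirrors ext4_crypt_complete and the wait loop above):

#include <linux/completion.h>
#include <linux/crypto.h>

struct sketch_crypt_result {
	struct completion completion;
	int res;
};

static void sketch_crypt_complete(struct crypto_async_request *req, int res)
{
	struct sketch_crypt_result *ecr = req->data;

	if (res == -EINPROGRESS)
		return;		/* backlogged request accepted; not final */
	ecr->res = res;
	complete(&ecr->completion);
}

/* Caller side, after crypto_skcipher_encrypt()/decrypt():
 *
 *	if (res == -EINPROGRESS || res == -EBUSY) {
 *		wait_for_completion(&ecr.completion);
 *		res = ecr.res;
 *	}
 */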
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index 2fbef8a14760..1a2f360405db 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -11,11 +11,9 @@
11 * 11 *
12 */ 12 */
13 13
14#include <crypto/hash.h> 14#include <crypto/skcipher.h>
15#include <crypto/sha.h>
16#include <keys/encrypted-type.h> 15#include <keys/encrypted-type.h>
17#include <keys/user-type.h> 16#include <keys/user-type.h>
18#include <linux/crypto.h>
19#include <linux/gfp.h> 17#include <linux/gfp.h>
20#include <linux/kernel.h> 18#include <linux/kernel.h>
21#include <linux/key.h> 19#include <linux/key.h>
@@ -65,10 +63,10 @@ static int ext4_fname_encrypt(struct inode *inode,
65 struct ext4_str *oname) 63 struct ext4_str *oname)
66{ 64{
67 u32 ciphertext_len; 65 u32 ciphertext_len;
68 struct ablkcipher_request *req = NULL; 66 struct skcipher_request *req = NULL;
69 DECLARE_EXT4_COMPLETION_RESULT(ecr); 67 DECLARE_EXT4_COMPLETION_RESULT(ecr);
70 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 68 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
71 struct crypto_ablkcipher *tfm = ci->ci_ctfm; 69 struct crypto_skcipher *tfm = ci->ci_ctfm;
72 int res = 0; 70 int res = 0;
73 char iv[EXT4_CRYPTO_BLOCK_SIZE]; 71 char iv[EXT4_CRYPTO_BLOCK_SIZE];
74 struct scatterlist src_sg, dst_sg; 72 struct scatterlist src_sg, dst_sg;
@@ -95,14 +93,14 @@ static int ext4_fname_encrypt(struct inode *inode,
95 } 93 }
96 94
97 /* Allocate request */ 95 /* Allocate request */
98 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 96 req = skcipher_request_alloc(tfm, GFP_NOFS);
99 if (!req) { 97 if (!req) {
100 printk_ratelimited( 98 printk_ratelimited(
101 KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); 99 KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
102 kfree(alloc_buf); 100 kfree(alloc_buf);
103 return -ENOMEM; 101 return -ENOMEM;
104 } 102 }
105 ablkcipher_request_set_callback(req, 103 skcipher_request_set_callback(req,
106 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 104 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
107 ext4_dir_crypt_complete, &ecr); 105 ext4_dir_crypt_complete, &ecr);
108 106
@@ -117,14 +115,14 @@ static int ext4_fname_encrypt(struct inode *inode,
117 /* Create encryption request */ 115 /* Create encryption request */
118 sg_init_one(&src_sg, workbuf, ciphertext_len); 116 sg_init_one(&src_sg, workbuf, ciphertext_len);
119 sg_init_one(&dst_sg, oname->name, ciphertext_len); 117 sg_init_one(&dst_sg, oname->name, ciphertext_len);
120 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); 118 skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
121 res = crypto_ablkcipher_encrypt(req); 119 res = crypto_skcipher_encrypt(req);
122 if (res == -EINPROGRESS || res == -EBUSY) { 120 if (res == -EINPROGRESS || res == -EBUSY) {
123 wait_for_completion(&ecr.completion); 121 wait_for_completion(&ecr.completion);
124 res = ecr.res; 122 res = ecr.res;
125 } 123 }
126 kfree(alloc_buf); 124 kfree(alloc_buf);
127 ablkcipher_request_free(req); 125 skcipher_request_free(req);
128 if (res < 0) { 126 if (res < 0) {
129 printk_ratelimited( 127 printk_ratelimited(
130 KERN_ERR "%s: Error (error code %d)\n", __func__, res); 128 KERN_ERR "%s: Error (error code %d)\n", __func__, res);
@@ -145,11 +143,11 @@ static int ext4_fname_decrypt(struct inode *inode,
145 struct ext4_str *oname) 143 struct ext4_str *oname)
146{ 144{
147 struct ext4_str tmp_in[2], tmp_out[1]; 145 struct ext4_str tmp_in[2], tmp_out[1];
148 struct ablkcipher_request *req = NULL; 146 struct skcipher_request *req = NULL;
149 DECLARE_EXT4_COMPLETION_RESULT(ecr); 147 DECLARE_EXT4_COMPLETION_RESULT(ecr);
150 struct scatterlist src_sg, dst_sg; 148 struct scatterlist src_sg, dst_sg;
151 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 149 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
152 struct crypto_ablkcipher *tfm = ci->ci_ctfm; 150 struct crypto_skcipher *tfm = ci->ci_ctfm;
153 int res = 0; 151 int res = 0;
154 char iv[EXT4_CRYPTO_BLOCK_SIZE]; 152 char iv[EXT4_CRYPTO_BLOCK_SIZE];
155 unsigned lim = max_name_len(inode); 153 unsigned lim = max_name_len(inode);
@@ -162,13 +160,13 @@ static int ext4_fname_decrypt(struct inode *inode,
162 tmp_out[0].name = oname->name; 160 tmp_out[0].name = oname->name;
163 161
164 /* Allocate request */ 162 /* Allocate request */
165 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 163 req = skcipher_request_alloc(tfm, GFP_NOFS);
166 if (!req) { 164 if (!req) {
167 printk_ratelimited( 165 printk_ratelimited(
168 KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); 166 KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
169 return -ENOMEM; 167 return -ENOMEM;
170 } 168 }
171 ablkcipher_request_set_callback(req, 169 skcipher_request_set_callback(req,
172 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 170 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
173 ext4_dir_crypt_complete, &ecr); 171 ext4_dir_crypt_complete, &ecr);
174 172
@@ -178,13 +176,13 @@ static int ext4_fname_decrypt(struct inode *inode,
178 /* Create encryption request */ 176 /* Create encryption request */
179 sg_init_one(&src_sg, iname->name, iname->len); 177 sg_init_one(&src_sg, iname->name, iname->len);
180 sg_init_one(&dst_sg, oname->name, oname->len); 178 sg_init_one(&dst_sg, oname->name, oname->len);
181 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); 179 skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
182 res = crypto_ablkcipher_decrypt(req); 180 res = crypto_skcipher_decrypt(req);
183 if (res == -EINPROGRESS || res == -EBUSY) { 181 if (res == -EINPROGRESS || res == -EBUSY) {
184 wait_for_completion(&ecr.completion); 182 wait_for_completion(&ecr.completion);
185 res = ecr.res; 183 res = ecr.res;
186 } 184 }
187 ablkcipher_request_free(req); 185 skcipher_request_free(req);
188 if (res < 0) { 186 if (res < 0) {
189 printk_ratelimited( 187 printk_ratelimited(
190 KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n", 188 KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n",
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index 9a16d1e75a49..0129d688d1f7 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -8,6 +8,7 @@
8 * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 8 * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
9 */ 9 */
10 10
11#include <crypto/skcipher.h>
11#include <keys/encrypted-type.h> 12#include <keys/encrypted-type.h>
12#include <keys/user-type.h> 13#include <keys/user-type.h>
13#include <linux/random.h> 14#include <linux/random.h>
@@ -41,45 +42,42 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
41 char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) 42 char derived_key[EXT4_AES_256_XTS_KEY_SIZE])
42{ 43{
43 int res = 0; 44 int res = 0;
44 struct ablkcipher_request *req = NULL; 45 struct skcipher_request *req = NULL;
45 DECLARE_EXT4_COMPLETION_RESULT(ecr); 46 DECLARE_EXT4_COMPLETION_RESULT(ecr);
46 struct scatterlist src_sg, dst_sg; 47 struct scatterlist src_sg, dst_sg;
47 struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, 48 struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
48 0);
49 49
50 if (IS_ERR(tfm)) { 50 if (IS_ERR(tfm)) {
51 res = PTR_ERR(tfm); 51 res = PTR_ERR(tfm);
52 tfm = NULL; 52 tfm = NULL;
53 goto out; 53 goto out;
54 } 54 }
55 crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); 55 crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
56 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 56 req = skcipher_request_alloc(tfm, GFP_NOFS);
57 if (!req) { 57 if (!req) {
58 res = -ENOMEM; 58 res = -ENOMEM;
59 goto out; 59 goto out;
60 } 60 }
61 ablkcipher_request_set_callback(req, 61 skcipher_request_set_callback(req,
62 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 62 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
63 derive_crypt_complete, &ecr); 63 derive_crypt_complete, &ecr);
64 res = crypto_ablkcipher_setkey(tfm, deriving_key, 64 res = crypto_skcipher_setkey(tfm, deriving_key,
65 EXT4_AES_128_ECB_KEY_SIZE); 65 EXT4_AES_128_ECB_KEY_SIZE);
66 if (res < 0) 66 if (res < 0)
67 goto out; 67 goto out;
68 sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE); 68 sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE);
69 sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE); 69 sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE);
70 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, 70 skcipher_request_set_crypt(req, &src_sg, &dst_sg,
71 EXT4_AES_256_XTS_KEY_SIZE, NULL); 71 EXT4_AES_256_XTS_KEY_SIZE, NULL);
72 res = crypto_ablkcipher_encrypt(req); 72 res = crypto_skcipher_encrypt(req);
73 if (res == -EINPROGRESS || res == -EBUSY) { 73 if (res == -EINPROGRESS || res == -EBUSY) {
74 wait_for_completion(&ecr.completion); 74 wait_for_completion(&ecr.completion);
75 res = ecr.res; 75 res = ecr.res;
76 } 76 }
77 77
78out: 78out:
79 if (req) 79 skcipher_request_free(req);
80 ablkcipher_request_free(req); 80 crypto_free_skcipher(tfm);
81 if (tfm)
82 crypto_free_ablkcipher(tfm);
83 return res; 81 return res;
84} 82}
85 83
@@ -90,7 +88,7 @@ void ext4_free_crypt_info(struct ext4_crypt_info *ci)
90 88
91 if (ci->ci_keyring_key) 89 if (ci->ci_keyring_key)
92 key_put(ci->ci_keyring_key); 90 key_put(ci->ci_keyring_key);
93 crypto_free_ablkcipher(ci->ci_ctfm); 91 crypto_free_skcipher(ci->ci_ctfm);
94 kmem_cache_free(ext4_crypt_info_cachep, ci); 92 kmem_cache_free(ext4_crypt_info_cachep, ci);
95} 93}
96 94
@@ -122,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode)
122 struct ext4_encryption_context ctx; 120 struct ext4_encryption_context ctx;
123 const struct user_key_payload *ukp; 121 const struct user_key_payload *ukp;
124 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 122 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
125 struct crypto_ablkcipher *ctfm; 123 struct crypto_skcipher *ctfm;
126 const char *cipher_str; 124 const char *cipher_str;
127 char raw_key[EXT4_MAX_KEY_SIZE]; 125 char raw_key[EXT4_MAX_KEY_SIZE];
128 char mode; 126 char mode;
@@ -237,7 +235,7 @@ retry:
237 if (res) 235 if (res)
238 goto out; 236 goto out;
239got_key: 237got_key:
240 ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); 238 ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
241 if (!ctfm || IS_ERR(ctfm)) { 239 if (!ctfm || IS_ERR(ctfm)) {
242 res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; 240 res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
243 printk(KERN_DEBUG 241 printk(KERN_DEBUG
@@ -246,11 +244,11 @@ got_key:
246 goto out; 244 goto out;
247 } 245 }
248 crypt_info->ci_ctfm = ctfm; 246 crypt_info->ci_ctfm = ctfm;
249 crypto_ablkcipher_clear_flags(ctfm, ~0); 247 crypto_skcipher_clear_flags(ctfm, ~0);
250 crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), 248 crypto_tfm_set_flags(crypto_skcipher_tfm(ctfm),
251 CRYPTO_TFM_REQ_WEAK_KEY); 249 CRYPTO_TFM_REQ_WEAK_KEY);
252 res = crypto_ablkcipher_setkey(ctfm, raw_key, 250 res = crypto_skcipher_setkey(ctfm, raw_key,
253 ext4_encryption_key_size(mode)); 251 ext4_encryption_key_size(mode));
254 if (res) 252 if (res)
255 goto out; 253 goto out;
256 memzero_explicit(raw_key, sizeof(raw_key)); 254 memzero_explicit(raw_key, sizeof(raw_key));
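The conversion above is mechanical: every ablkcipher name becomes its skcipher counterpart while the request flow stays the same. For reference, a minimal sketch of that flow (hypothetical helper name; the real code above additionally waits on a completion when the cipher returns -EINPROGRESS or -EBUSY):

	#include <crypto/skcipher.h>
	#include <linux/scatterlist.h>

	/* Sketch: one ECB-AES pass over a buffer using the skcipher API. */
	static int example_skcipher_encrypt(const u8 *key, unsigned int keylen,
					    const u8 *src, u8 *dst,
					    unsigned int len)
	{
		struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
		struct skcipher_request *req = NULL;
		struct scatterlist src_sg, dst_sg;
		int res;

		if (IS_ERR(tfm))
			return PTR_ERR(tfm);
		req = skcipher_request_alloc(tfm, GFP_NOFS);
		if (!req) {
			res = -ENOMEM;
			goto out;
		}
		res = crypto_skcipher_setkey(tfm, key, keylen);
		if (res)
			goto out;
		sg_init_one(&src_sg, src, len);
		sg_init_one(&dst_sg, dst, len);
		skcipher_request_set_crypt(req, &src_sg, &dst_sg, len, NULL);
		res = crypto_skcipher_encrypt(req);	/* may return -EINPROGRESS */
	out:
		/* Both free helpers accept NULL, which is why the patch can
		 * drop the old "if (req)" / "if (tfm)" guards. */
		skcipher_request_free(req);
		crypto_free_skcipher(tfm);
		return res;
	}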
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 33f5e2a50cf8..50ba27cbed03 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -285,7 +285,7 @@ errout:
285static inline int is_32bit_api(void) 285static inline int is_32bit_api(void)
286{ 286{
287#ifdef CONFIG_COMPAT 287#ifdef CONFIG_COMPAT
288 return is_compat_task(); 288 return in_compat_syscall();
289#else 289#else
290 return (BITS_PER_LONG == 32); 290 return (BITS_PER_LONG == 32);
291#endif 291#endif
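The one-line dir.c change is subtle: is_compat_task() asks whether the current task is 32-bit, whereas in_compat_syscall() asks whether the current system call entered through the 32-bit path. A sketch of why that matters for this caller (illustrative scenario; exact behaviour varies by architecture):

	/* A 64-bit task can still issue a 32-bit getdents() and then needs
	 * 32-bit f_pos/hash semantics from ext4's readdir code:
	 *
	 *   is_compat_task()    - "is the task 32-bit?"     -> may say no
	 *   in_compat_syscall() - "is this syscall 32-bit?" -> says yes
	 *
	 * is_32bit_api() is really asking the second question. */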
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 157b458a69d4..c04743519865 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -42,6 +42,18 @@
42 */ 42 */
43 43
44/* 44/*
45 * with AGGRESSIVE_CHECK the allocator runs consistency checks over
46 * structures. These checks slow things down a lot.
47 */
48#define AGGRESSIVE_CHECK__
49
50/*
51 * with DOUBLE_CHECK defined mballoc creates persistent in-core
52 * bitmaps, maintains and uses them to check for double allocations
53 */
54#define DOUBLE_CHECK__
55
56/*
45 * Define EXT4FS_DEBUG to produce debug messages 57 * Define EXT4FS_DEBUG to produce debug messages
46 */ 58 */
47#undef EXT4FS_DEBUG 59#undef EXT4FS_DEBUG
@@ -182,9 +194,9 @@ typedef struct ext4_io_end {
182 struct bio *bio; /* Linked list of completed 194 struct bio *bio; /* Linked list of completed
183 * bios covering the extent */ 195 * bios covering the extent */
184 unsigned int flag; /* unwritten or not */ 196 unsigned int flag; /* unwritten or not */
197 atomic_t count; /* reference counter */
185 loff_t offset; /* offset in the file */ 198 loff_t offset; /* offset in the file */
186 ssize_t size; /* size of the extent */ 199 ssize_t size; /* size of the extent */
187 atomic_t count; /* reference counter */
188} ext4_io_end_t; 200} ext4_io_end_t;
189 201
190struct ext4_io_submit { 202struct ext4_io_submit {
@@ -1024,13 +1036,8 @@ struct ext4_inode_info {
1024 * transaction reserved 1036 * transaction reserved
1025 */ 1037 */
1026 struct list_head i_rsv_conversion_list; 1038 struct list_head i_rsv_conversion_list;
1027 /*
1028 * Completed IOs that need unwritten extents handling and don't have
1029 * transaction reserved
1030 */
1031 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
1032 atomic_t i_unwritten; /* Nr. of inflight conversions pending */
1033 struct work_struct i_rsv_conversion_work; 1039 struct work_struct i_rsv_conversion_work;
1040 atomic_t i_unwritten; /* Nr. of inflight conversions pending */
1034 1041
1035 spinlock_t i_block_reservation_lock; 1042 spinlock_t i_block_reservation_lock;
1036 1043
@@ -1504,25 +1511,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
1504 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); 1511 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
1505} 1512}
1506 1513
1507static inline void ext4_set_io_unwritten_flag(struct inode *inode,
1508 struct ext4_io_end *io_end)
1509{
1510 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
1511 io_end->flag |= EXT4_IO_END_UNWRITTEN;
1512 atomic_inc(&EXT4_I(inode)->i_unwritten);
1513 }
1514}
1515
1516static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
1517{
1518 return inode->i_private;
1519}
1520
1521static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
1522{
1523 inode->i_private = io;
1524}
1525
1526/* 1514/*
1527 * Inode dynamic state flags 1515 * Inode dynamic state flags
1528 */ 1516 */
@@ -2506,12 +2494,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
2506int ext4_inode_is_fast_symlink(struct inode *inode); 2494int ext4_inode_is_fast_symlink(struct inode *inode);
2507struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); 2495struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
2508struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); 2496struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
2509int ext4_get_block_write(struct inode *inode, sector_t iblock, 2497int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
2510 struct buffer_head *bh_result, int create); 2498 struct buffer_head *bh_result, int create);
2511int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 2499int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
2512 struct buffer_head *bh_result, int create); 2500 struct buffer_head *bh_result, int create);
2513int ext4_get_block(struct inode *inode, sector_t iblock, 2501int ext4_get_block(struct inode *inode, sector_t iblock,
2514 struct buffer_head *bh_result, int create); 2502 struct buffer_head *bh_result, int create);
2503int ext4_dio_get_block(struct inode *inode, sector_t iblock,
2504 struct buffer_head *bh_result, int create);
2515int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2505int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2516 struct buffer_head *bh, int create); 2506 struct buffer_head *bh, int create);
2517int ext4_walk_page_buffers(handle_t *handle, 2507int ext4_walk_page_buffers(handle_t *handle,
@@ -2559,6 +2549,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
2559 int used, int quota_claim); 2549 int used, int quota_claim);
2560extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, 2550extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
2561 ext4_fsblk_t pblk, ext4_lblk_t len); 2551 ext4_fsblk_t pblk, ext4_lblk_t len);
2552extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
2553 unsigned int map_len,
2554 struct extent_status *result);
2562 2555
2563/* indirect.c */ 2556/* indirect.c */
2564extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 2557extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -3285,15 +3278,33 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
3285#define EXT4_WQ_HASH_SZ 37 3278#define EXT4_WQ_HASH_SZ 37
3286#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ 3279#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
3287 EXT4_WQ_HASH_SZ]) 3280 EXT4_WQ_HASH_SZ])
3288#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
3289 EXT4_WQ_HASH_SZ])
3290extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 3281extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
3291extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
3292 3282
3293#define EXT4_RESIZING 0 3283#define EXT4_RESIZING 0
3294extern int ext4_resize_begin(struct super_block *sb); 3284extern int ext4_resize_begin(struct super_block *sb);
3295extern void ext4_resize_end(struct super_block *sb); 3285extern void ext4_resize_end(struct super_block *sb);
3296 3286
3287static inline void ext4_set_io_unwritten_flag(struct inode *inode,
3288 struct ext4_io_end *io_end)
3289{
3290 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3291 io_end->flag |= EXT4_IO_END_UNWRITTEN;
3292 atomic_inc(&EXT4_I(inode)->i_unwritten);
3293 }
3294}
3295
3296static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
3297{
3298 struct inode *inode = io_end->inode;
3299
3300 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
3301 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
3302 /* Wake up anyone waiting on unwritten extent conversion */
3303 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
3304 wake_up_all(ext4_ioend_wq(inode));
3305 }
3306}
3307
3297#endif /* __KERNEL__ */ 3308#endif /* __KERNEL__ */
3298 3309
3299#define EFSBADCRC EBADMSG /* Bad CRC detected */ 3310#define EFSBADCRC EBADMSG /* Bad CRC detected */
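The two helpers moved into ext4.h form a counter/waitqueue pair: the setter bumps i_unwritten the first time an io_end is flagged for unwritten-extent conversion, and the new ext4_clear_io_unwritten_flag() wakes sleepers once the count drains to zero. A waiter, modelled on ext4_unwritten_wait() as used by the file.c hunk further below (a sketch, not the verbatim kernel function):

	static void example_unwritten_wait(struct inode *inode)
	{
		wait_queue_head_t *wq = ext4_ioend_wq(inode);

		/* Sleep until every in-flight unwritten-extent conversion
		 * for this inode has gone through
		 * ext4_clear_io_unwritten_flag(). */
		wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
	}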
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
index ac7d4e813796..1f73c29717e1 100644
--- a/fs/ext4/ext4_crypto.h
+++ b/fs/ext4/ext4_crypto.h
@@ -77,7 +77,7 @@ struct ext4_crypt_info {
77 char ci_data_mode; 77 char ci_data_mode;
78 char ci_filename_mode; 78 char ci_filename_mode;
79 char ci_flags; 79 char ci_flags;
80 struct crypto_ablkcipher *ci_ctfm; 80 struct crypto_skcipher *ci_ctfm;
81 struct key *ci_keyring_key; 81 struct key *ci_keyring_key;
82 char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; 82 char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE];
83}; 83};
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 3c9381547094..8ecf84b8f5a1 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -11,7 +11,7 @@
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public Licens 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 */ 17 */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3753ceb0b0dd..95bf4679ac54 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -15,7 +15,7 @@
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details. 16 * GNU General Public License for more details.
17 * 17 *
18 * You should have received a copy of the GNU General Public Licens 18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software 19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
21 */ 21 */
@@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1736 */ 1736 */
1737 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1737 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1738 return 0; 1738 return 0;
1739 /*
1740 * The check for IO to unwritten extent is somewhat racy as we
1741 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
1742 * dropping i_data_sem. But reserved blocks should save us in that
1743 * case.
1744 */
1739 if (ext4_ext_is_unwritten(ex1) && 1745 if (ext4_ext_is_unwritten(ex1) &&
1740 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || 1746 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1741 atomic_read(&EXT4_I(inode)->i_unwritten) || 1747 atomic_read(&EXT4_I(inode)->i_unwritten) ||
@@ -2293,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
2293} 2299}
2294 2300
2295/* 2301/*
2296 * ext4_ext_put_gap_in_cache: 2302 * ext4_ext_determine_hole - determine hole around given block
2297 * calculate boundaries of the gap that the requested block fits into 2303 * @inode: inode we lookup in
2298 * and cache this gap 2304 * @path: path in extent tree to @lblk
2305 * @lblk: pointer to logical block around which we want to determine hole
2306 *
2307 * Determine hole length (and start if easily possible) around given logical
2308 * block. We don't try too hard to find the beginning of the hole, but when
2309 * @path already points to the extent just before @lblk, we provide it.
2310 *
2311 * The function returns the length of a hole starting at @lblk. We update @lblk
2312 * to the beginning of the hole if we managed to find it.
2299 */ 2313 */
2300static void 2314static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2301ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, 2315 struct ext4_ext_path *path,
2302 ext4_lblk_t block) 2316 ext4_lblk_t *lblk)
2303{ 2317{
2304 int depth = ext_depth(inode); 2318 int depth = ext_depth(inode);
2305 ext4_lblk_t len;
2306 ext4_lblk_t lblock;
2307 struct ext4_extent *ex; 2319 struct ext4_extent *ex;
2308 struct extent_status es; 2320 ext4_lblk_t len;
2309 2321
2310 ex = path[depth].p_ext; 2322 ex = path[depth].p_ext;
2311 if (ex == NULL) { 2323 if (ex == NULL) {
2312 /* there is no extent yet, so gap is [0;-] */ 2324 /* there is no extent yet, so gap is [0;-] */
2313 lblock = 0; 2325 *lblk = 0;
2314 len = EXT_MAX_BLOCKS; 2326 len = EXT_MAX_BLOCKS;
2315 ext_debug("cache gap(whole file):"); 2327 } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2316 } else if (block < le32_to_cpu(ex->ee_block)) { 2328 len = le32_to_cpu(ex->ee_block) - *lblk;
2317 lblock = block; 2329 } else if (*lblk >= le32_to_cpu(ex->ee_block)
2318 len = le32_to_cpu(ex->ee_block) - block;
2319 ext_debug("cache gap(before): %u [%u:%u]",
2320 block,
2321 le32_to_cpu(ex->ee_block),
2322 ext4_ext_get_actual_len(ex));
2323 } else if (block >= le32_to_cpu(ex->ee_block)
2324 + ext4_ext_get_actual_len(ex)) { 2330 + ext4_ext_get_actual_len(ex)) {
2325 ext4_lblk_t next; 2331 ext4_lblk_t next;
2326 lblock = le32_to_cpu(ex->ee_block)
2327 + ext4_ext_get_actual_len(ex);
2328 2332
2333 *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2329 next = ext4_ext_next_allocated_block(path); 2334 next = ext4_ext_next_allocated_block(path);
2330 ext_debug("cache gap(after): [%u:%u] %u", 2335 BUG_ON(next == *lblk);
2331 le32_to_cpu(ex->ee_block), 2336 len = next - *lblk;
2332 ext4_ext_get_actual_len(ex),
2333 block);
2334 BUG_ON(next == lblock);
2335 len = next - lblock;
2336 } else { 2337 } else {
2337 BUG(); 2338 BUG();
2338 } 2339 }
2340 return len;
2341}
2339 2342
2340 ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); 2343/*
2344 * ext4_ext_put_gap_in_cache:
2345 * calculate boundaries of the gap that the requested block fits into
2346 * and cache this gap
2347 */
2348static void
2349ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2350 ext4_lblk_t hole_len)
2351{
2352 struct extent_status es;
2353
2354 ext4_es_find_delayed_extent_range(inode, hole_start,
2355 hole_start + hole_len - 1, &es);
2341 if (es.es_len) { 2356 if (es.es_len) {
2342 /* There's delayed extent containing lblock? */ 2357 /* There's a delayed extent containing hole_start? */
2343 if (es.es_lblk <= lblock) 2358 if (es.es_lblk <= hole_start)
2344 return; 2359 return;
2345 len = min(es.es_lblk - lblock, len); 2360 hole_len = min(es.es_lblk - hole_start, hole_len);
2346 } 2361 }
2347 ext_debug(" -> %u:%u\n", lblock, len); 2362 ext_debug(" -> %u:%u\n", hole_start, hole_len);
2348 ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); 2363 ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2364 EXTENT_STATUS_HOLE);
2349} 2365}
2350 2366
2351/* 2367/*
@@ -3927,7 +3943,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3927static int 3943static int
3928convert_initialized_extent(handle_t *handle, struct inode *inode, 3944convert_initialized_extent(handle_t *handle, struct inode *inode,
3929 struct ext4_map_blocks *map, 3945 struct ext4_map_blocks *map,
3930 struct ext4_ext_path **ppath, int flags, 3946 struct ext4_ext_path **ppath,
3931 unsigned int allocated) 3947 unsigned int allocated)
3932{ 3948{
3933 struct ext4_ext_path *path = *ppath; 3949 struct ext4_ext_path *path = *ppath;
@@ -4007,7 +4023,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4007 struct ext4_ext_path *path = *ppath; 4023 struct ext4_ext_path *path = *ppath;
4008 int ret = 0; 4024 int ret = 0;
4009 int err = 0; 4025 int err = 0;
4010 ext4_io_end_t *io = ext4_inode_aio(inode);
4011 4026
4012 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " 4027 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4013 "block %llu, max_blocks %u, flags %x, allocated %u\n", 4028 "block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -4030,15 +4045,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4030 flags | EXT4_GET_BLOCKS_CONVERT); 4045 flags | EXT4_GET_BLOCKS_CONVERT);
4031 if (ret <= 0) 4046 if (ret <= 0)
4032 goto out; 4047 goto out;
4033 /*
4034 * Flag the inode(non aio case) or end_io struct (aio case)
4035 * that this IO needs to conversion to written when IO is
4036 * completed
4037 */
4038 if (io)
4039 ext4_set_io_unwritten_flag(inode, io);
4040 else
4041 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
4042 map->m_flags |= EXT4_MAP_UNWRITTEN; 4048 map->m_flags |= EXT4_MAP_UNWRITTEN;
4043 goto out; 4049 goto out;
4044 } 4050 }
@@ -4283,9 +4289,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4283 unsigned int allocated = 0, offset = 0; 4289 unsigned int allocated = 0, offset = 0;
4284 unsigned int allocated_clusters = 0; 4290 unsigned int allocated_clusters = 0;
4285 struct ext4_allocation_request ar; 4291 struct ext4_allocation_request ar;
4286 ext4_io_end_t *io = ext4_inode_aio(inode);
4287 ext4_lblk_t cluster_offset; 4292 ext4_lblk_t cluster_offset;
4288 int set_unwritten = 0;
4289 bool map_from_cluster = false; 4293 bool map_from_cluster = false;
4290 4294
4291 ext_debug("blocks %u/%u requested for inode %lu\n", 4295 ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -4347,7 +4351,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4347 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { 4351 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4348 allocated = convert_initialized_extent( 4352 allocated = convert_initialized_extent(
4349 handle, inode, map, &path, 4353 handle, inode, map, &path,
4350 flags, allocated); 4354 allocated);
4351 goto out2; 4355 goto out2;
4352 } else if (!ext4_ext_is_unwritten(ex)) 4356 } else if (!ext4_ext_is_unwritten(ex))
4353 goto out; 4357 goto out;
@@ -4368,11 +4372,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4368 * we cannot create blocks if the create flag is zero 4372 * we cannot create blocks if the create flag is zero
4369 */ 4373 */
4370 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 4374 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4375 ext4_lblk_t hole_start, hole_len;
4376
4377 hole_start = map->m_lblk;
4378 hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4371 /* 4379 /*
4372 * put just found gap into cache to speed up 4380 * put just found gap into cache to speed up
4373 * subsequent requests 4381 * subsequent requests
4374 */ 4382 */
4375 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 4383 ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4384
4385 /* Update hole_len to reflect hole size after map->m_lblk */
4386 if (hole_start != map->m_lblk)
4387 hole_len -= map->m_lblk - hole_start;
4388 map->m_pblk = 0;
4389 map->m_len = min_t(unsigned int, map->m_len, hole_len);
4390
4376 goto out2; 4391 goto out2;
4377 } 4392 }
4378 4393
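A worked example of the new hole handling, with hypothetical numbers: suppose one extent covers logical blocks [0, 10), the next allocated block is 100, and a create == 0 lookup asks for m_lblk = 37 with m_len = 64.

	ext4_lblk_t hole_start = 37;	/* map->m_lblk */
	ext4_lblk_t hole_len;

	hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
	/* hole_start is pulled back to 10 (end of the preceding extent),
	 * hole_len = 100 - 10 = 90, i.e. the whole gap [10, 100) */
	ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
	/* the full gap is cached, so later lookups inside it are cheap */
	hole_len -= 37 - hole_start;	/* 90 - 27 = 63 blocks left at m_lblk */
	map->m_pblk = 0;
	map->m_len = min_t(unsigned int, 64, hole_len);	/* report a 63-block hole */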
@@ -4482,15 +4497,6 @@ got_allocated_blocks:
4482 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ 4497 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
4483 ext4_ext_mark_unwritten(&newex); 4498 ext4_ext_mark_unwritten(&newex);
4484 map->m_flags |= EXT4_MAP_UNWRITTEN; 4499 map->m_flags |= EXT4_MAP_UNWRITTEN;
4485 /*
4486 * io_end structure was created for every IO write to an
4487 * unwritten extent. To avoid unnecessary conversion,
4488 * here we flag the IO that really needs the conversion.
4489 * For non asycn direct IO case, flag the inode state
4490 * that we need to perform conversion when IO is done.
4491 */
4492 if (flags & EXT4_GET_BLOCKS_PRE_IO)
4493 set_unwritten = 1;
4494 } 4500 }
4495 4501
4496 err = 0; 4502 err = 0;
@@ -4501,14 +4507,6 @@ got_allocated_blocks:
4501 err = ext4_ext_insert_extent(handle, inode, &path, 4507 err = ext4_ext_insert_extent(handle, inode, &path,
4502 &newex, flags); 4508 &newex, flags);
4503 4509
4504 if (!err && set_unwritten) {
4505 if (io)
4506 ext4_set_io_unwritten_flag(inode, io);
4507 else
4508 ext4_set_inode_state(inode,
4509 EXT4_STATE_DIO_UNWRITTEN);
4510 }
4511
4512 if (err && free_on_err) { 4510 if (err && free_on_err) {
4513 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? 4511 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4514 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; 4512 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ac748b3af1c1..e38b987ac7f5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -823,8 +823,8 @@ out:
823 es->es_lblk = es1->es_lblk; 823 es->es_lblk = es1->es_lblk;
824 es->es_len = es1->es_len; 824 es->es_len = es1->es_len;
825 es->es_pblk = es1->es_pblk; 825 es->es_pblk = es1->es_pblk;
826 if (!ext4_es_is_referenced(es)) 826 if (!ext4_es_is_referenced(es1))
827 ext4_es_set_referenced(es); 827 ext4_es_set_referenced(es1);
828 stats->es_stats_cache_hits++; 828 stats->es_stats_cache_hits++;
829 } else { 829 } else {
830 stats->es_stats_cache_misses++; 830 stats->es_stats_cache_misses++;
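The fix above looks cosmetic but is not: es is a stack copy handed back to the caller, while es1 is the node that actually lives in the extent-status tree. An illustration of the difference (sketch only):

	struct extent_status es;	/* caller-owned copy */

	es = *es1;			/* what the three assignments amount to */
	ext4_es_set_referenced(&es);	/* old code: bit set on the copy only;
					 * the tree node stays unmarked and the
					 * shrinker may still reclaim it */
	ext4_es_set_referenced(es1);	/* fixed code: mark the cached node */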
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4cd318f31cbe..6659e216385e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
93{ 93{
94 struct file *file = iocb->ki_filp; 94 struct file *file = iocb->ki_filp;
95 struct inode *inode = file_inode(iocb->ki_filp); 95 struct inode *inode = file_inode(iocb->ki_filp);
96 struct mutex *aio_mutex = NULL;
97 struct blk_plug plug; 96 struct blk_plug plug;
98 int o_direct = iocb->ki_flags & IOCB_DIRECT; 97 int o_direct = iocb->ki_flags & IOCB_DIRECT;
98 int unaligned_aio = 0;
99 int overwrite = 0; 99 int overwrite = 0;
100 ssize_t ret; 100 ssize_t ret;
101 101
102 inode_lock(inode);
103 ret = generic_write_checks(iocb, from);
104 if (ret <= 0)
105 goto out;
106
102 /* 107 /*
103 * Unaligned direct AIO must be serialized; see comment above 108 * Unaligned direct AIO writes must be serialized against each other, as
104 * In the case of O_APPEND, assume that we must always serialize 109 * zeroing of partial blocks by two competing unaligned AIOs can result
110 * in data corruption.
105 */ 111 */
106 if (o_direct && 112 if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
107 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
108 !is_sync_kiocb(iocb) && 113 !is_sync_kiocb(iocb) &&
109 (iocb->ki_flags & IOCB_APPEND || 114 ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
110 ext4_unaligned_aio(inode, from, iocb->ki_pos))) { 115 unaligned_aio = 1;
111 aio_mutex = ext4_aio_mutex(inode);
112 mutex_lock(aio_mutex);
113 ext4_unwritten_wait(inode); 116 ext4_unwritten_wait(inode);
114 } 117 }
115 118
116 inode_lock(inode);
117 ret = generic_write_checks(iocb, from);
118 if (ret <= 0)
119 goto out;
120
121 /* 119 /*
122 * If we have encountered a bitmap-format file, the size limit 120 * If we have encountered a bitmap-format file, the size limit
123 * is smaller than s_maxbytes, which is for extent-mapped files. 121 * is smaller than s_maxbytes, which is for extent-mapped files.
@@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
139 blk_start_plug(&plug); 137 blk_start_plug(&plug);
140 138
141 /* check whether we do a DIO overwrite or not */ 139 /* check whether we do a DIO overwrite or not */
142 if (ext4_should_dioread_nolock(inode) && !aio_mutex && 140 if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
143 !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { 141 !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
144 struct ext4_map_blocks map; 142 struct ext4_map_blocks map;
145 unsigned int blkbits = inode->i_blkbits; 143 unsigned int blkbits = inode->i_blkbits;
@@ -181,14 +179,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
181 if (o_direct) 179 if (o_direct)
182 blk_finish_plug(&plug); 180 blk_finish_plug(&plug);
183 181
184 if (aio_mutex)
185 mutex_unlock(aio_mutex);
186 return ret; 182 return ret;
187 183
188out: 184out:
189 inode_unlock(inode); 185 inode_unlock(inode);
190 if (aio_mutex)
191 mutex_unlock(aio_mutex);
192 return ret; 186 return ret;
193} 187}
194 188
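For reference, the unaligned_aio flag set above comes from a block-alignment test on the write position and the iovecs; roughly (paraphrased from ext4_unaligned_aio(), not verbatim):

	static int example_unaligned_aio(struct inode *inode,
					 struct iov_iter *from, loff_t pos)
	{
		int blockmask = inode->i_sb->s_blocksize - 1;

		/* positions at or past i_size are not treated as unaligned */
		if (pos >= i_size_read(inode))
			return 0;

		/* unaligned if the file position or any iovec segment is not
		 * a multiple of the block size */
		return ((pos | iov_iter_alignment(from)) & blockmask) ? 1 : 0;
	}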
@@ -417,7 +411,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
417 */ 411 */
418static int ext4_find_unwritten_pgoff(struct inode *inode, 412static int ext4_find_unwritten_pgoff(struct inode *inode,
419 int whence, 413 int whence,
420 struct ext4_map_blocks *map, 414 ext4_lblk_t end_blk,
421 loff_t *offset) 415 loff_t *offset)
422{ 416{
423 struct pagevec pvec; 417 struct pagevec pvec;
@@ -432,7 +426,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
432 blkbits = inode->i_sb->s_blocksize_bits; 426 blkbits = inode->i_sb->s_blocksize_bits;
433 startoff = *offset; 427 startoff = *offset;
434 lastoff = startoff; 428 lastoff = startoff;
435 endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; 429 endoff = (loff_t)end_blk << blkbits;
436 430
437 index = startoff >> PAGE_CACHE_SHIFT; 431 index = startoff >> PAGE_CACHE_SHIFT;
438 end = endoff >> PAGE_CACHE_SHIFT; 432 end = endoff >> PAGE_CACHE_SHIFT;
@@ -550,12 +544,11 @@ out:
550static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 544static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
551{ 545{
552 struct inode *inode = file->f_mapping->host; 546 struct inode *inode = file->f_mapping->host;
553 struct ext4_map_blocks map;
554 struct extent_status es; 547 struct extent_status es;
555 ext4_lblk_t start, last, end; 548 ext4_lblk_t start, last, end;
556 loff_t dataoff, isize; 549 loff_t dataoff, isize;
557 int blkbits; 550 int blkbits;
558 int ret = 0; 551 int ret;
559 552
560 inode_lock(inode); 553 inode_lock(inode);
561 554
@@ -572,41 +565,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
572 dataoff = offset; 565 dataoff = offset;
573 566
574 do { 567 do {
575 map.m_lblk = last; 568 ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
576 map.m_len = end - last + 1; 569 if (ret <= 0) {
577 ret = ext4_map_blocks(NULL, inode, &map, 0); 570 /* No extent found -> no data */
578 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 571 if (ret == 0)
579 if (last != start) 572 ret = -ENXIO;
580 dataoff = (loff_t)last << blkbits; 573 inode_unlock(inode);
581 break; 574 return ret;
582 } 575 }
583 576
584 /* 577 last = es.es_lblk;
585 * If there is a delay extent at this offset, 578 if (last != start)
586 * it will be as a data. 579 dataoff = (loff_t)last << blkbits;
587 */ 580 if (!ext4_es_is_unwritten(&es))
588 ext4_es_find_delayed_extent_range(inode, last, last, &es);
589 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
590 if (last != start)
591 dataoff = (loff_t)last << blkbits;
592 break; 581 break;
593 }
594 582
595 /* 583 /*
596 * If there is an unwritten extent at this offset, 584 * If there is an unwritten extent at this offset,
597 * it is treated as data or as a hole depending on 585 * it is treated as data or as a hole depending on
598 * whether the page cache holds data for it. 586 * whether the page cache holds data for it.
599 */ 587 */
600 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 588 if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
601 int unwritten; 589 es.es_lblk + es.es_len, &dataoff))
602 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, 590 break;
603 &map, &dataoff); 591 last += es.es_len;
604 if (unwritten)
605 break;
606 }
607
608 last++;
609 dataoff = (loff_t)last << blkbits; 592 dataoff = (loff_t)last << blkbits;
593 cond_resched();
610 } while (last <= end); 594 } while (last <= end);
611 595
612 inode_unlock(inode); 596 inode_unlock(inode);
@@ -623,12 +607,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
623static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 607static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
624{ 608{
625 struct inode *inode = file->f_mapping->host; 609 struct inode *inode = file->f_mapping->host;
626 struct ext4_map_blocks map;
627 struct extent_status es; 610 struct extent_status es;
628 ext4_lblk_t start, last, end; 611 ext4_lblk_t start, last, end;
629 loff_t holeoff, isize; 612 loff_t holeoff, isize;
630 int blkbits; 613 int blkbits;
631 int ret = 0; 614 int ret;
632 615
633 inode_lock(inode); 616 inode_lock(inode);
634 617
@@ -645,44 +628,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
645 holeoff = offset; 628 holeoff = offset;
646 629
647 do { 630 do {
648 map.m_lblk = last; 631 ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
649 map.m_len = end - last + 1; 632 if (ret < 0) {
650 ret = ext4_map_blocks(NULL, inode, &map, 0); 633 inode_unlock(inode);
651 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 634 return ret;
652 last += ret;
653 holeoff = (loff_t)last << blkbits;
654 continue;
655 } 635 }
656 636 /* Found a hole? */
657 /* 637 if (ret == 0 || es.es_lblk > last) {
658 * If there is a delay extent at this offset, 638 if (last != start)
659 * we will skip this extent. 639 holeoff = (loff_t)last << blkbits;
660 */ 640 break;
661 ext4_es_find_delayed_extent_range(inode, last, last, &es);
662 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
663 last = es.es_lblk + es.es_len;
664 holeoff = (loff_t)last << blkbits;
665 continue;
666 } 641 }
667
668 /* 642 /*
669 * If there is an unwritten extent at this offset, 643 * If there is an unwritten extent at this offset,
670 * it is treated as data or as a hole depending on 644 * it is treated as data or as a hole depending on
671 * whether the page cache holds data for it. 645 * whether the page cache holds data for it.
672 */ 646 */
673 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 647 if (ext4_es_is_unwritten(&es) &&
674 int unwritten; 648 ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
675 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, 649 last + es.es_len, &holeoff))
676 &map, &holeoff); 650 break;
677 if (!unwritten) {
678 last += ret;
679 holeoff = (loff_t)last << blkbits;
680 continue;
681 }
682 }
683 651
684 /* find a hole */ 652 last += es.es_len;
685 break; 653 holeoff = (loff_t)last << blkbits;
654 cond_resched();
686 } while (last <= end); 655 } while (last <= end);
687 656
688 inode_unlock(inode); 657 inode_unlock(inode);
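Seen from userspace, the two rewritten loops implement the usual lseek() contract; an illustrative caller (plain POSIX, nothing ext4-specific):

	#include <unistd.h>

	static void probe(int fd, off_t off)
	{
		/* -1/ENXIO if only a hole remains: the ret == 0 -> -ENXIO
		 * mapping in ext4_seek_data() above */
		off_t data = lseek(fd, off, SEEK_DATA);
		/* there is always a virtual hole at i_size */
		off_t hole = lseek(fd, off, SEEK_HOLE);

		/* An unwritten (fallocated) extent reads as a hole unless the
		 * page cache holds data for it - which is exactly what
		 * ext4_find_unwritten_pgoff() checks. */
		(void)data;
		(void)hole;
	}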
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index acc0ad56bf2f..237b877d316d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -787,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
787 sbi = EXT4_SB(sb); 787 sbi = EXT4_SB(sb);
788 788
789 /* 789 /*
790 * Initalize owners and quota early so that we don't have to account 790 * Initialize owners and quota early so that we don't have to account
791 * for quota initialization worst case in standard inode creating 791 * for quota initialization worst case in standard inode creating
792 * transaction 792 * transaction
793 */ 793 */
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 355ef9c36c87..3027fa681de5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
555 goto got_it; 555 goto got_it;
556 } 556 }
557 557
558 /* Next simple case - plain lookup or failed read of indirect block */ 558 /* Next simple case - plain lookup failed */
559 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 559 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
560 unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
561 int i;
562
563 /* Count number of blocks in a subtree under 'partial' */
564 count = 1;
565 for (i = 0; partial + i != chain + depth - 1; i++)
566 count *= epb;
567 /* Fill in size of a hole we found */
568 map->m_pblk = 0;
569 map->m_len = min_t(unsigned int, map->m_len, count);
570 goto cleanup;
571 }
572
573 /* Failed read of indirect block */
574 if (err == -EIO)
560 goto cleanup; 575 goto cleanup;
561 576
562 /* 577 /*
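The subtree arithmetic above grows geometrically with the depth at which the lookup stopped. With hypothetical 4 KiB blocks, each indirect block holds epb = 4096 / sizeof(u32) = 1024 pointers, so:

	/* 'partial' points at the first missing block in the chain; every
	 * level between it and the leaves multiplies the hole size by epb. */
	unsigned epb = 4096 / sizeof(u32);		/* 1024 */

	/* missing leaf block:		count = 1		block  */
	/* missing indirect block:	count = 1024		blocks */
	/* missing double indirect:	count = 1024 * 1024	blocks */
	/* map->m_len is then clamped to min(map->m_len, count).      */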
@@ -693,21 +708,21 @@ retry:
693 } 708 }
694 if (IS_DAX(inode)) 709 if (IS_DAX(inode))
695 ret = dax_do_io(iocb, inode, iter, offset, 710 ret = dax_do_io(iocb, inode, iter, offset,
696 ext4_get_block, NULL, 0); 711 ext4_dio_get_block, NULL, 0);
697 else 712 else
698 ret = __blockdev_direct_IO(iocb, inode, 713 ret = __blockdev_direct_IO(iocb, inode,
699 inode->i_sb->s_bdev, iter, 714 inode->i_sb->s_bdev, iter,
700 offset, ext4_get_block, NULL, 715 offset, ext4_dio_get_block,
701 NULL, 0); 716 NULL, NULL, 0);
702 inode_dio_end(inode); 717 inode_dio_end(inode);
703 } else { 718 } else {
704locked: 719locked:
705 if (IS_DAX(inode)) 720 if (IS_DAX(inode))
706 ret = dax_do_io(iocb, inode, iter, offset, 721 ret = dax_do_io(iocb, inode, iter, offset,
707 ext4_get_block, NULL, DIO_LOCKING); 722 ext4_dio_get_block, NULL, DIO_LOCKING);
708 else 723 else
709 ret = blockdev_direct_IO(iocb, inode, iter, offset, 724 ret = blockdev_direct_IO(iocb, inode, iter, offset,
710 ext4_get_block); 725 ext4_dio_get_block);
711 726
712 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { 727 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
713 loff_t isize = i_size_read(inode); 728 loff_t isize = i_size_read(inode);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index dfe3b9bafc0d..7cbdd3752ba5 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -581,9 +581,10 @@ retry:
581 if (ret) 581 if (ret)
582 goto out; 582 goto out;
583 583
584 if (ext4_should_dioread_nolock(inode)) 584 if (ext4_should_dioread_nolock(inode)) {
585 ret = __block_write_begin(page, from, to, ext4_get_block_write); 585 ret = __block_write_begin(page, from, to,
586 else 586 ext4_get_block_unwritten);
587 } else
587 ret = __block_write_begin(page, from, to, ext4_get_block); 588 ret = __block_write_begin(page, from, to, ext4_get_block);
588 589
589 if (!ret && ext4_should_journal_data(inode)) { 590 if (!ret && ext4_should_journal_data(inode)) {
@@ -1696,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle,
1696 if (err) 1697 if (err)
1697 goto out; 1698 goto out;
1698 1699
1699 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1700 err = ext4_mark_inode_dirty(handle, dir); 1700 err = ext4_mark_inode_dirty(handle, dir);
1701 if (unlikely(err)) 1701 if (unlikely(err))
1702 goto out; 1702 goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index aee960b1af34..dab84a2530ff 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode)
216 } 216 }
217 truncate_inode_pages_final(&inode->i_data); 217 truncate_inode_pages_final(&inode->i_data);
218 218
219 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
220 goto no_delete; 219 goto no_delete;
221 } 220 }
222 221
@@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode)
228 ext4_begin_ordered_truncate(inode, 0); 227 ext4_begin_ordered_truncate(inode, 0);
229 truncate_inode_pages_final(&inode->i_data); 228 truncate_inode_pages_final(&inode->i_data);
230 229
231 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
232
233 /* 230 /*
234 * Protect us against freezing - iput() caller didn't have to have any 231 * Protect us against freezing - iput() caller didn't have to have any
235 * protection against it 232 * protection against it
@@ -458,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
458 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping 455 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
459 * based files 456 * based files
460 * 457 *
461 * On success, it returns the number of blocks being mapped or allocated. 458 * On success, it returns the number of blocks being mapped or allocated. If
462 * if create==0 and the blocks are pre-allocated and unwritten block, 459 * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
463 * the result buffer head is unmapped. If the create ==1, it will make sure 460 * is marked as unwritten. If create == 1, it will mark @map as mapped.
464 * the buffer head is mapped.
465 * 461 *
466 * It returns 0 if plain look up failed (blocks have not been allocated), in 462 * It returns 0 if plain look up failed (blocks have not been allocated), in
467 * that case, buffer head is unmapped 463 * that case, @map is returned as unmapped but we still fill in map->m_len to
464 * indicate the length of a hole starting at map->m_lblk.
468 * 465 *
469 * It returns the error in case of allocation failure. 466 * It returns the error in case of allocation failure.
470 */ 467 */
@@ -507,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
507 retval = map->m_len; 504 retval = map->m_len;
508 map->m_len = retval; 505 map->m_len = retval;
509 } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { 506 } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
507 map->m_pblk = 0;
508 retval = es.es_len - (map->m_lblk - es.es_lblk);
509 if (retval > map->m_len)
510 retval = map->m_len;
511 map->m_len = retval;
510 retval = 0; 512 retval = 0;
511 } else { 513 } else {
512 BUG_ON(1); 514 BUG_ON(1);
@@ -714,16 +716,11 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
714 cmpxchg(&bh->b_state, old_state, new_state) != old_state)); 716 cmpxchg(&bh->b_state, old_state, new_state) != old_state));
715} 717}
716 718
717/* Maximum number of blocks we map for direct IO at once. */
718#define DIO_MAX_BLOCKS 4096
719
720static int _ext4_get_block(struct inode *inode, sector_t iblock, 719static int _ext4_get_block(struct inode *inode, sector_t iblock,
721 struct buffer_head *bh, int flags) 720 struct buffer_head *bh, int flags)
722{ 721{
723 handle_t *handle = ext4_journal_current_handle();
724 struct ext4_map_blocks map; 722 struct ext4_map_blocks map;
725 int ret = 0, started = 0; 723 int ret = 0;
726 int dio_credits;
727 724
728 if (ext4_has_inline_data(inode)) 725 if (ext4_has_inline_data(inode))
729 return -ERANGE; 726 return -ERANGE;
@@ -731,33 +728,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
731 map.m_lblk = iblock; 728 map.m_lblk = iblock;
732 map.m_len = bh->b_size >> inode->i_blkbits; 729 map.m_len = bh->b_size >> inode->i_blkbits;
733 730
734 if (flags && !handle) { 731 ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
735 /* Direct IO write... */ 732 flags);
736 if (map.m_len > DIO_MAX_BLOCKS)
737 map.m_len = DIO_MAX_BLOCKS;
738 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
739 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
740 dio_credits);
741 if (IS_ERR(handle)) {
742 ret = PTR_ERR(handle);
743 return ret;
744 }
745 started = 1;
746 }
747
748 ret = ext4_map_blocks(handle, inode, &map, flags);
749 if (ret > 0) { 733 if (ret > 0) {
750 ext4_io_end_t *io_end = ext4_inode_aio(inode);
751
752 map_bh(bh, inode->i_sb, map.m_pblk); 734 map_bh(bh, inode->i_sb, map.m_pblk);
753 ext4_update_bh_state(bh, map.m_flags); 735 ext4_update_bh_state(bh, map.m_flags);
754 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
755 set_buffer_defer_completion(bh);
756 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 736 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
757 ret = 0; 737 ret = 0;
758 } 738 }
759 if (started)
760 ext4_journal_stop(handle);
761 return ret; 739 return ret;
762} 740}
763 741
@@ -769,6 +747,155 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
769} 747}
770 748
771/* 749/*
750 * Get block function used when preparing for buffered write if we require
751 * creating an unwritten extent if blocks haven't been allocated. The extent
752 * will be converted to written after the IO is complete.
753 */
754int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
755 struct buffer_head *bh_result, int create)
756{
757 ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
758 inode->i_ino, create);
759 return _ext4_get_block(inode, iblock, bh_result,
760 EXT4_GET_BLOCKS_IO_CREATE_EXT);
761}
762
763/* Maximum number of blocks we map for direct IO at once. */
764#define DIO_MAX_BLOCKS 4096
765
766static handle_t *start_dio_trans(struct inode *inode,
767 struct buffer_head *bh_result)
768{
769 int dio_credits;
770
771 /* Trim mapping request to maximum we can map at once for DIO */
772 if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
773 bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
774 dio_credits = ext4_chunk_trans_blocks(inode,
775 bh_result->b_size >> inode->i_blkbits);
776 return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
777}
778
779/* Get block function for DIO reads and writes to inodes without extents */
780int ext4_dio_get_block(struct inode *inode, sector_t iblock,
781 struct buffer_head *bh, int create)
782{
783 handle_t *handle;
784 int ret;
785
786 /* We don't expect handle for direct IO */
787 WARN_ON_ONCE(ext4_journal_current_handle());
788
789 if (create) {
790 handle = start_dio_trans(inode, bh);
791 if (IS_ERR(handle))
792 return PTR_ERR(handle);
793 }
794 ret = _ext4_get_block(inode, iblock, bh,
795 create ? EXT4_GET_BLOCKS_CREATE : 0);
796 if (create)
797 ext4_journal_stop(handle);
798 return ret;
799}
800
801/*
802 * Get block function for AIO DIO writes when we create unwritten extent if
803 * blocks are not allocated yet. The extent will be converted to written
804 * after IO is complete.
805 */
806static int ext4_dio_get_block_unwritten_async(struct inode *inode,
807 sector_t iblock, struct buffer_head *bh_result, int create)
808{
809 handle_t *handle;
810 int ret;
811
812 /* We don't expect handle for direct IO */
813 WARN_ON_ONCE(ext4_journal_current_handle());
814
815 handle = start_dio_trans(inode, bh_result);
816 if (IS_ERR(handle))
817 return PTR_ERR(handle);
818 ret = _ext4_get_block(inode, iblock, bh_result,
819 EXT4_GET_BLOCKS_IO_CREATE_EXT);
820 ext4_journal_stop(handle);
821
822 /*
823 * When doing DIO using unwritten extents, we need io_end to convert
824 * unwritten extents to written on IO completion. We allocate the io_end
825 * once we spot an unwritten extent and store it in b_private. Generic
826 * DIO code keeps b_private set and furthermore passes the value to
827 * our completion callback in 'private' argument.
828 */
829 if (!ret && buffer_unwritten(bh_result)) {
830 if (!bh_result->b_private) {
831 ext4_io_end_t *io_end;
832
833 io_end = ext4_init_io_end(inode, GFP_KERNEL);
834 if (!io_end)
835 return -ENOMEM;
836 bh_result->b_private = io_end;
837 ext4_set_io_unwritten_flag(inode, io_end);
838 }
839 set_buffer_defer_completion(bh_result);
840 }
841
842 return ret;
843}
844
845/*
846 * Get block function for non-AIO DIO writes when we create unwritten extent if
847 * blocks are not allocated yet. The extent will be converted to written
848 * after IO is complete from ext4_ext_direct_IO() function.
849 */
850static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
851 sector_t iblock, struct buffer_head *bh_result, int create)
852{
853 handle_t *handle;
854 int ret;
855
856 /* We don't expect handle for direct IO */
857 WARN_ON_ONCE(ext4_journal_current_handle());
858
859 handle = start_dio_trans(inode, bh_result);
860 if (IS_ERR(handle))
861 return PTR_ERR(handle);
862 ret = _ext4_get_block(inode, iblock, bh_result,
863 EXT4_GET_BLOCKS_IO_CREATE_EXT);
864 ext4_journal_stop(handle);
865
866 /*
867 * Mark inode as having pending DIO writes to unwritten extents.
868 * ext4_ext_direct_IO() checks this flag and converts extents to
869 * written.
870 */
871 if (!ret && buffer_unwritten(bh_result))
872 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
873
874 return ret;
875}
876
877static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
878 struct buffer_head *bh_result, int create)
879{
880 int ret;
881
882 ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
883 inode->i_ino, create);
884 /* We don't expect handle for direct IO */
885 WARN_ON_ONCE(ext4_journal_current_handle());
886
887 ret = _ext4_get_block(inode, iblock, bh_result, 0);
888 /*
889 * Blocks should have been preallocated! ext4_file_write_iter() checks
890 * that.
891 */
892 WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
893
894 return ret;
895}
896
897
898/*
772 * `handle' can be NULL if create is zero 899 * `handle' can be NULL if create is zero
773 */ 900 */
774struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 901struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -1079,13 +1206,14 @@ retry_journal:
1079#ifdef CONFIG_EXT4_FS_ENCRYPTION 1206#ifdef CONFIG_EXT4_FS_ENCRYPTION
1080 if (ext4_should_dioread_nolock(inode)) 1207 if (ext4_should_dioread_nolock(inode))
1081 ret = ext4_block_write_begin(page, pos, len, 1208 ret = ext4_block_write_begin(page, pos, len,
1082 ext4_get_block_write); 1209 ext4_get_block_unwritten);
1083 else 1210 else
1084 ret = ext4_block_write_begin(page, pos, len, 1211 ret = ext4_block_write_begin(page, pos, len,
1085 ext4_get_block); 1212 ext4_get_block);
1086#else 1213#else
1087 if (ext4_should_dioread_nolock(inode)) 1214 if (ext4_should_dioread_nolock(inode))
1088 ret = __block_write_begin(page, pos, len, ext4_get_block_write); 1215 ret = __block_write_begin(page, pos, len,
1216 ext4_get_block_unwritten);
1089 else 1217 else
1090 ret = __block_write_begin(page, pos, len, ext4_get_block); 1218 ret = __block_write_begin(page, pos, len, ext4_get_block);
1091#endif 1219#endif
@@ -3088,37 +3216,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3088 return try_to_free_buffers(page); 3216 return try_to_free_buffers(page);
3089} 3217}
3090 3218
3091/*
3092 * ext4_get_block used when preparing for a DIO write or buffer write.
3093 * We allocate an uinitialized extent if blocks haven't been allocated.
3094 * The extent will be converted to initialized after the IO is complete.
3095 */
3096int ext4_get_block_write(struct inode *inode, sector_t iblock,
3097 struct buffer_head *bh_result, int create)
3098{
3099 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3100 inode->i_ino, create);
3101 return _ext4_get_block(inode, iblock, bh_result,
3102 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3103}
3104
3105static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
3106 struct buffer_head *bh_result, int create)
3107{
3108 int ret;
3109
3110 ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
3111 inode->i_ino, create);
3112 ret = _ext4_get_block(inode, iblock, bh_result, 0);
3113 /*
3114 * Blocks should have been preallocated! ext4_file_write_iter() checks
3115 * that.
3116 */
3117 WARN_ON_ONCE(!buffer_mapped(bh_result));
3118
3119 return ret;
3120}
3121
3122#ifdef CONFIG_FS_DAX 3219#ifdef CONFIG_FS_DAX
3123int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 3220int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
3124 struct buffer_head *bh_result, int create) 3221 struct buffer_head *bh_result, int create)
@@ -3179,13 +3276,12 @@ out:
3179 WARN_ON_ONCE(ret == 0 && create); 3276 WARN_ON_ONCE(ret == 0 && create);
3180 if (ret > 0) { 3277 if (ret > 0) {
3181 map_bh(bh_result, inode->i_sb, map.m_pblk); 3278 map_bh(bh_result, inode->i_sb, map.m_pblk);
3182 bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
3183 map.m_flags;
3184 /* 3279 /*
3185 * At least for now we have to clear BH_New so that DAX code 3280 * At least for now we have to clear BH_New so that DAX code
3186 * doesn't attempt to zero blocks again in a racy way. 3281 * doesn't attempt to zero blocks again in a racy way.
3187 */ 3282 */
3188 bh_result->b_state &= ~(1 << BH_New); 3283 map.m_flags &= ~EXT4_MAP_NEW;
3284 ext4_update_bh_state(bh_result, map.m_flags);
3189 bh_result->b_size = map.m_len << inode->i_blkbits; 3285 bh_result->b_size = map.m_len << inode->i_blkbits;
3190 ret = 0; 3286 ret = 0;
3191 } 3287 }
@@ -3193,24 +3289,32 @@ out:
3193} 3289}
3194#endif 3290#endif
3195 3291
3196static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3292static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3197 ssize_t size, void *private) 3293 ssize_t size, void *private)
3198{ 3294{
3199 ext4_io_end_t *io_end = iocb->private; 3295 ext4_io_end_t *io_end = private;
3200 3296
3201 /* if not async direct IO just return */ 3297 /* if not async direct IO just return */
3202 if (!io_end) 3298 if (!io_end)
3203 return; 3299 return 0;
3204 3300
3205 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3301 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3206 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3302 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
3207 iocb->private, io_end->inode->i_ino, iocb, offset, 3303 io_end, io_end->inode->i_ino, iocb, offset, size);
3208 size);
3209 3304
3210 iocb->private = NULL; 3305 /*
3306 * Error during AIO DIO. We cannot convert unwritten extents as the
3307 * data was not written. Just clear the unwritten flag and drop io_end.
3308 */
3309 if (size <= 0) {
3310 ext4_clear_io_unwritten_flag(io_end);
3311 size = 0;
3312 }
3211 io_end->offset = offset; 3313 io_end->offset = offset;
3212 io_end->size = size; 3314 io_end->size = size;
3213 ext4_put_io_end(io_end); 3315 ext4_put_io_end(io_end);
3316
3317 return 0;
3214} 3318}
3215 3319
3216/* 3320/*
@@ -3243,7 +3347,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3243 get_block_t *get_block_func = NULL; 3347 get_block_t *get_block_func = NULL;
3244 int dio_flags = 0; 3348 int dio_flags = 0;
3245 loff_t final_size = offset + count; 3349 loff_t final_size = offset + count;
3246 ext4_io_end_t *io_end = NULL;
3247 3350
3248 /* Use the old path for reads and writes beyond i_size. */ 3351 /* Use the old path for reads and writes beyond i_size. */
3249 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) 3352 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
@@ -3268,16 +3371,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3268 /* 3371 /*
3269 * We could direct write to holes and fallocate. 3372 * We could direct write to holes and fallocate.
3270 * 3373 *
3271 * Allocated blocks to fill the hole are marked as 3374 * Allocated blocks to fill the hole are marked as unwritten to prevent
3272 * unwritten to prevent parallel buffered read to expose 3375 * a parallel buffered read from exposing stale data before the DIO
3273 * the stale data before DIO complete the data IO. 3376 * completes the data IO.
3274 * 3377 *
3275 * As to previously fallocated extents, ext4 get_block will 3378 * As to previously fallocated extents, ext4 get_block will simply
3276 * just simply mark the buffer mapped but still keep the 3379 * mark the buffer mapped but still keep the extents unwritten.
3277 * extents unwritten.
3278 * 3380 *
3279 * For non AIO case, we will convert those unwritten extents 3381 * For non AIO case, we will convert those unwritten extents to written
3280 * to written after return back from blockdev_direct_IO. 3382 * after returning from blockdev_direct_IO. That way we save ourselves
3383 * from allocating an io_end structure and also the overhead of
3384 * offloading the extent conversion to a workqueue.
3281 * 3385 *
3282 * For async DIO, the conversion needs to be deferred when the 3386 * For async DIO, the conversion needs to be deferred when the
3283 * IO is completed. The ext4 end_io callback function will be 3387 * IO is completed. The ext4 end_io callback function will be
@@ -3285,30 +3389,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3285 * case, we allocate an io_end structure to hook to the iocb. 3389 * case, we allocate an io_end structure to hook to the iocb.
3286 */ 3390 */
3287 iocb->private = NULL; 3391 iocb->private = NULL;
3288 if (overwrite) { 3392 if (overwrite)
3289 get_block_func = ext4_get_block_overwrite; 3393 get_block_func = ext4_dio_get_block_overwrite;
3394 else if (is_sync_kiocb(iocb)) {
3395 get_block_func = ext4_dio_get_block_unwritten_sync;
3396 dio_flags = DIO_LOCKING;
3290 } else { 3397 } else {
3291 ext4_inode_aio_set(inode, NULL); 3398 get_block_func = ext4_dio_get_block_unwritten_async;
3292 if (!is_sync_kiocb(iocb)) {
3293 io_end = ext4_init_io_end(inode, GFP_NOFS);
3294 if (!io_end) {
3295 ret = -ENOMEM;
3296 goto retake_lock;
3297 }
3298 /*
3299 * Grab reference for DIO. Will be dropped in
3300 * ext4_end_io_dio()
3301 */
3302 iocb->private = ext4_get_io_end(io_end);
3303 /*
3304 * we save the io structure for current async direct
3305 * IO, so that later ext4_map_blocks() could flag the
3306 * io structure whether there is a unwritten extents
3307 * needs to be converted when IO is completed.
3308 */
3309 ext4_inode_aio_set(inode, io_end);
3310 }
3311 get_block_func = ext4_get_block_write;
3312 dio_flags = DIO_LOCKING; 3399 dio_flags = DIO_LOCKING;
3313 } 3400 }
3314#ifdef CONFIG_EXT4_FS_ENCRYPTION 3401#ifdef CONFIG_EXT4_FS_ENCRYPTION
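Restating the rewritten selection as a decision table (no new code, just the three cases from the hunk above spelled out):

	if (overwrite)				/* blocks preallocated and
						 * already written: no journal
						 * handle, no conversion */
		get_block_func = ext4_dio_get_block_overwrite;
	else if (is_sync_kiocb(iocb)) {		/* sync DIO: flag the inode,
						 * convert inline on return */
		get_block_func = ext4_dio_get_block_unwritten_sync;
		dio_flags = DIO_LOCKING;
	} else {				/* AIO DIO: io_end allocated
						 * lazily in get_block, dropped
						 * by ext4_end_io_dio() */
		get_block_func = ext4_dio_get_block_unwritten_async;
		dio_flags = DIO_LOCKING;
	}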
@@ -3323,27 +3410,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3323 get_block_func, 3410 get_block_func,
3324 ext4_end_io_dio, NULL, dio_flags); 3411 ext4_end_io_dio, NULL, dio_flags);
3325 3412
3326 /*
3327 * Put our reference to io_end. This can free the io_end structure e.g.
3328 * in sync IO case or in case of error. It can even perform extent
3329 * conversion if all bios we submitted finished before we got here.
3330 * Note that in that case iocb->private can be already set to NULL
3331 * here.
3332 */
3333 if (io_end) {
3334 ext4_inode_aio_set(inode, NULL);
3335 ext4_put_io_end(io_end);
3336 /*
3337 * When no IO was submitted ext4_end_io_dio() was not
3338 * called so we have to put iocb's reference.
3339 */
3340 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3341 WARN_ON(iocb->private != io_end);
3342 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3343 ext4_put_io_end(io_end);
3344 iocb->private = NULL;
3345 }
3346 }
3347 if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3413 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3348 EXT4_STATE_DIO_UNWRITTEN)) { 3414 EXT4_STATE_DIO_UNWRITTEN)) {
3349 int err; 3415 int err;
@@ -3358,7 +3424,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3358 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3424 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3359 } 3425 }
3360 3426
3361retake_lock:
3362 if (iov_iter_rw(iter) == WRITE) 3427 if (iov_iter_rw(iter) == WRITE)
3363 inode_dio_end(inode); 3428 inode_dio_end(inode);
3364 /* take i_mutex locking again if we do an overwrite dio */ 3429 /* take i_mutex locking again if we do an overwrite dio */
@@ -5261,6 +5326,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5261 might_sleep(); 5326 might_sleep();
5262 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 5327 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5263 err = ext4_reserve_inode_write(handle, inode, &iloc); 5328 err = ext4_reserve_inode_write(handle, inode, &iloc);
5329 if (err)
5330 return err;
5264 if (ext4_handle_valid(handle) && 5331 if (ext4_handle_valid(handle) &&
5265 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5332 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5266 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 5333 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
@@ -5291,9 +5358,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5291 } 5358 }
5292 } 5359 }
5293 } 5360 }
5294 if (!err) 5361 return ext4_mark_iloc_dirty(handle, inode, &iloc);
5295 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5296 return err;
5297} 5362}
5298 5363
5299/* 5364/*
@@ -5502,7 +5567,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5502 unlock_page(page); 5567 unlock_page(page);
5503 /* OK, we need to fill the hole... */ 5568 /* OK, we need to fill the hole... */
5504 if (ext4_should_dioread_nolock(inode)) 5569 if (ext4_should_dioread_nolock(inode))
5505 get_block = ext4_get_block_write; 5570 get_block = ext4_get_block_unwritten;
5506 else 5571 else
5507 get_block = ext4_get_block; 5572 get_block = ext4_get_block;
5508retry_alloc: 5573retry_alloc:
@@ -5545,3 +5610,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
5545 5610
5546 return err; 5611 return err;
5547} 5612}
5613
5614/*
5615 * Find the first non-hole extent at or after @lblk in an inode.
5616 * Search for @map_len blocks at most. The extent is returned in @result.
5617 *
5618 * Returns 1 if an extent was found, and 0 if there is no extent at or
5619 * after @lblk (in which case @result->es_len is also set to 0). On
5620 * error, a negative error code is returned.
5621 */
5622int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
5623 unsigned int map_len, struct extent_status *result)
5624{
5625 struct ext4_map_blocks map;
5626 struct extent_status es = {};
5627 int ret;
5628
5629 map.m_lblk = lblk;
5630 map.m_len = map_len;
5631
5632 /*
5633 * For non-extent-based files this loop may iterate several times since
5634 * we do not determine the full hole size.
5635 */
5636 while (map.m_len > 0) {
5637 ret = ext4_map_blocks(NULL, inode, &map, 0);
5638 if (ret < 0)
5639 return ret;
5640 /* Is there an extent covering m_lblk? Just return it. */
5641 if (ret > 0) {
5642 int status;
5643
5644 ext4_es_store_pblock(result, map.m_pblk);
5645 result->es_lblk = map.m_lblk;
5646 result->es_len = map.m_len;
5647 if (map.m_flags & EXT4_MAP_UNWRITTEN)
5648 status = EXTENT_STATUS_UNWRITTEN;
5649 else
5650 status = EXTENT_STATUS_WRITTEN;
5651 ext4_es_store_status(result, status);
5652 return 1;
5653 }
5654 ext4_es_find_delayed_extent_range(inode, map.m_lblk,
5655 map.m_lblk + map.m_len - 1,
5656 &es);
5657 /* Is there delalloc data before the next block in the extent tree? */
5658 if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
5659 ext4_lblk_t offset = 0;
5660
5661 if (es.es_lblk < lblk)
5662 offset = lblk - es.es_lblk;
5663 result->es_lblk = es.es_lblk + offset;
5664 ext4_es_store_pblock(result,
5665 ext4_es_pblock(&es) + offset);
5666 result->es_len = es.es_len - offset;
5667 ext4_es_store_status(result, ext4_es_status(&es));
5668
5669 return 1;
5670 }
5671 /* There's a hole at m_lblk; advance past it */
5672 map.m_lblk += map.m_len;
5673 map_len -= map.m_len;
5674 map.m_len = map_len;
5675 cond_resched();
5676 }
5677 result->es_len = 0;
5678 return 0;
5679}
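
ext4_get_next_extent() is self-contained above; a hypothetical caller would walk a range roughly like this (the helper name and loop are illustrative only, not part of the patch):

	static int walk_extents(struct inode *inode, ext4_lblk_t lblk,
				unsigned int len)
	{
		struct extent_status es;
		int ret;

		/* Each found extent starts at or after lblk, so lblk strictly
		 * advances and the walk terminates. */
		while ((ret = ext4_get_next_extent(inode, lblk, len, &es)) == 1)
			lblk = es.es_lblk + es.es_len;
		return ret;	/* 0: no further extents, <0: error */
	}
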
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4424b7bf8ac6..50e05df28f66 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -11,7 +11,7 @@
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public Licens 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 */ 17 */
@@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
815 * for this page; do not hold this lock when calling this routine! 815 * for this page; do not hold this lock when calling this routine!
816 */ 816 */
817 817
818static int ext4_mb_init_cache(struct page *page, char *incore) 818static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
819{ 819{
820 ext4_group_t ngroups; 820 ext4_group_t ngroups;
821 int blocksize; 821 int blocksize;
@@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
848 /* allocate buffer_heads to read bitmaps */ 848 /* allocate buffer_heads to read bitmaps */
849 if (groups_per_page > 1) { 849 if (groups_per_page > 1) {
850 i = sizeof(struct buffer_head *) * groups_per_page; 850 i = sizeof(struct buffer_head *) * groups_per_page;
851 bh = kzalloc(i, GFP_NOFS); 851 bh = kzalloc(i, gfp);
852 if (bh == NULL) { 852 if (bh == NULL) {
853 err = -ENOMEM; 853 err = -ENOMEM;
854 goto out; 854 goto out;
@@ -983,7 +983,7 @@ out:
983 * are on the same page e4b->bd_buddy_page is NULL and return value is 0. 983 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
984 */ 984 */
985static int ext4_mb_get_buddy_page_lock(struct super_block *sb, 985static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
986 ext4_group_t group, struct ext4_buddy *e4b) 986 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
987{ 987{
988 struct inode *inode = EXT4_SB(sb)->s_buddy_cache; 988 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
989 int block, pnum, poff; 989 int block, pnum, poff;
@@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
1002 block = group * 2; 1002 block = group * 2;
1003 pnum = block / blocks_per_page; 1003 pnum = block / blocks_per_page;
1004 poff = block % blocks_per_page; 1004 poff = block % blocks_per_page;
1005 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1005 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1006 if (!page) 1006 if (!page)
1007 return -ENOMEM; 1007 return -ENOMEM;
1008 BUG_ON(page->mapping != inode->i_mapping); 1008 BUG_ON(page->mapping != inode->i_mapping);
@@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
1016 1016
1017 block++; 1017 block++;
1018 pnum = block / blocks_per_page; 1018 pnum = block / blocks_per_page;
1019 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1019 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1020 if (!page) 1020 if (!page)
1021 return -ENOMEM; 1021 return -ENOMEM;
1022 BUG_ON(page->mapping != inode->i_mapping); 1022 BUG_ON(page->mapping != inode->i_mapping);
@@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1042 * calling this routine! 1042 * calling this routine!
1043 */ 1043 */
1044static noinline_for_stack 1044static noinline_for_stack
1045int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1045int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1046{ 1046{
1047 1047
1048 struct ext4_group_info *this_grp; 1048 struct ext4_group_info *this_grp;
@@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1062 * The call to ext4_mb_get_buddy_page_lock will mark the 1062 * The call to ext4_mb_get_buddy_page_lock will mark the
1063 * page accessed. 1063 * page accessed.
1064 */ 1064 */
1065 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); 1065 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1066 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { 1066 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1067 /* 1067 /*
1068 * somebody initialized the group 1068 * somebody initialized the group
@@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1072 } 1072 }
1073 1073
1074 page = e4b.bd_bitmap_page; 1074 page = e4b.bd_bitmap_page;
1075 ret = ext4_mb_init_cache(page, NULL); 1075 ret = ext4_mb_init_cache(page, NULL, gfp);
1076 if (ret) 1076 if (ret)
1077 goto err; 1077 goto err;
1078 if (!PageUptodate(page)) { 1078 if (!PageUptodate(page)) {
@@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1091 } 1091 }
1092 /* init buddy cache */ 1092 /* init buddy cache */
1093 page = e4b.bd_buddy_page; 1093 page = e4b.bd_buddy_page;
1094 ret = ext4_mb_init_cache(page, e4b.bd_bitmap); 1094 ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1095 if (ret) 1095 if (ret)
1096 goto err; 1096 goto err;
1097 if (!PageUptodate(page)) { 1097 if (!PageUptodate(page)) {
@@ -1109,8 +1109,8 @@ err:
1109 * calling this routine! 1109 * calling this routine!
1110 */ 1110 */
1111static noinline_for_stack int 1111static noinline_for_stack int
1112ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1112ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1113 struct ext4_buddy *e4b) 1113 struct ext4_buddy *e4b, gfp_t gfp)
1114{ 1114{
1115 int blocks_per_page; 1115 int blocks_per_page;
1116 int block; 1116 int block;
@@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1140 * we need full data about the group 1140 * we need full data about the group
1141 * to make a good selection 1141 * to make a good selection
1142 */ 1142 */
1143 ret = ext4_mb_init_group(sb, group); 1143 ret = ext4_mb_init_group(sb, group, gfp);
1144 if (ret) 1144 if (ret)
1145 return ret; 1145 return ret;
1146 } 1146 }
@@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1168 * wait for it to initialize. 1168 * wait for it to initialize.
1169 */ 1169 */
1170 page_cache_release(page); 1170 page_cache_release(page);
1171 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1171 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1172 if (page) { 1172 if (page) {
1173 BUG_ON(page->mapping != inode->i_mapping); 1173 BUG_ON(page->mapping != inode->i_mapping);
1174 if (!PageUptodate(page)) { 1174 if (!PageUptodate(page)) {
1175 ret = ext4_mb_init_cache(page, NULL); 1175 ret = ext4_mb_init_cache(page, NULL, gfp);
1176 if (ret) { 1176 if (ret) {
1177 unlock_page(page); 1177 unlock_page(page);
1178 goto err; 1178 goto err;
@@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1204 if (page == NULL || !PageUptodate(page)) { 1204 if (page == NULL || !PageUptodate(page)) {
1205 if (page) 1205 if (page)
1206 page_cache_release(page); 1206 page_cache_release(page);
1207 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1207 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1208 if (page) { 1208 if (page) {
1209 BUG_ON(page->mapping != inode->i_mapping); 1209 BUG_ON(page->mapping != inode->i_mapping);
1210 if (!PageUptodate(page)) { 1210 if (!PageUptodate(page)) {
1211 ret = ext4_mb_init_cache(page, e4b->bd_bitmap); 1211 ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1212 gfp);
1212 if (ret) { 1213 if (ret) {
1213 unlock_page(page); 1214 unlock_page(page);
1214 goto err; 1215 goto err;
@@ -1247,6 +1248,12 @@ err:
1247 return ret; 1248 return ret;
1248} 1249}
1249 1250
1251static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1252 struct ext4_buddy *e4b)
1253{
1254 return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1255}
1256
1250static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) 1257static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1251{ 1258{
1252 if (e4b->bd_bitmap_page) 1259 if (e4b->bd_bitmap_page)
@@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2045 2052
2046 /* We only do this if the grp has never been initialized */ 2053 /* We only do this if the grp has never been initialized */
2047 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 2054 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2048 int ret = ext4_mb_init_group(ac->ac_sb, group); 2055 int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
2049 if (ret) 2056 if (ret)
2050 return ret; 2057 return ret;
2051 } 2058 }
@@ -4695,16 +4702,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4695 } 4702 }
4696 4703
4697 /* 4704 /*
4698 * We need to make sure we don't reuse the freed block until
4699 * after the transaction is committed, which we can do by
4700 * treating the block as metadata, below. We make an
4701 * exception if the inode is to be written in writeback mode
4702 * since writeback mode has weak data consistency guarantees.
4703 */
4704 if (!ext4_should_writeback_data(inode))
4705 flags |= EXT4_FREE_BLOCKS_METADATA;
4706
4707 /*
4708 * If the extent to be freed does not begin on a cluster 4705 * If the extent to be freed does not begin on a cluster
4709 * boundary, we need to deal with partial clusters at the 4706 * boundary, we need to deal with partial clusters at the
4710 * beginning and end of the extent. Normally we will free 4707 * beginning and end of the extent. Normally we will free
@@ -4738,14 +4735,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4738 4735
4739 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 4736 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4740 int i; 4737 int i;
4738 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
4741 4739
4742 for (i = 0; i < count; i++) { 4740 for (i = 0; i < count; i++) {
4743 cond_resched(); 4741 cond_resched();
4744 bh = sb_find_get_block(inode->i_sb, block + i); 4742 if (is_metadata)
4745 if (!bh) 4743 bh = sb_find_get_block(inode->i_sb, block + i);
4746 continue; 4744 ext4_forget(handle, is_metadata, inode, bh, block + i);
4747 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4748 inode, bh, block + i);
4749 } 4745 }
4750 } 4746 }
4751 4747
@@ -4815,16 +4811,23 @@ do_more:
4815#endif 4811#endif
4816 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); 4812 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4817 4813
4818 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4814 /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
4815 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
4816 GFP_NOFS|__GFP_NOFAIL);
4819 if (err) 4817 if (err)
4820 goto error_return; 4818 goto error_return;
4821 4819
4822 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { 4820 /*
4821 * We need to make sure we don't reuse the freed block until after the
4822 * transaction is committed. We make an exception if the inode is to be
4823 * written in writeback mode since writeback mode has weak data
4824 * consistency guarantees.
4825 */
4826 if (ext4_handle_valid(handle) &&
4827 ((flags & EXT4_FREE_BLOCKS_METADATA) ||
4828 !ext4_should_writeback_data(inode))) {
4823 struct ext4_free_data *new_entry; 4829 struct ext4_free_data *new_entry;
4824 /* 4830 /*
4825 * blocks being freed are metadata. these blocks shouldn't
4826 * be used until this transaction is committed
4827 *
4828 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed 4831 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
4829 * to fail. 4832 * to fail.
4830 */ 4833 */
@@ -5217,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5217 grp = ext4_get_group_info(sb, group); 5220 grp = ext4_get_group_info(sb, group);
5218 /* We only do this if the grp has never been initialized */ 5221 /* We only do this if the grp has never been initialized */
5219 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 5222 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5220 ret = ext4_mb_init_group(sb, group); 5223 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
5221 if (ret) 5224 if (ret)
5222 break; 5225 break;
5223 } 5226 }
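
The mballoc hunks above apply one pattern repeatedly: thread a gfp_t down the call chain and keep the old entry point as a thin wrapper supplying the historical GFP_NOFS default, so that the free path can opt into GFP_NOFS|__GFP_NOFAIL. A stand-alone sketch of that shape, with stand-in types and flag values (not the kernel's) so it compiles outside the kernel:

	typedef unsigned int gfp_t;
	#define GFP_NOFS	0x01u	/* stand-in value, not the kernel's */
	#define __GFP_NOFAIL	0x02u	/* stand-in value, not the kernel's */

	static int load_buddy_gfp(int group, gfp_t gfp)
	{
		(void)group;
		(void)gfp;	/* allocations would use the caller's mask */
		return 0;
	}

	/* Old entry point survives as a wrapper with the old default. */
	static int load_buddy(int group)
	{
		return load_buddy_gfp(group, GFP_NOFS);
	}

	/* Paths that must not fail (e.g. ext4_free_blocks) opt in explicitly. */
	static int free_path(int group)
	{
		return load_buddy_gfp(group, GFP_NOFS | __GFP_NOFAIL);
	}
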
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d634e183b4d4..3ef1df6ae9ec 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -23,18 +23,6 @@
23#include "ext4.h" 23#include "ext4.h"
24 24
25/* 25/*
26 * with AGGRESSIVE_CHECK allocator runs consistency checks over
27 * structures. these checks slow things down a lot
28 */
29#define AGGRESSIVE_CHECK__
30
31/*
32 * with DOUBLE_CHECK defined mballoc creates persistent in-core
33 * bitmaps, maintains and uses them to check for double allocations
34 */
35#define DOUBLE_CHECK__
36
37/*
38 */ 26 */
39#ifdef CONFIG_EXT4_DEBUG 27#ifdef CONFIG_EXT4_DEBUG
40extern ushort ext4_mballoc_debug; 28extern ushort ext4_mballoc_debug;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a4651894cc33..364ea4d4a943 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
361 * blocks. 361 * blocks.
362 * 362 *
363 * While converting to extents we need not 363 * While converting to extents we need not
364 * update the orignal inode i_blocks for extent blocks 364 * update the original inode i_blocks for extent blocks
365 * via quota APIs. The quota update happened via tmp_inode already. 365 * via quota APIs. The quota update happened via tmp_inode already.
366 */ 366 */
367 spin_lock(&inode->i_lock); 367 spin_lock(&inode->i_lock);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 0a512aa81bf7..24445275d330 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
91 submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); 91 submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
92 wait_on_buffer(*bh); 92 wait_on_buffer(*bh);
93 if (!buffer_uptodate(*bh)) { 93 if (!buffer_uptodate(*bh)) {
94 brelse(*bh);
95 *bh = NULL;
96 ret = -EIO; 94 ret = -EIO;
97 goto warn_exit; 95 goto warn_exit;
98 } 96 }
99
100 mmp = (struct mmp_struct *)((*bh)->b_data); 97 mmp = (struct mmp_struct *)((*bh)->b_data);
101 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) 98 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
102 ret = -EFSCORRUPTED; 99 ret = -EFSCORRUPTED;
103 else if (!ext4_mmp_csum_verify(sb, mmp)) 100 goto warn_exit;
101 }
102 if (!ext4_mmp_csum_verify(sb, mmp)) {
104 ret = -EFSBADCRC; 103 ret = -EFSBADCRC;
105 else 104 goto warn_exit;
106 return 0; 105 }
107 106 return 0;
108warn_exit: 107warn_exit:
108 brelse(*bh);
109 *bh = NULL;
109 ext4_warning(sb, "Error %d while reading MMP block %llu", 110 ext4_warning(sb, "Error %d while reading MMP block %llu",
110 ret, mmp_block); 111 ret, mmp_block);
111 return ret; 112 return ret;
@@ -181,15 +182,13 @@ static int kmmpd(void *data)
181 EXT4_FEATURE_INCOMPAT_MMP)) { 182 EXT4_FEATURE_INCOMPAT_MMP)) {
182 ext4_warning(sb, "kmmpd being stopped since MMP feature" 183 ext4_warning(sb, "kmmpd being stopped since MMP feature"
183 " has been disabled."); 184 " has been disabled.");
184 EXT4_SB(sb)->s_mmp_tsk = NULL; 185 goto exit_thread;
185 goto failed;
186 } 186 }
187 187
188 if (sb->s_flags & MS_RDONLY) { 188 if (sb->s_flags & MS_RDONLY) {
189 ext4_warning(sb, "kmmpd being stopped since filesystem " 189 ext4_warning(sb, "kmmpd being stopped since filesystem "
190 "has been remounted as readonly."); 190 "has been remounted as readonly.");
191 EXT4_SB(sb)->s_mmp_tsk = NULL; 191 goto exit_thread;
192 goto failed;
193 } 192 }
194 193
195 diff = jiffies - last_update_time; 194 diff = jiffies - last_update_time;
@@ -211,9 +210,7 @@ static int kmmpd(void *data)
211 if (retval) { 210 if (retval) {
212 ext4_error(sb, "error reading MMP data: %d", 211 ext4_error(sb, "error reading MMP data: %d",
213 retval); 212 retval);
214 213 goto exit_thread;
215 EXT4_SB(sb)->s_mmp_tsk = NULL;
216 goto failed;
217 } 214 }
218 215
219 mmp_check = (struct mmp_struct *)(bh_check->b_data); 216 mmp_check = (struct mmp_struct *)(bh_check->b_data);
@@ -225,7 +222,9 @@ static int kmmpd(void *data)
225 "The filesystem seems to have been" 222 "The filesystem seems to have been"
226 " multiply mounted."); 223 " multiply mounted.");
227 ext4_error(sb, "abort"); 224 ext4_error(sb, "abort");
228 goto failed; 225 put_bh(bh_check);
226 retval = -EBUSY;
227 goto exit_thread;
229 } 228 }
230 put_bh(bh_check); 229 put_bh(bh_check);
231 } 230 }
@@ -248,7 +247,8 @@ static int kmmpd(void *data)
248 247
249 retval = write_mmp_block(sb, bh); 248 retval = write_mmp_block(sb, bh);
250 249
251failed: 250exit_thread:
251 EXT4_SB(sb)->s_mmp_tsk = NULL;
252 kfree(data); 252 kfree(data);
253 brelse(bh); 253 brelse(bh);
254 return retval; 254 return retval;
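
The kmmpd() changes above are a single-exit cleanup: every bail-out used to clear EXT4_SB(sb)->s_mmp_tsk by hand, and the multiply-mounted path both forgot to clear it and returned no error, so the patch funnels all exits through one exit_thread label. The shape, as a minimal sketch with stub helpers standing in for the real MMP checks:

	static int kmmpd_shape(void *data)
	{
		int retval = 0;

		for (;;) {
			if (feature_disabled() || remounted_ro())
				goto exit_thread;
			if (multiply_mounted()) {
				retval = -EBUSY;
				goto exit_thread;
			}
			/* ...sleep, rewrite the MMP block, loop... */
		}
	exit_thread:
		clear_mmp_task();	/* done once, on every exit path */
		kfree(data);
		return retval;
	}
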
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 090b3498638e..d77d15f4b674 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
128 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); 128 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
129 WARN_ON(io_end->handle); 129 WARN_ON(io_end->handle);
130 130
131 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
132 wake_up_all(ext4_ioend_wq(io_end->inode));
133
134 for (bio = io_end->bio; bio; bio = next_bio) { 131 for (bio = io_end->bio; bio; bio = next_bio) {
135 next_bio = bio->bi_private; 132 next_bio = bio->bi_private;
136 ext4_finish_bio(bio); 133 ext4_finish_bio(bio);
@@ -139,16 +136,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
139 kmem_cache_free(io_end_cachep, io_end); 136 kmem_cache_free(io_end_cachep, io_end);
140} 137}
141 138
142static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
143{
144 struct inode *inode = io_end->inode;
145
146 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
147 /* Wake up anyone waiting on unwritten extent conversion */
148 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
149 wake_up_all(ext4_ioend_wq(inode));
150}
151
152/* 139/*
153 * Check a range of space and convert unwritten extents to written. Note that 140 * Check a range of space and convert unwritten extents to written. Note that
154 * we are protected from truncate touching same part of extent tree by the 141 * we are protected from truncate touching same part of extent tree by the
@@ -265,7 +252,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
265{ 252{
266 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); 253 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
267 if (io) { 254 if (io) {
268 atomic_inc(&EXT4_I(inode)->i_ioend_count);
269 io->inode = inode; 255 io->inode = inode;
270 INIT_LIST_HEAD(&io->list); 256 INIT_LIST_HEAD(&io->list);
271 atomic_set(&io->count, 1); 257 atomic_set(&io->count, 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3ed01ec011d7..539297515896 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -55,7 +55,6 @@
55 55
56static struct ext4_lazy_init *ext4_li_info; 56static struct ext4_lazy_init *ext4_li_info;
57static struct mutex ext4_li_mtx; 57static struct mutex ext4_li_mtx;
58static int ext4_mballoc_ready;
59static struct ratelimit_state ext4_mount_msg_ratelimit; 58static struct ratelimit_state ext4_mount_msg_ratelimit;
60 59
61static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 60static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
@@ -844,7 +843,6 @@ static void ext4_put_super(struct super_block *sb)
844 ext4_release_system_zone(sb); 843 ext4_release_system_zone(sb);
845 ext4_mb_release(sb); 844 ext4_mb_release(sb);
846 ext4_ext_release(sb); 845 ext4_ext_release(sb);
847 ext4_xattr_put_super(sb);
848 846
849 if (!(sb->s_flags & MS_RDONLY)) { 847 if (!(sb->s_flags & MS_RDONLY)) {
850 ext4_clear_feature_journal_needs_recovery(sb); 848 ext4_clear_feature_journal_needs_recovery(sb);
@@ -944,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
944 spin_lock_init(&ei->i_completed_io_lock); 942 spin_lock_init(&ei->i_completed_io_lock);
945 ei->i_sync_tid = 0; 943 ei->i_sync_tid = 0;
946 ei->i_datasync_tid = 0; 944 ei->i_datasync_tid = 0;
947 atomic_set(&ei->i_ioend_count, 0);
948 atomic_set(&ei->i_unwritten, 0); 945 atomic_set(&ei->i_unwritten, 0);
949 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); 946 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
950#ifdef CONFIG_EXT4_FS_ENCRYPTION 947#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -1132,6 +1129,7 @@ static const struct dquot_operations ext4_quota_operations = {
1132 .alloc_dquot = dquot_alloc, 1129 .alloc_dquot = dquot_alloc,
1133 .destroy_dquot = dquot_destroy, 1130 .destroy_dquot = dquot_destroy,
1134 .get_projid = ext4_get_projid, 1131 .get_projid = ext4_get_projid,
1132 .get_next_id = dquot_get_next_id,
1135}; 1133};
1136 1134
1137static const struct quotactl_ops ext4_qctl_operations = { 1135static const struct quotactl_ops ext4_qctl_operations = {
@@ -1141,7 +1139,8 @@ static const struct quotactl_ops ext4_qctl_operations = {
1141 .get_state = dquot_get_state, 1139 .get_state = dquot_get_state,
1142 .set_info = dquot_set_dqinfo, 1140 .set_info = dquot_set_dqinfo,
1143 .get_dqblk = dquot_get_dqblk, 1141 .get_dqblk = dquot_get_dqblk,
1144 .set_dqblk = dquot_set_dqblk 1142 .set_dqblk = dquot_set_dqblk,
1143 .get_nextdqblk = dquot_get_next_dqblk,
1145}; 1144};
1146#endif 1145#endif
1147 1146
@@ -1425,9 +1424,9 @@ static const struct mount_opts {
1425 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, 1424 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1426 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, 1425 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
1427 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, 1426 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
1428 MOPT_NO_EXT2 | MOPT_SET}, 1427 MOPT_NO_EXT2},
1429 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, 1428 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
1430 MOPT_NO_EXT2 | MOPT_CLEAR}, 1429 MOPT_NO_EXT2},
1431 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, 1430 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1432 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, 1431 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1433 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, 1432 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
@@ -1705,6 +1704,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1705 ext4_msg(sb, KERN_INFO, "dax option not supported"); 1704 ext4_msg(sb, KERN_INFO, "dax option not supported");
1706 return -1; 1705 return -1;
1707#endif 1706#endif
1707 } else if (token == Opt_data_err_abort) {
1708 sbi->s_mount_opt |= m->mount_opt;
1709 } else if (token == Opt_data_err_ignore) {
1710 sbi->s_mount_opt &= ~m->mount_opt;
1708 } else { 1711 } else {
1709 if (!args->from) 1712 if (!args->from)
1710 arg = 1; 1713 arg = 1;
@@ -1914,6 +1917,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
1914 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); 1917 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
1915 if (nodefs || sbi->s_max_dir_size_kb) 1918 if (nodefs || sbi->s_max_dir_size_kb)
1916 SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); 1919 SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
1920 if (test_opt(sb, DATA_ERR_ABORT))
1921 SEQ_OPTS_PUTS("data_err=abort");
1917 1922
1918 ext4_show_quota_options(seq, sb); 1923 ext4_show_quota_options(seq, sb);
1919 return 0; 1924 return 0;
@@ -3796,12 +3801,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3796 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; 3801 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
3797 3802
3798no_journal: 3803no_journal:
3799 if (ext4_mballoc_ready) { 3804 sbi->s_mb_cache = ext4_xattr_create_cache();
3800 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); 3805 if (!sbi->s_mb_cache) {
3801 if (!sbi->s_mb_cache) { 3806 ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
3802 ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); 3807 goto failed_mount_wq;
3803 goto failed_mount_wq;
3804 }
3805 } 3808 }
3806 3809
3807 if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && 3810 if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
@@ -4027,6 +4030,10 @@ failed_mount4:
4027 if (EXT4_SB(sb)->rsv_conversion_wq) 4030 if (EXT4_SB(sb)->rsv_conversion_wq)
4028 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); 4031 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4029failed_mount_wq: 4032failed_mount_wq:
4033 if (sbi->s_mb_cache) {
4034 ext4_xattr_destroy_cache(sbi->s_mb_cache);
4035 sbi->s_mb_cache = NULL;
4036 }
4030 if (sbi->s_journal) { 4037 if (sbi->s_journal) {
4031 jbd2_journal_destroy(sbi->s_journal); 4038 jbd2_journal_destroy(sbi->s_journal);
4032 sbi->s_journal = NULL; 4039 sbi->s_journal = NULL;
@@ -5321,7 +5328,6 @@ MODULE_ALIAS_FS("ext4");
5321 5328
5322/* Shared across all ext4 file systems */ 5329/* Shared across all ext4 file systems */
5323wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 5330wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
5324struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
5325 5331
5326static int __init ext4_init_fs(void) 5332static int __init ext4_init_fs(void)
5327{ 5333{
@@ -5334,10 +5340,8 @@ static int __init ext4_init_fs(void)
5334 /* Build-time check for flags consistency */ 5340 /* Build-time check for flags consistency */
5335 ext4_check_flag_values(); 5341 ext4_check_flag_values();
5336 5342
5337 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { 5343 for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
5338 mutex_init(&ext4__aio_mutex[i]);
5339 init_waitqueue_head(&ext4__ioend_wq[i]); 5344 init_waitqueue_head(&ext4__ioend_wq[i]);
5340 }
5341 5345
5342 err = ext4_init_es(); 5346 err = ext4_init_es();
5343 if (err) 5347 if (err)
@@ -5358,8 +5362,6 @@ static int __init ext4_init_fs(void)
5358 err = ext4_init_mballoc(); 5362 err = ext4_init_mballoc();
5359 if (err) 5363 if (err)
5360 goto out2; 5364 goto out2;
5361 else
5362 ext4_mballoc_ready = 1;
5363 err = init_inodecache(); 5365 err = init_inodecache();
5364 if (err) 5366 if (err)
5365 goto out1; 5367 goto out1;
@@ -5375,7 +5377,6 @@ out:
5375 unregister_as_ext3(); 5377 unregister_as_ext3();
5376 destroy_inodecache(); 5378 destroy_inodecache();
5377out1: 5379out1:
5378 ext4_mballoc_ready = 0;
5379 ext4_exit_mballoc(); 5380 ext4_exit_mballoc();
5380out2: 5381out2:
5381 ext4_exit_sysfs(); 5382 ext4_exit_sysfs();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index a95151e875bd..0441e055c8e8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -545,30 +545,44 @@ static void
545ext4_xattr_release_block(handle_t *handle, struct inode *inode, 545ext4_xattr_release_block(handle_t *handle, struct inode *inode,
546 struct buffer_head *bh) 546 struct buffer_head *bh)
547{ 547{
548 struct mb_cache_entry *ce = NULL;
549 int error = 0;
550 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); 548 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
549 u32 hash, ref;
550 int error = 0;
551 551
552 ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
553 BUFFER_TRACE(bh, "get_write_access"); 552 BUFFER_TRACE(bh, "get_write_access");
554 error = ext4_journal_get_write_access(handle, bh); 553 error = ext4_journal_get_write_access(handle, bh);
555 if (error) 554 if (error)
556 goto out; 555 goto out;
557 556
558 lock_buffer(bh); 557 lock_buffer(bh);
559 if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { 558 hash = le32_to_cpu(BHDR(bh)->h_hash);
559 ref = le32_to_cpu(BHDR(bh)->h_refcount);
560 if (ref == 1) {
560 ea_bdebug(bh, "refcount now=0; freeing"); 561 ea_bdebug(bh, "refcount now=0; freeing");
561 if (ce) 562 /*
562 mb_cache_entry_free(ce); 563 * This must happen under buffer lock for
564 * ext4_xattr_block_set() to reliably detect a freed block
565 */
566 mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
563 get_bh(bh); 567 get_bh(bh);
564 unlock_buffer(bh); 568 unlock_buffer(bh);
565 ext4_free_blocks(handle, inode, bh, 0, 1, 569 ext4_free_blocks(handle, inode, bh, 0, 1,
566 EXT4_FREE_BLOCKS_METADATA | 570 EXT4_FREE_BLOCKS_METADATA |
567 EXT4_FREE_BLOCKS_FORGET); 571 EXT4_FREE_BLOCKS_FORGET);
568 } else { 572 } else {
569 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 573 ref--;
570 if (ce) 574 BHDR(bh)->h_refcount = cpu_to_le32(ref);
571 mb_cache_entry_release(ce); 575 if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
576 struct mb_cache_entry *ce;
577
578 ce = mb_cache_entry_get(ext4_mb_cache, hash,
579 bh->b_blocknr);
580 if (ce) {
581 ce->e_reusable = 1;
582 mb_cache_entry_put(ext4_mb_cache, ce);
583 }
584 }
585
572 /* 586 /*
573 * Beware of this ugliness: Releasing of xattr block references 587 * Beware of this ugliness: Releasing of xattr block references
574 * from different inodes can race and so we have to protect 588 * from different inodes can race and so we have to protect
@@ -790,8 +804,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
790 if (i->value && i->value_len > sb->s_blocksize) 804 if (i->value && i->value_len > sb->s_blocksize)
791 return -ENOSPC; 805 return -ENOSPC;
792 if (s->base) { 806 if (s->base) {
793 ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
794 bs->bh->b_blocknr);
795 BUFFER_TRACE(bs->bh, "get_write_access"); 807 BUFFER_TRACE(bs->bh, "get_write_access");
796 error = ext4_journal_get_write_access(handle, bs->bh); 808 error = ext4_journal_get_write_access(handle, bs->bh);
797 if (error) 809 if (error)
@@ -799,10 +811,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
799 lock_buffer(bs->bh); 811 lock_buffer(bs->bh);
800 812
801 if (header(s->base)->h_refcount == cpu_to_le32(1)) { 813 if (header(s->base)->h_refcount == cpu_to_le32(1)) {
802 if (ce) { 814 __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash);
803 mb_cache_entry_free(ce); 815
804 ce = NULL; 816 /*
805 } 817 * This must happen under buffer lock for
818 * ext4_xattr_block_set() to reliably detect a
819 * modified block
820 */
821 mb_cache_entry_delete_block(ext4_mb_cache, hash,
822 bs->bh->b_blocknr);
806 ea_bdebug(bs->bh, "modifying in-place"); 823 ea_bdebug(bs->bh, "modifying in-place");
807 error = ext4_xattr_set_entry(i, s); 824 error = ext4_xattr_set_entry(i, s);
808 if (!error) { 825 if (!error) {
@@ -826,10 +843,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
826 int offset = (char *)s->here - bs->bh->b_data; 843 int offset = (char *)s->here - bs->bh->b_data;
827 844
828 unlock_buffer(bs->bh); 845 unlock_buffer(bs->bh);
829 if (ce) {
830 mb_cache_entry_release(ce);
831 ce = NULL;
832 }
833 ea_bdebug(bs->bh, "cloning"); 846 ea_bdebug(bs->bh, "cloning");
834 s->base = kmalloc(bs->bh->b_size, GFP_NOFS); 847 s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
835 error = -ENOMEM; 848 error = -ENOMEM;
@@ -872,6 +885,8 @@ inserted:
872 if (new_bh == bs->bh) 885 if (new_bh == bs->bh)
873 ea_bdebug(new_bh, "keeping"); 886 ea_bdebug(new_bh, "keeping");
874 else { 887 else {
888 u32 ref;
889
875 /* The old block is released after updating 890 /* The old block is released after updating
876 the inode. */ 891 the inode. */
877 error = dquot_alloc_block(inode, 892 error = dquot_alloc_block(inode,
@@ -884,9 +899,40 @@ inserted:
884 if (error) 899 if (error)
885 goto cleanup_dquot; 900 goto cleanup_dquot;
886 lock_buffer(new_bh); 901 lock_buffer(new_bh);
887 le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); 902 /*
903 * We have to be careful about races with
904 * freeing, rehashing or adding references to
905 * the xattr block. Once we hold the buffer
906 * lock, the xattr block's state is stable, so
907 * we can check whether the block got freed or
908 * rehashed. Since we unhash the mbcache entry
909 * under the buffer lock when freeing or
910 * rehashing an xattr block, checking whether
911 * the entry is still hashed is reliable. The
912 * same rules hold for e_reusable handling.
913 */
914 if (hlist_bl_unhashed(&ce->e_hash_list) ||
915 !ce->e_reusable) {
916 /*
917 * Undo everything and check mbcache
918 * again.
919 */
920 unlock_buffer(new_bh);
921 dquot_free_block(inode,
922 EXT4_C2B(EXT4_SB(sb),
923 1));
924 brelse(new_bh);
925 mb_cache_entry_put(ext4_mb_cache, ce);
926 ce = NULL;
927 new_bh = NULL;
928 goto inserted;
929 }
930 ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
931 BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
932 if (ref >= EXT4_XATTR_REFCOUNT_MAX)
933 ce->e_reusable = 0;
888 ea_bdebug(new_bh, "reusing; refcount now=%d", 934 ea_bdebug(new_bh, "reusing; refcount now=%d",
889 le32_to_cpu(BHDR(new_bh)->h_refcount)); 935 ref);
890 unlock_buffer(new_bh); 936 unlock_buffer(new_bh);
891 error = ext4_handle_dirty_xattr_block(handle, 937 error = ext4_handle_dirty_xattr_block(handle,
892 inode, 938 inode,
@@ -894,7 +940,8 @@ inserted:
894 if (error) 940 if (error)
895 goto cleanup_dquot; 941 goto cleanup_dquot;
896 } 942 }
897 mb_cache_entry_release(ce); 943 mb_cache_entry_touch(ext4_mb_cache, ce);
944 mb_cache_entry_put(ext4_mb_cache, ce);
898 ce = NULL; 945 ce = NULL;
899 } else if (bs->bh && s->base == bs->bh->b_data) { 946 } else if (bs->bh && s->base == bs->bh->b_data) {
900 /* We were modifying this block in-place. */ 947 /* We were modifying this block in-place. */
@@ -959,7 +1006,7 @@ getblk_failed:
959 1006
960cleanup: 1007cleanup:
961 if (ce) 1008 if (ce)
962 mb_cache_entry_release(ce); 1009 mb_cache_entry_put(ext4_mb_cache, ce);
963 brelse(new_bh); 1010 brelse(new_bh);
964 if (!(bs->bh && s->base == bs->bh->b_data)) 1011 if (!(bs->bh && s->base == bs->bh->b_data))
965 kfree(s->base); 1012 kfree(s->base);
@@ -1070,6 +1117,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
1070 return 0; 1117 return 0;
1071} 1118}
1072 1119
1120static int ext4_xattr_value_same(struct ext4_xattr_search *s,
1121 struct ext4_xattr_info *i)
1122{
1123 void *value;
1124
1125 if (le32_to_cpu(s->here->e_value_size) != i->value_len)
1126 return 0;
1127 value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs);
1128 return !memcmp(value, i->value, i->value_len);
1129}
1130
1073/* 1131/*
1074 * ext4_xattr_set_handle() 1132 * ext4_xattr_set_handle()
1075 * 1133 *
@@ -1146,6 +1204,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1146 else if (!bs.s.not_found) 1204 else if (!bs.s.not_found)
1147 error = ext4_xattr_block_set(handle, inode, &i, &bs); 1205 error = ext4_xattr_block_set(handle, inode, &i, &bs);
1148 } else { 1206 } else {
1207 error = 0;
1208 /* Xattr value did not change? Save ourselves some work and bail out */
1209 if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i))
1210 goto cleanup;
1211 if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
1212 goto cleanup;
1213
1149 error = ext4_xattr_ibody_set(handle, inode, &i, &is); 1214 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
1150 if (!error && !bs.s.not_found) { 1215 if (!error && !bs.s.not_found) {
1151 i.value = NULL; 1216 i.value = NULL;
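
The bail-out added above is a cheap equality test against the stored value before any journaling work begins. The test itself, restated outside the ext4 structs (a hedged sketch; the real check is ext4_xattr_value_same() earlier in this patch):

	#include <stdint.h>
	#include <string.h>

	/* Same length and same bytes: the set operation can return early
	 * without dirtying the inode or the xattr block. */
	static int value_same(const void *stored, uint32_t stored_len,
			      const void *new_val, uint32_t new_len)
	{
		return stored_len == new_len &&
		       !memcmp(stored, new_val, new_len);
	}
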
@@ -1512,17 +1577,6 @@ cleanup:
1512} 1577}
1513 1578
1514/* 1579/*
1515 * ext4_xattr_put_super()
1516 *
1517 * This is called when a file system is unmounted.
1518 */
1519void
1520ext4_xattr_put_super(struct super_block *sb)
1521{
1522 mb_cache_shrink(sb->s_bdev);
1523}
1524
1525/*
1526 * ext4_xattr_cache_insert() 1580 * ext4_xattr_cache_insert()
1527 * 1581 *
1528 * Create a new entry in the extended attribute cache, and insert 1582 * Create a new entry in the extended attribute cache, and insert
@@ -1533,26 +1587,19 @@ ext4_xattr_put_super(struct super_block *sb)
1533static void 1587static void
1534ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) 1588ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
1535{ 1589{
1536 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); 1590 struct ext4_xattr_header *header = BHDR(bh);
1537 struct mb_cache_entry *ce; 1591 __u32 hash = le32_to_cpu(header->h_hash);
1592 int reusable = le32_to_cpu(header->h_refcount) <
1593 EXT4_XATTR_REFCOUNT_MAX;
1538 int error; 1594 int error;
1539 1595
1540 ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); 1596 error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
1541 if (!ce) { 1597 bh->b_blocknr, reusable);
1542 ea_bdebug(bh, "out of memory");
1543 return;
1544 }
1545 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1546 if (error) { 1598 if (error) {
1547 mb_cache_entry_free(ce); 1599 if (error == -EBUSY)
1548 if (error == -EBUSY) {
1549 ea_bdebug(bh, "already in cache"); 1600 ea_bdebug(bh, "already in cache");
1550 error = 0; 1601 } else
1551 }
1552 } else {
1553 ea_bdebug(bh, "inserting [%x]", (int)hash); 1602 ea_bdebug(bh, "inserting [%x]", (int)hash);
1554 mb_cache_entry_release(ce);
1555 }
1556} 1603}
1557 1604
1558/* 1605/*
@@ -1614,33 +1661,20 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1614 if (!header->h_hash) 1661 if (!header->h_hash)
1615 return NULL; /* never share */ 1662 return NULL; /* never share */
1616 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1663 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1617again: 1664 ce = mb_cache_entry_find_first(ext4_mb_cache, hash);
1618 ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
1619 hash);
1620 while (ce) { 1665 while (ce) {
1621 struct buffer_head *bh; 1666 struct buffer_head *bh;
1622 1667
1623 if (IS_ERR(ce)) {
1624 if (PTR_ERR(ce) == -EAGAIN)
1625 goto again;
1626 break;
1627 }
1628 bh = sb_bread(inode->i_sb, ce->e_block); 1668 bh = sb_bread(inode->i_sb, ce->e_block);
1629 if (!bh) { 1669 if (!bh) {
1630 EXT4_ERROR_INODE(inode, "block %lu read error", 1670 EXT4_ERROR_INODE(inode, "block %lu read error",
1631 (unsigned long) ce->e_block); 1671 (unsigned long) ce->e_block);
1632 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1633 EXT4_XATTR_REFCOUNT_MAX) {
1634 ea_idebug(inode, "block %lu refcount %d>=%d",
1635 (unsigned long) ce->e_block,
1636 le32_to_cpu(BHDR(bh)->h_refcount),
1637 EXT4_XATTR_REFCOUNT_MAX);
1638 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { 1672 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
1639 *pce = ce; 1673 *pce = ce;
1640 return bh; 1674 return bh;
1641 } 1675 }
1642 brelse(bh); 1676 brelse(bh);
1643 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); 1677 ce = mb_cache_entry_find_next(ext4_mb_cache, ce);
1644 } 1678 }
1645 return NULL; 1679 return NULL;
1646} 1680}
@@ -1716,9 +1750,9 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1716#define HASH_BUCKET_BITS 10 1750#define HASH_BUCKET_BITS 10
1717 1751
1718struct mb_cache * 1752struct mb_cache *
1719ext4_xattr_create_cache(char *name) 1753ext4_xattr_create_cache(void)
1720{ 1754{
1721 return mb_cache_create(name, HASH_BUCKET_BITS); 1755 return mb_cache_create(HASH_BUCKET_BITS);
1722} 1756}
1723 1757
1724void ext4_xattr_destroy_cache(struct mb_cache *cache) 1758void ext4_xattr_destroy_cache(struct mb_cache *cache)
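
Taken together, the xattr.c hunks move ext4 to the reworked mbcache API: entries are keyed by (hash, block) via mb_cache_entry_create() with a reusable flag, found with mb_cache_entry_find_first()/mb_cache_entry_find_next(), pinned and released with mb_cache_entry_get()/mb_cache_entry_put(), and unhashed under the buffer lock with mb_cache_entry_delete_block(). Condensed from ext4_xattr_cache_find() above (error reporting elided), the lookup loop now reads:

	ce = mb_cache_entry_find_first(ext4_mb_cache, hash);
	while (ce) {
		bh = sb_bread(inode->i_sb, ce->e_block);
		if (bh && ext4_xattr_cmp(header, BHDR(bh)) == 0)
			return bh;	/* caller inherits the ce reference */
		brelse(bh);
		ce = mb_cache_entry_find_next(ext4_mb_cache, ce);
	}
	return NULL;
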
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ddc0957760ba..69dd3e6566e0 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_
108extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); 108extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
109 109
110extern void ext4_xattr_delete_inode(handle_t *, struct inode *); 110extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
111extern void ext4_xattr_put_super(struct super_block *);
112 111
113extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 112extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
114 struct ext4_inode *raw_inode, handle_t *handle); 113 struct ext4_inode *raw_inode, handle_t *handle);
@@ -124,7 +123,7 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
124 struct ext4_xattr_info *i, 123 struct ext4_xattr_info *i,
125 struct ext4_xattr_ibody_find *is); 124 struct ext4_xattr_ibody_find *is);
126 125
127extern struct mb_cache *ext4_xattr_create_cache(char *name); 126extern struct mb_cache *ext4_xattr_create_cache(void);
128extern void ext4_xattr_destroy_cache(struct mb_cache *); 127extern void ext4_xattr_destroy_cache(struct mb_cache *);
129 128
130#ifdef CONFIG_EXT4_FS_SECURITY 129#ifdef CONFIG_EXT4_FS_SECURITY
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index b0a9dc929f88..1f8982a957f1 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -1,6 +1,8 @@
1config F2FS_FS 1config F2FS_FS
2 tristate "F2FS filesystem support" 2 tristate "F2FS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 select CRYPTO
5 select CRYPTO_CRC32
4 help 6 help
5 F2FS is based on Log-structured File System (LFS), which supports 7 F2FS is based on Log-structured File System (LFS), which supports
6 versatile "flash-friendly" features. The design has been focused on 8 versatile "flash-friendly" features. The design has been focused on
@@ -76,15 +78,7 @@ config F2FS_FS_ENCRYPTION
76 bool "F2FS Encryption" 78 bool "F2FS Encryption"
77 depends on F2FS_FS 79 depends on F2FS_FS
78 depends on F2FS_FS_XATTR 80 depends on F2FS_FS_XATTR
79 select CRYPTO_AES 81 select FS_ENCRYPTION
80 select CRYPTO_CBC
81 select CRYPTO_ECB
82 select CRYPTO_XTS
83 select CRYPTO_CTS
84 select CRYPTO_CTR
85 select CRYPTO_SHA256
86 select KEYS
87 select ENCRYPTED_KEYS
88 help 82 help
89 Enable encryption of f2fs files and directories. This 83 Enable encryption of f2fs files and directories. This
90 feature is similar to ecryptfs, but it is more memory 84 feature is similar to ecryptfs, but it is more memory
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 08e101ed914c..ca949ea7c02f 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -7,5 +7,3 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
7f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o 7f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
8f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o 8f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
9f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o 9f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
10f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \
11 crypto_key.o crypto_fname.o
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 3842af954cd5..0955312e5ca0 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -39,7 +39,7 @@ repeat:
39 cond_resched(); 39 cond_resched();
40 goto repeat; 40 goto repeat;
41 } 41 }
42 f2fs_wait_on_page_writeback(page, META); 42 f2fs_wait_on_page_writeback(page, META, true);
43 SetPageUptodate(page); 43 SetPageUptodate(page);
44 return page; 44 return page;
45} 45}
@@ -56,7 +56,8 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
56 .sbi = sbi, 56 .sbi = sbi,
57 .type = META, 57 .type = META,
58 .rw = READ_SYNC | REQ_META | REQ_PRIO, 58 .rw = READ_SYNC | REQ_META | REQ_PRIO,
59 .blk_addr = index, 59 .old_blkaddr = index,
60 .new_blkaddr = index,
60 .encrypted_page = NULL, 61 .encrypted_page = NULL,
61 }; 62 };
62 63
@@ -143,7 +144,6 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
143int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, 144int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
144 int type, bool sync) 145 int type, bool sync)
145{ 146{
146 block_t prev_blk_addr = 0;
147 struct page *page; 147 struct page *page;
148 block_t blkno = start; 148 block_t blkno = start;
149 struct f2fs_io_info fio = { 149 struct f2fs_io_info fio = {
@@ -152,10 +152,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
152 .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, 152 .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA,
153 .encrypted_page = NULL, 153 .encrypted_page = NULL,
154 }; 154 };
155 struct blk_plug plug;
155 156
156 if (unlikely(type == META_POR)) 157 if (unlikely(type == META_POR))
157 fio.rw &= ~REQ_META; 158 fio.rw &= ~REQ_META;
158 159
160 blk_start_plug(&plug);
159 for (; nrpages-- > 0; blkno++) { 161 for (; nrpages-- > 0; blkno++) {
160 162
161 if (!is_valid_blkaddr(sbi, blkno, type)) 163 if (!is_valid_blkaddr(sbi, blkno, type))
@@ -167,27 +169,24 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
167 NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) 169 NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
168 blkno = 0; 170 blkno = 0;
169 /* get nat block addr */ 171 /* get nat block addr */
170 fio.blk_addr = current_nat_addr(sbi, 172 fio.new_blkaddr = current_nat_addr(sbi,
171 blkno * NAT_ENTRY_PER_BLOCK); 173 blkno * NAT_ENTRY_PER_BLOCK);
172 break; 174 break;
173 case META_SIT: 175 case META_SIT:
174 /* get sit block addr */ 176 /* get sit block addr */
175 fio.blk_addr = current_sit_addr(sbi, 177 fio.new_blkaddr = current_sit_addr(sbi,
176 blkno * SIT_ENTRY_PER_BLOCK); 178 blkno * SIT_ENTRY_PER_BLOCK);
177 if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
178 goto out;
179 prev_blk_addr = fio.blk_addr;
180 break; 179 break;
181 case META_SSA: 180 case META_SSA:
182 case META_CP: 181 case META_CP:
183 case META_POR: 182 case META_POR:
184 fio.blk_addr = blkno; 183 fio.new_blkaddr = blkno;
185 break; 184 break;
186 default: 185 default:
187 BUG(); 186 BUG();
188 } 187 }
189 188
190 page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr); 189 page = grab_cache_page(META_MAPPING(sbi), fio.new_blkaddr);
191 if (!page) 190 if (!page)
192 continue; 191 continue;
193 if (PageUptodate(page)) { 192 if (PageUptodate(page)) {
@@ -196,11 +195,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
196 } 195 }
197 196
198 fio.page = page; 197 fio.page = page;
198 fio.old_blkaddr = fio.new_blkaddr;
199 f2fs_submit_page_mbio(&fio); 199 f2fs_submit_page_mbio(&fio);
200 f2fs_put_page(page, 0); 200 f2fs_put_page(page, 0);
201 } 201 }
202out: 202out:
203 f2fs_submit_merged_bio(sbi, META, READ); 203 f2fs_submit_merged_bio(sbi, META, READ);
204 blk_finish_plug(&plug);
204 return blkno - start; 205 return blkno - start;
205} 206}
206 207
@@ -232,13 +233,17 @@ static int f2fs_write_meta_page(struct page *page,
232 if (unlikely(f2fs_cp_error(sbi))) 233 if (unlikely(f2fs_cp_error(sbi)))
233 goto redirty_out; 234 goto redirty_out;
234 235
235 f2fs_wait_on_page_writeback(page, META);
236 write_meta_page(sbi, page); 236 write_meta_page(sbi, page);
237 dec_page_count(sbi, F2FS_DIRTY_META); 237 dec_page_count(sbi, F2FS_DIRTY_META);
238
239 if (wbc->for_reclaim)
240 f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE);
241
238 unlock_page(page); 242 unlock_page(page);
239 243
240 if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) 244 if (unlikely(f2fs_cp_error(sbi)))
241 f2fs_submit_merged_bio(sbi, META, WRITE); 245 f2fs_submit_merged_bio(sbi, META, WRITE);
246
242 return 0; 247 return 0;
243 248
244redirty_out: 249redirty_out:
@@ -252,13 +257,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
252 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); 257 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
253 long diff, written; 258 long diff, written;
254 259
255 trace_f2fs_writepages(mapping->host, wbc, META);
256
257 /* collect a number of dirty meta pages and write together */ 260 /* collect a number of dirty meta pages and write together */
258 if (wbc->for_kupdate || 261 if (wbc->for_kupdate ||
259 get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) 262 get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
260 goto skip_write; 263 goto skip_write;
261 264
265 trace_f2fs_writepages(mapping->host, wbc, META);
266
262 /* if mounting is failed, skip writing node pages */ 267 /* if mounting is failed, skip writing node pages */
263 mutex_lock(&sbi->cp_mutex); 268 mutex_lock(&sbi->cp_mutex);
264 diff = nr_pages_to_write(sbi, META, wbc); 269 diff = nr_pages_to_write(sbi, META, wbc);
@@ -269,6 +274,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
269 274
270skip_write: 275skip_write:
271 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); 276 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
277 trace_f2fs_writepages(mapping->host, wbc, META);
272 return 0; 278 return 0;
273} 279}
274 280
@@ -276,15 +282,18 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
276 long nr_to_write) 282 long nr_to_write)
277{ 283{
278 struct address_space *mapping = META_MAPPING(sbi); 284 struct address_space *mapping = META_MAPPING(sbi);
279 pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; 285 pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX;
280 struct pagevec pvec; 286 struct pagevec pvec;
281 long nwritten = 0; 287 long nwritten = 0;
282 struct writeback_control wbc = { 288 struct writeback_control wbc = {
283 .for_reclaim = 0, 289 .for_reclaim = 0,
284 }; 290 };
291 struct blk_plug plug;
285 292
286 pagevec_init(&pvec, 0); 293 pagevec_init(&pvec, 0);
287 294
295 blk_start_plug(&plug);
296
288 while (index <= end) { 297 while (index <= end) {
289 int i, nr_pages; 298 int i, nr_pages;
290 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 299 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
@@ -296,7 +305,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
296 for (i = 0; i < nr_pages; i++) { 305 for (i = 0; i < nr_pages; i++) {
297 struct page *page = pvec.pages[i]; 306 struct page *page = pvec.pages[i];
298 307
299 if (prev == LONG_MAX) 308 if (prev == ULONG_MAX)
300 prev = page->index - 1; 309 prev = page->index - 1;
301 if (nr_to_write != LONG_MAX && page->index != prev + 1) { 310 if (nr_to_write != LONG_MAX && page->index != prev + 1) {
302 pagevec_release(&pvec); 311 pagevec_release(&pvec);
@@ -315,6 +324,9 @@ continue_unlock:
315 goto continue_unlock; 324 goto continue_unlock;
316 } 325 }
317 326
327 f2fs_wait_on_page_writeback(page, META, true);
328
329 BUG_ON(PageWriteback(page));
318 if (!clear_page_dirty_for_io(page)) 330 if (!clear_page_dirty_for_io(page))
319 goto continue_unlock; 331 goto continue_unlock;
320 332
@@ -334,6 +346,8 @@ stop:
334 if (nwritten) 346 if (nwritten)
335 f2fs_submit_merged_bio(sbi, type, WRITE); 347 f2fs_submit_merged_bio(sbi, type, WRITE);
336 348
349 blk_finish_plug(&plug);
350
337 return nwritten; 351 return nwritten;
338} 352}
339 353
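
Both ra_meta_pages() and sync_meta_pages() now bracket their submission loops with a block plug, so the many small meta I/Os are queued per-task and handed to the block layer in one batch where they can be merged. The pattern as used above:

	struct blk_plug plug;

	blk_start_plug(&plug);
	/* ...loop issuing many small bios for meta pages... */
	blk_finish_plug(&plug);	/* flush the batched bios to the device */
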
@@ -621,7 +635,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
621 goto invalid_cp1; 635 goto invalid_cp1;
622 636
623 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); 637 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
624 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 638 if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
625 goto invalid_cp1; 639 goto invalid_cp1;
626 640
627 pre_version = cur_cp_version(cp_block); 641 pre_version = cur_cp_version(cp_block);
@@ -636,7 +650,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
636 goto invalid_cp2; 650 goto invalid_cp2;
637 651
638 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); 652 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
639 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 653 if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
640 goto invalid_cp2; 654 goto invalid_cp2;
641 655
642 cur_version = cur_cp_version(cp_block); 656 cur_version = cur_cp_version(cp_block);
@@ -696,6 +710,10 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
696 cp_block = (struct f2fs_checkpoint *)page_address(cur_page); 710 cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
697 memcpy(sbi->ckpt, cp_block, blk_size); 711 memcpy(sbi->ckpt, cp_block, blk_size);
698 712
713 /* Sanity checking of checkpoint */
714 if (sanity_check_ckpt(sbi))
715 goto fail_no_cp;
716
699 if (cp_blks <= 1) 717 if (cp_blks <= 1)
700 goto done; 718 goto done;
701 719
@@ -902,7 +920,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
902 if (!get_pages(sbi, F2FS_WRITEBACK)) 920 if (!get_pages(sbi, F2FS_WRITEBACK))
903 break; 921 break;
904 922
905 io_schedule(); 923 io_schedule_timeout(5*HZ);
906 } 924 }
907 finish_wait(&sbi->cp_wait, &wait); 925 finish_wait(&sbi->cp_wait, &wait);
908} 926}
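Review note: switching io_schedule() to io_schedule_timeout(5*HZ) bounds each sleep, so the checkpoint waiter re-checks the writeback count even if a wakeup is lost. The surrounding loop presumably follows the canonical wait pattern (a sketch, not the full function):

	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (!get_pages(sbi, F2FS_WRITEBACK))
			break;
		io_schedule_timeout(5 * HZ);	/* bounded: recheck after 5s at worst */
	}
	finish_wait(&sbi->cp_wait, &wait);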
@@ -921,6 +939,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
921 int cp_payload_blks = __cp_payload(sbi); 939 int cp_payload_blks = __cp_payload(sbi);
922 block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); 940 block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg);
923 bool invalidate = false; 941 bool invalidate = false;
942 struct super_block *sb = sbi->sb;
943 struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
944 u64 kbytes_written;
924 945
925 /* 946 /*
926 * This avoids conducting wrong roll-forward operations and uses 947 * This avoids conducting wrong roll-forward operations and uses
@@ -1008,7 +1029,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1008 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); 1029 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
1009 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); 1030 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
1010 1031
1011 crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); 1032 crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset));
1012 *((__le32 *)((unsigned char *)ckpt + 1033 *((__le32 *)((unsigned char *)ckpt +
1013 le32_to_cpu(ckpt->checksum_offset))) 1034 le32_to_cpu(ckpt->checksum_offset)))
1014 = cpu_to_le32(crc32); 1035 = cpu_to_le32(crc32);
@@ -1034,6 +1055,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1034 1055
1035 write_data_summaries(sbi, start_blk); 1056 write_data_summaries(sbi, start_blk);
1036 start_blk += data_sum_blocks; 1057 start_blk += data_sum_blocks;
1058
1059 /* Record write statistics in the hot node summary */
1060 kbytes_written = sbi->kbytes_written;
1061 if (sb->s_bdev->bd_part)
1062 kbytes_written += BD_PART_WRITTEN(sbi);
1063
1064 seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);
1065
1037 if (__remain_node_summaries(cpc->reason)) { 1066 if (__remain_node_summaries(cpc->reason)) {
1038 write_node_summaries(sbi, start_blk); 1067 write_node_summaries(sbi, start_blk);
1039 start_blk += NR_CURSEG_NODE_TYPE; 1068 start_blk += NR_CURSEG_NODE_TYPE;
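Review note: the checkpoint now folds the block device's lifetime write count into the hot node summary journal, so cumulative kilobytes written survive remounts. BD_PART_WRITTEN() is presumably built on part_stat_read(); a hedged sketch of the conversion (512-byte sectors to KB):

	/* assumption: BD_PART_WRITTEN() reduces to something like this */
	u64 bd_kbytes_written(struct super_block *sb)
	{
		return (u64)part_stat_read(sb->s_bdev->bd_part,
					   sectors[WRITE]) >> 1;
	}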
@@ -1048,8 +1077,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1048 if (unlikely(f2fs_cp_error(sbi))) 1077 if (unlikely(f2fs_cp_error(sbi)))
1049 return -EIO; 1078 return -EIO;
1050 1079
1051 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); 1080 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
1052 filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); 1081 filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
1053 1082
1054 /* update user_block_counts */ 1083 /* update user_block_counts */
1055 sbi->last_valid_block_count = sbi->total_valid_block_count; 1084 sbi->last_valid_block_count = sbi->total_valid_block_count;
@@ -1112,9 +1141,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1112 1141
1113 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); 1142 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
1114 1143
1115 f2fs_submit_merged_bio(sbi, DATA, WRITE); 1144 f2fs_flush_merged_bios(sbi);
1116 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1117 f2fs_submit_merged_bio(sbi, META, WRITE);
1118 1145
1119 /* 1146 /*
1120 * update checkpoint pack index 1147 * update checkpoint pack index
diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c
deleted file mode 100644
index 4a62ef14e932..000000000000
--- a/fs/f2fs/crypto.c
+++ /dev/null
@@ -1,491 +0,0 @@
1/*
2 * linux/fs/f2fs/crypto.c
3 *
4 * Copied from linux/fs/ext4/crypto.c
5 *
6 * Copyright (C) 2015, Google, Inc.
7 * Copyright (C) 2015, Motorola Mobility
8 *
9 * This contains encryption functions for f2fs
10 *
11 * Written by Michael Halcrow, 2014.
12 *
13 * Filename encryption additions
14 * Uday Savagaonkar, 2014
15 * Encryption policy handling additions
16 * Ildar Muslukhov, 2014
17 * Remove ext4_encrypted_zeroout(),
18 * add f2fs_restore_and_release_control_page()
19 * Jaegeuk Kim, 2015.
20 *
21 * This has not yet undergone a rigorous security audit.
22 *
23 * The usage of AES-XTS should conform to recommendations in NIST
24 * Special Publication 800-38E and IEEE P1619/D16.
25 */
26#include <crypto/hash.h>
27#include <crypto/sha.h>
28#include <keys/user-type.h>
29#include <keys/encrypted-type.h>
30#include <linux/crypto.h>
31#include <linux/ecryptfs.h>
32#include <linux/gfp.h>
33#include <linux/kernel.h>
34#include <linux/key.h>
35#include <linux/list.h>
36#include <linux/mempool.h>
37#include <linux/module.h>
38#include <linux/mutex.h>
39#include <linux/random.h>
40#include <linux/scatterlist.h>
41#include <linux/spinlock_types.h>
42#include <linux/f2fs_fs.h>
43#include <linux/ratelimit.h>
44#include <linux/bio.h>
45
46#include "f2fs.h"
47#include "xattr.h"
48
49/* Encryption added and removed here! (L: */
50
51static unsigned int num_prealloc_crypto_pages = 32;
52static unsigned int num_prealloc_crypto_ctxs = 128;
53
54module_param(num_prealloc_crypto_pages, uint, 0444);
55MODULE_PARM_DESC(num_prealloc_crypto_pages,
56 "Number of crypto pages to preallocate");
57module_param(num_prealloc_crypto_ctxs, uint, 0444);
58MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
59 "Number of crypto contexts to preallocate");
60
61static mempool_t *f2fs_bounce_page_pool;
62
63static LIST_HEAD(f2fs_free_crypto_ctxs);
64static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock);
65
66static struct workqueue_struct *f2fs_read_workqueue;
67static DEFINE_MUTEX(crypto_init);
68
69static struct kmem_cache *f2fs_crypto_ctx_cachep;
70struct kmem_cache *f2fs_crypt_info_cachep;
71
72/**
73 * f2fs_release_crypto_ctx() - Releases an encryption context
74 * @ctx: The encryption context to release.
75 *
76 * If the encryption context was allocated from the pre-allocated pool, returns
77 * it to that pool. Else, frees it.
78 *
79 * If there's a bounce page in the context, this frees that.
80 */
81void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx)
82{
83 unsigned long flags;
84
85 if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) {
86 mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool);
87 ctx->w.bounce_page = NULL;
88 }
89 ctx->w.control_page = NULL;
90 if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
91 kmem_cache_free(f2fs_crypto_ctx_cachep, ctx);
92 } else {
93 spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
94 list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
95 spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
96 }
97}
98
99/**
100 * f2fs_get_crypto_ctx() - Gets an encryption context
101 * @inode: The inode for which we are doing the crypto
102 *
103 * Allocates and initializes an encryption context.
104 *
105 * Return: An allocated and initialized encryption context on success; error
106 * value or NULL otherwise.
107 */
108struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode)
109{
110 struct f2fs_crypto_ctx *ctx = NULL;
111 unsigned long flags;
112 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
113
114 if (ci == NULL)
115 return ERR_PTR(-ENOKEY);
116
117 /*
118 * We first try getting the ctx from a free list because in
119 * the common case the ctx will have an allocated and
120 * initialized crypto tfm, so it's probably a worthwhile
121 * optimization. For the bounce page, we first try getting it
122 * from the kernel allocator because that's just about as fast
123 * as getting it from a list and because a cache of free pages
124 * should generally be a "last resort" option for a filesystem
125 * to be able to do its job.
126 */
127 spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
128 ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs,
129 struct f2fs_crypto_ctx, free_list);
130 if (ctx)
131 list_del(&ctx->free_list);
132 spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
133 if (!ctx) {
134 ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS);
135 if (!ctx)
136 return ERR_PTR(-ENOMEM);
137 ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
138 } else {
139 ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
140 }
141 ctx->flags &= ~F2FS_WRITE_PATH_FL;
142 return ctx;
143}
144
145/*
146 * Call f2fs_decrypt on every single page, reusing the encryption
147 * context.
148 */
149static void completion_pages(struct work_struct *work)
150{
151 struct f2fs_crypto_ctx *ctx =
152 container_of(work, struct f2fs_crypto_ctx, r.work);
153 struct bio *bio = ctx->r.bio;
154 struct bio_vec *bv;
155 int i;
156
157 bio_for_each_segment_all(bv, bio, i) {
158 struct page *page = bv->bv_page;
159 int ret = f2fs_decrypt(ctx, page);
160
161 if (ret) {
162 WARN_ON_ONCE(1);
163 SetPageError(page);
164 } else
165 SetPageUptodate(page);
166 unlock_page(page);
167 }
168 f2fs_release_crypto_ctx(ctx);
169 bio_put(bio);
170}
171
172void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio)
173{
174 INIT_WORK(&ctx->r.work, completion_pages);
175 ctx->r.bio = bio;
176 queue_work(f2fs_read_workqueue, &ctx->r.work);
177}
178
179static void f2fs_crypto_destroy(void)
180{
181 struct f2fs_crypto_ctx *pos, *n;
182
183 list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list)
184 kmem_cache_free(f2fs_crypto_ctx_cachep, pos);
185 INIT_LIST_HEAD(&f2fs_free_crypto_ctxs);
186 if (f2fs_bounce_page_pool)
187 mempool_destroy(f2fs_bounce_page_pool);
188 f2fs_bounce_page_pool = NULL;
189}
190
191/**
192 * f2fs_crypto_initialize() - Set up for f2fs encryption.
193 *
194 * We only call this when we start accessing encrypted files, since it
195 * results in memory getting allocated that wouldn't otherwise be used.
196 *
197 * Return: Zero on success, non-zero otherwise.
198 */
199int f2fs_crypto_initialize(void)
200{
201 int i, res = -ENOMEM;
202
203 if (f2fs_bounce_page_pool)
204 return 0;
205
206 mutex_lock(&crypto_init);
207 if (f2fs_bounce_page_pool)
208 goto already_initialized;
209
210 for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
211 struct f2fs_crypto_ctx *ctx;
212
213 ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL);
214 if (!ctx)
215 goto fail;
216 list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
217 }
218
219 /* must be allocated at the last step to avoid race condition above */
220 f2fs_bounce_page_pool =
221 mempool_create_page_pool(num_prealloc_crypto_pages, 0);
222 if (!f2fs_bounce_page_pool)
223 goto fail;
224
225already_initialized:
226 mutex_unlock(&crypto_init);
227 return 0;
228fail:
229 f2fs_crypto_destroy();
230 mutex_unlock(&crypto_init);
231 return res;
232}
233
234/**
235 * f2fs_exit_crypto() - Shutdown the f2fs encryption system
236 */
237void f2fs_exit_crypto(void)
238{
239 f2fs_crypto_destroy();
240
241 if (f2fs_read_workqueue)
242 destroy_workqueue(f2fs_read_workqueue);
243 if (f2fs_crypto_ctx_cachep)
244 kmem_cache_destroy(f2fs_crypto_ctx_cachep);
245 if (f2fs_crypt_info_cachep)
246 kmem_cache_destroy(f2fs_crypt_info_cachep);
247}
248
249int __init f2fs_init_crypto(void)
250{
251 int res = -ENOMEM;
252
253 f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0);
254 if (!f2fs_read_workqueue)
255 goto fail;
256
257 f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx,
258 SLAB_RECLAIM_ACCOUNT);
259 if (!f2fs_crypto_ctx_cachep)
260 goto fail;
261
262 f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info,
263 SLAB_RECLAIM_ACCOUNT);
264 if (!f2fs_crypt_info_cachep)
265 goto fail;
266
267 return 0;
268fail:
269 f2fs_exit_crypto();
270 return res;
271}
272
273void f2fs_restore_and_release_control_page(struct page **page)
274{
275 struct f2fs_crypto_ctx *ctx;
276 struct page *bounce_page;
277
278 /* The bounce data pages are unmapped. */
279 if ((*page)->mapping)
280 return;
281
282 /* The bounce data page is unmapped. */
283 bounce_page = *page;
284 ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page);
285
286 /* restore control page */
287 *page = ctx->w.control_page;
288
289 f2fs_restore_control_page(bounce_page);
290}
291
292void f2fs_restore_control_page(struct page *data_page)
293{
294 struct f2fs_crypto_ctx *ctx =
295 (struct f2fs_crypto_ctx *)page_private(data_page);
296
297 set_page_private(data_page, (unsigned long)NULL);
298 ClearPagePrivate(data_page);
299 unlock_page(data_page);
300 f2fs_release_crypto_ctx(ctx);
301}
302
303/**
304 * f2fs_crypt_complete() - The completion callback for page encryption
305 * @req: The asynchronous encryption request context
306 * @res: The result of the encryption operation
307 */
308static void f2fs_crypt_complete(struct crypto_async_request *req, int res)
309{
310 struct f2fs_completion_result *ecr = req->data;
311
312 if (res == -EINPROGRESS)
313 return;
314 ecr->res = res;
315 complete(&ecr->completion);
316}
317
318typedef enum {
319 F2FS_DECRYPT = 0,
320 F2FS_ENCRYPT,
321} f2fs_direction_t;
322
323static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx,
324 struct inode *inode,
325 f2fs_direction_t rw,
326 pgoff_t index,
327 struct page *src_page,
328 struct page *dest_page)
329{
330 u8 xts_tweak[F2FS_XTS_TWEAK_SIZE];
331 struct ablkcipher_request *req = NULL;
332 DECLARE_F2FS_COMPLETION_RESULT(ecr);
333 struct scatterlist dst, src;
334 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
335 struct crypto_ablkcipher *tfm = ci->ci_ctfm;
336 int res = 0;
337
338 req = ablkcipher_request_alloc(tfm, GFP_NOFS);
339 if (!req) {
340 printk_ratelimited(KERN_ERR
341 "%s: crypto_request_alloc() failed\n",
342 __func__);
343 return -ENOMEM;
344 }
345 ablkcipher_request_set_callback(
346 req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
347 f2fs_crypt_complete, &ecr);
348
349 BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index));
350 memcpy(xts_tweak, &index, sizeof(index));
351 memset(&xts_tweak[sizeof(index)], 0,
352 F2FS_XTS_TWEAK_SIZE - sizeof(index));
353
354 sg_init_table(&dst, 1);
355 sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
356 sg_init_table(&src, 1);
357 sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
358 ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
359 xts_tweak);
360 if (rw == F2FS_DECRYPT)
361 res = crypto_ablkcipher_decrypt(req);
362 else
363 res = crypto_ablkcipher_encrypt(req);
364 if (res == -EINPROGRESS || res == -EBUSY) {
365 BUG_ON(req->base.data != &ecr);
366 wait_for_completion(&ecr.completion);
367 res = ecr.res;
368 }
369 ablkcipher_request_free(req);
370 if (res) {
371 printk_ratelimited(KERN_ERR
372 "%s: crypto_ablkcipher_encrypt() returned %d\n",
373 __func__, res);
374 return res;
375 }
376 return 0;
377}
378
379static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx)
380{
381 ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT);
382 if (ctx->w.bounce_page == NULL)
383 return ERR_PTR(-ENOMEM);
384 ctx->flags |= F2FS_WRITE_PATH_FL;
385 return ctx->w.bounce_page;
386}
387
388/**
389 * f2fs_encrypt() - Encrypts a page
390 * @inode: The inode for which the encryption should take place
391 * @plaintext_page: The page to encrypt. Must be locked.
392 *
393 * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
394 * encryption context.
395 *
396 * Called on the page write path. The caller must call
397 * f2fs_restore_control_page() on the returned ciphertext page to
398 * release the bounce buffer and the encryption context.
399 *
400 * Return: An allocated page with the encrypted content on success. Else, an
401 * error value or NULL.
402 */
403struct page *f2fs_encrypt(struct inode *inode,
404 struct page *plaintext_page)
405{
406 struct f2fs_crypto_ctx *ctx;
407 struct page *ciphertext_page = NULL;
408 int err;
409
410 BUG_ON(!PageLocked(plaintext_page));
411
412 ctx = f2fs_get_crypto_ctx(inode);
413 if (IS_ERR(ctx))
414 return (struct page *)ctx;
415
416 /* The encryption operation will require a bounce page. */
417 ciphertext_page = alloc_bounce_page(ctx);
418 if (IS_ERR(ciphertext_page))
419 goto err_out;
420
421 ctx->w.control_page = plaintext_page;
422 err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index,
423 plaintext_page, ciphertext_page);
424 if (err) {
425 ciphertext_page = ERR_PTR(err);
426 goto err_out;
427 }
428
429 SetPagePrivate(ciphertext_page);
430 set_page_private(ciphertext_page, (unsigned long)ctx);
431 lock_page(ciphertext_page);
432 return ciphertext_page;
433
434err_out:
435 f2fs_release_crypto_ctx(ctx);
436 return ciphertext_page;
437}
438
439/**
440 * f2fs_decrypt() - Decrypts a page in-place
441 * @ctx: The encryption context.
442 * @page: The page to decrypt. Must be locked.
443 *
444 * Decrypts page in-place using the ctx encryption context.
445 *
446 * Called from the read completion callback.
447 *
448 * Return: Zero on success, non-zero otherwise.
449 */
450int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page)
451{
452 BUG_ON(!PageLocked(page));
453
454 return f2fs_page_crypto(ctx, page->mapping->host,
455 F2FS_DECRYPT, page->index, page, page);
456}
457
458/*
459 * Convenience function which takes care of allocating and
460 * deallocating the encryption context
461 */
462int f2fs_decrypt_one(struct inode *inode, struct page *page)
463{
464 struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode);
465 int ret;
466
467 if (IS_ERR(ctx))
468 return PTR_ERR(ctx);
469 ret = f2fs_decrypt(ctx, page);
470 f2fs_release_crypto_ctx(ctx);
471 return ret;
472}
473
474bool f2fs_valid_contents_enc_mode(uint32_t mode)
475{
476 return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS);
477}
478
479/**
480 * f2fs_validate_encryption_key_size() - Validate the encryption key size
481 * @mode: The key mode.
482 * @size: The key size to validate.
483 *
484 * Return: The validated key size for @mode. Zero if invalid.
485 */
486uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size)
487{
488 if (size == f2fs_encryption_key_size(mode))
489 return size;
490 return 0;
491}
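Review note on the deletion above: f2fs's private crypto layer is replaced by the shared fscrypt code (see the fscrypt_* calls in the fs/f2fs/data.c hunks below). The core of the deleted f2fs_page_crypto() was the per-page XTS tweak; a stand-alone C restatement of that construction (user-space, for illustration only; the tweak size is assumed to be 16 bytes here):

	#include <stdint.h>
	#include <string.h>

	#define XTS_TWEAK_SIZE 16	/* assumed value of F2FS_XTS_TWEAK_SIZE */

	static void make_xts_tweak(uint8_t tweak[XTS_TWEAK_SIZE], uint64_t index)
	{
		memcpy(tweak, &index, sizeof(index));	/* page index in the low bytes */
		memset(tweak + sizeof(index), 0,
		       XTS_TWEAK_SIZE - sizeof(index));	/* zero-pad the rest */
	}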
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
deleted file mode 100644
index 5de2d866a25c..000000000000
--- a/fs/f2fs/crypto_key.c
+++ /dev/null
@@ -1,254 +0,0 @@
1/*
2 * linux/fs/f2fs/crypto_key.c
3 *
4 * Copied from linux/fs/ext4/crypto_key.c
5 *
6 * Copyright (C) 2015, Google, Inc.
7 *
8 * This contains encryption key functions for f2fs
9 *
10 * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
11 */
12#include <keys/encrypted-type.h>
13#include <keys/user-type.h>
14#include <linux/random.h>
15#include <linux/scatterlist.h>
16#include <uapi/linux/keyctl.h>
17#include <crypto/hash.h>
18#include <linux/f2fs_fs.h>
19
20#include "f2fs.h"
21#include "xattr.h"
22
23static void derive_crypt_complete(struct crypto_async_request *req, int rc)
24{
25 struct f2fs_completion_result *ecr = req->data;
26
27 if (rc == -EINPROGRESS)
28 return;
29
30 ecr->res = rc;
31 complete(&ecr->completion);
32}
33
34/**
35 * f2fs_derive_key_aes() - Derive a key using AES-128-ECB
36 * @deriving_key: Encryption key used for derivation.
37 * @source_key: Source key to which to apply derivation.
38 * @derived_key: Derived key.
39 *
40 * Return: Zero on success; non-zero otherwise.
41 */
42static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE],
43 char source_key[F2FS_AES_256_XTS_KEY_SIZE],
44 char derived_key[F2FS_AES_256_XTS_KEY_SIZE])
45{
46 int res = 0;
47 struct ablkcipher_request *req = NULL;
48 DECLARE_F2FS_COMPLETION_RESULT(ecr);
49 struct scatterlist src_sg, dst_sg;
50 struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
51 0);
52
53 if (IS_ERR(tfm)) {
54 res = PTR_ERR(tfm);
55 tfm = NULL;
56 goto out;
57 }
58 crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
59 req = ablkcipher_request_alloc(tfm, GFP_NOFS);
60 if (!req) {
61 res = -ENOMEM;
62 goto out;
63 }
64 ablkcipher_request_set_callback(req,
65 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
66 derive_crypt_complete, &ecr);
67 res = crypto_ablkcipher_setkey(tfm, deriving_key,
68 F2FS_AES_128_ECB_KEY_SIZE);
69 if (res < 0)
70 goto out;
71
72 sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE);
73 sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE);
74 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
75 F2FS_AES_256_XTS_KEY_SIZE, NULL);
76 res = crypto_ablkcipher_encrypt(req);
77 if (res == -EINPROGRESS || res == -EBUSY) {
78 BUG_ON(req->base.data != &ecr);
79 wait_for_completion(&ecr.completion);
80 res = ecr.res;
81 }
82out:
83 if (req)
84 ablkcipher_request_free(req);
85 if (tfm)
86 crypto_free_ablkcipher(tfm);
87 return res;
88}
89
90static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci)
91{
92 if (!ci)
93 return;
94
95 key_put(ci->ci_keyring_key);
96 crypto_free_ablkcipher(ci->ci_ctfm);
97 kmem_cache_free(f2fs_crypt_info_cachep, ci);
98}
99
100void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci)
101{
102 struct f2fs_inode_info *fi = F2FS_I(inode);
103 struct f2fs_crypt_info *prev;
104
105 if (ci == NULL)
106 ci = ACCESS_ONCE(fi->i_crypt_info);
107 if (ci == NULL)
108 return;
109 prev = cmpxchg(&fi->i_crypt_info, ci, NULL);
110 if (prev != ci)
111 return;
112
113 f2fs_free_crypt_info(ci);
114}
115
116int _f2fs_get_encryption_info(struct inode *inode)
117{
118 struct f2fs_inode_info *fi = F2FS_I(inode);
119 struct f2fs_crypt_info *crypt_info;
120 char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
121 (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
122 struct key *keyring_key = NULL;
123 struct f2fs_encryption_key *master_key;
124 struct f2fs_encryption_context ctx;
125 const struct user_key_payload *ukp;
126 struct crypto_ablkcipher *ctfm;
127 const char *cipher_str;
128 char raw_key[F2FS_MAX_KEY_SIZE];
129 char mode;
130 int res;
131
132 res = f2fs_crypto_initialize();
133 if (res)
134 return res;
135retry:
136 crypt_info = ACCESS_ONCE(fi->i_crypt_info);
137 if (crypt_info) {
138 if (!crypt_info->ci_keyring_key ||
139 key_validate(crypt_info->ci_keyring_key) == 0)
140 return 0;
141 f2fs_free_encryption_info(inode, crypt_info);
142 goto retry;
143 }
144
145 res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
146 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
147 &ctx, sizeof(ctx), NULL);
148 if (res < 0)
149 return res;
150 else if (res != sizeof(ctx))
151 return -EINVAL;
152 res = 0;
153
154 crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS);
155 if (!crypt_info)
156 return -ENOMEM;
157
158 crypt_info->ci_flags = ctx.flags;
159 crypt_info->ci_data_mode = ctx.contents_encryption_mode;
160 crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
161 crypt_info->ci_ctfm = NULL;
162 crypt_info->ci_keyring_key = NULL;
163 memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
164 sizeof(crypt_info->ci_master_key));
165 if (S_ISREG(inode->i_mode))
166 mode = crypt_info->ci_data_mode;
167 else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
168 mode = crypt_info->ci_filename_mode;
169 else
170 BUG();
171
172 switch (mode) {
173 case F2FS_ENCRYPTION_MODE_AES_256_XTS:
174 cipher_str = "xts(aes)";
175 break;
176 case F2FS_ENCRYPTION_MODE_AES_256_CTS:
177 cipher_str = "cts(cbc(aes))";
178 break;
179 default:
180 printk_once(KERN_WARNING
181 "f2fs: unsupported key mode %d (ino %u)\n",
182 mode, (unsigned) inode->i_ino);
183 res = -ENOKEY;
184 goto out;
185 }
186
187 memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX,
188 F2FS_KEY_DESC_PREFIX_SIZE);
189 sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE,
190 "%*phN", F2FS_KEY_DESCRIPTOR_SIZE,
191 ctx.master_key_descriptor);
192 full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
193 (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0';
194 keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
195 if (IS_ERR(keyring_key)) {
196 res = PTR_ERR(keyring_key);
197 keyring_key = NULL;
198 goto out;
199 }
200 crypt_info->ci_keyring_key = keyring_key;
201 BUG_ON(keyring_key->type != &key_type_logon);
202 ukp = user_key_payload(keyring_key);
203 if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
204 res = -EINVAL;
205 goto out;
206 }
207 master_key = (struct f2fs_encryption_key *)ukp->data;
208 BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE !=
209 F2FS_KEY_DERIVATION_NONCE_SIZE);
210 BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE);
211 res = f2fs_derive_key_aes(ctx.nonce, master_key->raw,
212 raw_key);
213 if (res)
214 goto out;
215
216 ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
217 if (!ctfm || IS_ERR(ctfm)) {
218 res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
219 printk(KERN_DEBUG
220 "%s: error %d (inode %u) allocating crypto tfm\n",
221 __func__, res, (unsigned) inode->i_ino);
222 goto out;
223 }
224 crypt_info->ci_ctfm = ctfm;
225 crypto_ablkcipher_clear_flags(ctfm, ~0);
226 crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
227 CRYPTO_TFM_REQ_WEAK_KEY);
228 res = crypto_ablkcipher_setkey(ctfm, raw_key,
229 f2fs_encryption_key_size(mode));
230 if (res)
231 goto out;
232
233 memzero_explicit(raw_key, sizeof(raw_key));
234 if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) {
235 f2fs_free_crypt_info(crypt_info);
236 goto retry;
237 }
238 return 0;
239
240out:
241 if (res == -ENOKEY && !S_ISREG(inode->i_mode))
242 res = 0;
243
244 f2fs_free_crypt_info(crypt_info);
245 memzero_explicit(raw_key, sizeof(raw_key));
246 return res;
247}
248
249int f2fs_has_encryption_key(struct inode *inode)
250{
251 struct f2fs_inode_info *fi = F2FS_I(inode);
252
253 return (fi->i_crypt_info != NULL);
254}
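Review note: the deleted f2fs_derive_key_aes() derives the per-file key by encrypting the 64-byte master key with AES-128-ECB, keyed by the inode's 16-byte nonce. A hedged user-space restatement using OpenSSL's EVP API (illustration only; the kernel path above uses the in-kernel crypto API directly):

	#include <openssl/evp.h>

	static int derive_key_aes(const unsigned char nonce[16],
				  const unsigned char master[64],
				  unsigned char derived[64])
	{
		EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
		int outl = 0, ok = ctx != NULL;

		/* nonce acts as the AES-128-ECB key, as in the code above */
		ok = ok && EVP_EncryptInit_ex(ctx, EVP_aes_128_ecb(), NULL, nonce, NULL);
		ok = ok && EVP_CIPHER_CTX_set_padding(ctx, 0);	/* raw blocks, no padding */
		ok = ok && EVP_EncryptUpdate(ctx, derived, &outl, master, 64);
		EVP_CIPHER_CTX_free(ctx);
		return ok ? 0 : -1;
	}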
diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c
deleted file mode 100644
index d4a96af513c2..000000000000
--- a/fs/f2fs/crypto_policy.c
+++ /dev/null
@@ -1,209 +0,0 @@
1/*
2 * copied from linux/fs/ext4/crypto_policy.c
3 *
4 * Copyright (C) 2015, Google, Inc.
5 * Copyright (C) 2015, Motorola Mobility.
6 *
7 * This contains encryption policy functions for f2fs with some modifications
8 * to support f2fs-specific xattr APIs.
9 *
10 * Written by Michael Halcrow, 2015.
11 * Modified by Jaegeuk Kim, 2015.
12 */
13#include <linux/random.h>
14#include <linux/string.h>
15#include <linux/types.h>
16#include <linux/f2fs_fs.h>
17
18#include "f2fs.h"
19#include "xattr.h"
20
21static int f2fs_inode_has_encryption_context(struct inode *inode)
22{
23 int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
24 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL);
25 return (res > 0);
26}
27
28/*
29 * check whether the policy is consistent with the encryption context
30 * for the inode
31 */
32static int f2fs_is_encryption_context_consistent_with_policy(
33 struct inode *inode, const struct f2fs_encryption_policy *policy)
34{
35 struct f2fs_encryption_context ctx;
36 int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
37 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
38 sizeof(ctx), NULL);
39
40 if (res != sizeof(ctx))
41 return 0;
42
43 return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
44 F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
45 (ctx.flags == policy->flags) &&
46 (ctx.contents_encryption_mode ==
47 policy->contents_encryption_mode) &&
48 (ctx.filenames_encryption_mode ==
49 policy->filenames_encryption_mode));
50}
51
52static int f2fs_create_encryption_context_from_policy(
53 struct inode *inode, const struct f2fs_encryption_policy *policy)
54{
55 struct f2fs_encryption_context ctx;
56
57 ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
58 memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
59 F2FS_KEY_DESCRIPTOR_SIZE);
60
61 if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) {
62 printk(KERN_WARNING
63 "%s: Invalid contents encryption mode %d\n", __func__,
64 policy->contents_encryption_mode);
65 return -EINVAL;
66 }
67
68 if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
69 printk(KERN_WARNING
70 "%s: Invalid filenames encryption mode %d\n", __func__,
71 policy->filenames_encryption_mode);
72 return -EINVAL;
73 }
74
75 if (policy->flags & ~F2FS_POLICY_FLAGS_VALID)
76 return -EINVAL;
77
78 ctx.contents_encryption_mode = policy->contents_encryption_mode;
79 ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
80 ctx.flags = policy->flags;
81 BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE);
82 get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
83
84 return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
85 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
86 sizeof(ctx), NULL, XATTR_CREATE);
87}
88
89int f2fs_process_policy(const struct f2fs_encryption_policy *policy,
90 struct inode *inode)
91{
92 if (policy->version != 0)
93 return -EINVAL;
94
95 if (!S_ISDIR(inode->i_mode))
96 return -EINVAL;
97
98 if (!f2fs_inode_has_encryption_context(inode)) {
99 if (!f2fs_empty_dir(inode))
100 return -ENOTEMPTY;
101 return f2fs_create_encryption_context_from_policy(inode,
102 policy);
103 }
104
105 if (f2fs_is_encryption_context_consistent_with_policy(inode, policy))
106 return 0;
107
108 printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
109 __func__);
110 return -EINVAL;
111}
112
113int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy)
114{
115 struct f2fs_encryption_context ctx;
116 int res;
117
118 if (!f2fs_encrypted_inode(inode))
119 return -ENODATA;
120
121 res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
122 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
123 &ctx, sizeof(ctx), NULL);
124 if (res != sizeof(ctx))
125 return -ENODATA;
126 if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1)
127 return -EINVAL;
128
129 policy->version = 0;
130 policy->contents_encryption_mode = ctx.contents_encryption_mode;
131 policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
132 policy->flags = ctx.flags;
133 memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
134 F2FS_KEY_DESCRIPTOR_SIZE);
135 return 0;
136}
137
138int f2fs_is_child_context_consistent_with_parent(struct inode *parent,
139 struct inode *child)
140{
141 struct f2fs_crypt_info *parent_ci, *child_ci;
142 int res;
143
144 if ((parent == NULL) || (child == NULL)) {
145 pr_err("parent %p child %p\n", parent, child);
146 BUG_ON(1);
147 }
148
149 /* no restrictions if the parent directory is not encrypted */
150 if (!f2fs_encrypted_inode(parent))
151 return 1;
152 /* if the child directory is not encrypted, this is always a problem */
153 if (!f2fs_encrypted_inode(child))
154 return 0;
155 res = f2fs_get_encryption_info(parent);
156 if (res)
157 return 0;
158 res = f2fs_get_encryption_info(child);
159 if (res)
160 return 0;
161 parent_ci = F2FS_I(parent)->i_crypt_info;
162 child_ci = F2FS_I(child)->i_crypt_info;
163 if (!parent_ci && !child_ci)
164 return 1;
165 if (!parent_ci || !child_ci)
166 return 0;
167
168 return (memcmp(parent_ci->ci_master_key,
169 child_ci->ci_master_key,
170 F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
171 (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
172 (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
173 (parent_ci->ci_flags == child_ci->ci_flags));
174}
175
176/**
177 * f2fs_inherit_context() - Sets a child context from its parent
178 * @parent: Parent inode from which the context is inherited.
179 * @child: Child inode that inherits the context from @parent.
180 *
181 * Return: Zero on success, non-zero otherwise
182 */
183int f2fs_inherit_context(struct inode *parent, struct inode *child,
184 struct page *ipage)
185{
186 struct f2fs_encryption_context ctx;
187 struct f2fs_crypt_info *ci;
188 int res;
189
190 res = f2fs_get_encryption_info(parent);
191 if (res < 0)
192 return res;
193
194 ci = F2FS_I(parent)->i_crypt_info;
195 BUG_ON(ci == NULL);
196
197 ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
198
199 ctx.contents_encryption_mode = ci->ci_data_mode;
200 ctx.filenames_encryption_mode = ci->ci_filename_mode;
201 ctx.flags = ci->ci_flags;
202 memcpy(ctx.master_key_descriptor, ci->ci_master_key,
203 F2FS_KEY_DESCRIPTOR_SIZE);
204
205 get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
206 return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION,
207 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
208 sizeof(ctx), ipage, XATTR_CREATE);
209}
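Review note: all three deleted policy helpers round-trip one xattr payload. Its layout, reconstructed from the field accesses above (the field widths are an assumption for illustration, not copied from f2fs_fs.h):

	struct f2fs_encryption_context {
		uint8_t format;				/* F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 */
		uint8_t contents_encryption_mode;	/* e.g. F2FS_ENCRYPTION_MODE_AES_256_XTS */
		uint8_t filenames_encryption_mode;	/* e.g. F2FS_ENCRYPTION_MODE_AES_256_CTS */
		uint8_t flags;				/* masked by F2FS_POLICY_FLAGS_VALID */
		uint8_t master_key_descriptor[8];	/* F2FS_KEY_DESCRIPTOR_SIZE, assumed 8 */
		uint8_t nonce[16];			/* F2FS_KEY_DERIVATION_NONCE_SIZE, assumed 16 */
	};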
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 5c06db17e41f..e5c762b37239 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -34,9 +34,9 @@ static void f2fs_read_end_io(struct bio *bio)
34 34
35 if (f2fs_bio_encrypted(bio)) { 35 if (f2fs_bio_encrypted(bio)) {
36 if (bio->bi_error) { 36 if (bio->bi_error) {
37 f2fs_release_crypto_ctx(bio->bi_private); 37 fscrypt_release_ctx(bio->bi_private);
38 } else { 38 } else {
39 f2fs_end_io_crypto_work(bio->bi_private, bio); 39 fscrypt_decrypt_bio_pages(bio->bi_private, bio);
40 return; 40 return;
41 } 41 }
42 } 42 }
@@ -64,10 +64,9 @@ static void f2fs_write_end_io(struct bio *bio)
64 bio_for_each_segment_all(bvec, bio, i) { 64 bio_for_each_segment_all(bvec, bio, i) {
65 struct page *page = bvec->bv_page; 65 struct page *page = bvec->bv_page;
66 66
67 f2fs_restore_and_release_control_page(&page); 67 fscrypt_pullback_bio_page(&page, true);
68 68
69 if (unlikely(bio->bi_error)) { 69 if (unlikely(bio->bi_error)) {
70 set_page_dirty(page);
71 set_bit(AS_EIO, &page->mapping->flags); 70 set_bit(AS_EIO, &page->mapping->flags);
72 f2fs_stop_checkpoint(sbi); 71 f2fs_stop_checkpoint(sbi);
73 } 72 }
@@ -75,8 +74,7 @@ static void f2fs_write_end_io(struct bio *bio)
75 dec_page_count(sbi, F2FS_WRITEBACK); 74 dec_page_count(sbi, F2FS_WRITEBACK);
76 } 75 }
77 76
78 if (!get_pages(sbi, F2FS_WRITEBACK) && 77 if (!get_pages(sbi, F2FS_WRITEBACK) && wq_has_sleeper(&sbi->cp_wait))
79 !list_empty(&sbi->cp_wait.task_list))
80 wake_up(&sbi->cp_wait); 78 wake_up(&sbi->cp_wait);
81 79
82 bio_put(bio); 80 bio_put(bio);
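Review note: replacing the open-coded !list_empty(&sbi->cp_wait.task_list) test with wq_has_sleeper() is not just cosmetic; the helper issues the memory barrier that pairs with the waiter's prepare_to_wait(). Roughly what the helper does (paraphrased from memory of include/linux/wait.h of this era, so treat as a sketch):

	static inline bool wq_has_sleeper(wait_queue_head_t *wq)
	{
		smp_mb();	/* order the writer's state change vs. the waiter's check */
		return waitqueue_active(wq);
	}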
@@ -116,8 +114,54 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
116 io->bio = NULL; 114 io->bio = NULL;
117} 115}
118 116
119void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, 117static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
120 enum page_type type, int rw) 118 struct page *page, nid_t ino)
119{
120 struct bio_vec *bvec;
121 struct page *target;
122 int i;
123
124 if (!io->bio)
125 return false;
126
127 if (!inode && !page && !ino)
128 return true;
129
130 bio_for_each_segment_all(bvec, io->bio, i) {
131
132 if (bvec->bv_page->mapping)
133 target = bvec->bv_page;
134 else
135 target = fscrypt_control_page(bvec->bv_page);
136
137 if (inode && inode == target->mapping->host)
138 return true;
139 if (page && page == target)
140 return true;
141 if (ino && ino == ino_of_node(target))
142 return true;
143 }
144
145 return false;
146}
147
148static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
149 struct page *page, nid_t ino,
150 enum page_type type)
151{
152 enum page_type btype = PAGE_TYPE_OF_BIO(type);
153 struct f2fs_bio_info *io = &sbi->write_io[btype];
154 bool ret;
155
156 down_read(&io->io_rwsem);
157 ret = __has_merged_page(io, inode, page, ino);
158 up_read(&io->io_rwsem);
159 return ret;
160}
161
162static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
163 struct inode *inode, struct page *page,
164 nid_t ino, enum page_type type, int rw)
121{ 165{
122 enum page_type btype = PAGE_TYPE_OF_BIO(type); 166 enum page_type btype = PAGE_TYPE_OF_BIO(type);
123 struct f2fs_bio_info *io; 167 struct f2fs_bio_info *io;
@@ -126,6 +170,9 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
126 170
127 down_write(&io->io_rwsem); 171 down_write(&io->io_rwsem);
128 172
173 if (!__has_merged_page(io, inode, page, ino))
174 goto out;
175
129 /* change META to META_FLUSH in the checkpoint procedure */ 176 /* change META to META_FLUSH in the checkpoint procedure */
130 if (type >= META_FLUSH) { 177 if (type >= META_FLUSH) {
131 io->fio.type = META_FLUSH; 178 io->fio.type = META_FLUSH;
@@ -135,9 +182,31 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
135 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; 182 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
136 } 183 }
137 __submit_merged_bio(io); 184 __submit_merged_bio(io);
185out:
138 up_write(&io->io_rwsem); 186 up_write(&io->io_rwsem);
139} 187}
140 188
189void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type,
190 int rw)
191{
192 __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw);
193}
194
195void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi,
196 struct inode *inode, struct page *page,
197 nid_t ino, enum page_type type, int rw)
198{
199 if (has_merged_page(sbi, inode, page, ino, type))
200 __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw);
201}
202
203void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi)
204{
205 f2fs_submit_merged_bio(sbi, DATA, WRITE);
206 f2fs_submit_merged_bio(sbi, NODE, WRITE);
207 f2fs_submit_merged_bio(sbi, META, WRITE);
208}
209
141/* 210/*
142 * Fill the locked page with data located in the block address. 211 * Fill the locked page with data located in the block address.
143 * Return unlocked page. 212 * Return unlocked page.
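Review note: f2fs_submit_merged_bio_cond() peeks under the read lock via has_merged_page(), and __f2fs_submit_merged_bio() re-checks under the write lock, since the bio can change between the two acquisitions. The shape of the pattern, condensed from the hunks above:

	down_read(&io->io_rwsem);
	found = __has_merged_page(io, inode, page, ino);
	up_read(&io->io_rwsem);

	if (found) {
		down_write(&io->io_rwsem);
		/* re-check: another writer may have submitted it meanwhile */
		if (__has_merged_page(io, inode, page, ino))
			__submit_merged_bio(io);
		up_write(&io->io_rwsem);
	}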
@@ -145,13 +214,14 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
145int f2fs_submit_page_bio(struct f2fs_io_info *fio) 214int f2fs_submit_page_bio(struct f2fs_io_info *fio)
146{ 215{
147 struct bio *bio; 216 struct bio *bio;
148 struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; 217 struct page *page = fio->encrypted_page ?
218 fio->encrypted_page : fio->page;
149 219
150 trace_f2fs_submit_page_bio(page, fio); 220 trace_f2fs_submit_page_bio(page, fio);
151 f2fs_trace_ios(fio, 0); 221 f2fs_trace_ios(fio, 0);
152 222
153 /* Allocate a new bio */ 223 /* Allocate a new bio */
154 bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw)); 224 bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw));
155 225
156 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 226 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
157 bio_put(bio); 227 bio_put(bio);
@@ -172,21 +242,24 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
172 242
173 io = is_read ? &sbi->read_io : &sbi->write_io[btype]; 243 io = is_read ? &sbi->read_io : &sbi->write_io[btype];
174 244
175 verify_block_addr(sbi, fio->blk_addr); 245 if (fio->old_blkaddr != NEW_ADDR)
246 verify_block_addr(sbi, fio->old_blkaddr);
247 verify_block_addr(sbi, fio->new_blkaddr);
176 248
177 down_write(&io->io_rwsem); 249 down_write(&io->io_rwsem);
178 250
179 if (!is_read) 251 if (!is_read)
180 inc_page_count(sbi, F2FS_WRITEBACK); 252 inc_page_count(sbi, F2FS_WRITEBACK);
181 253
182 if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 || 254 if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
183 io->fio.rw != fio->rw)) 255 io->fio.rw != fio->rw))
184 __submit_merged_bio(io); 256 __submit_merged_bio(io);
185alloc_new: 257alloc_new:
186 if (io->bio == NULL) { 258 if (io->bio == NULL) {
187 int bio_blocks = MAX_BIO_BLOCKS(sbi); 259 int bio_blocks = MAX_BIO_BLOCKS(sbi);
188 260
189 io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read); 261 io->bio = __bio_alloc(sbi, fio->new_blkaddr,
262 bio_blocks, is_read);
190 io->fio = *fio; 263 io->fio = *fio;
191 } 264 }
192 265
@@ -198,7 +271,7 @@ alloc_new:
198 goto alloc_new; 271 goto alloc_new;
199 } 272 }
200 273
201 io->last_block_in_bio = fio->blk_addr; 274 io->last_block_in_bio = fio->new_blkaddr;
202 f2fs_trace_ios(fio, 0); 275 f2fs_trace_ios(fio, 0);
203 276
204 up_write(&io->io_rwsem); 277 up_write(&io->io_rwsem);
@@ -218,7 +291,7 @@ void set_data_blkaddr(struct dnode_of_data *dn)
218 struct page *node_page = dn->node_page; 291 struct page *node_page = dn->node_page;
219 unsigned int ofs_in_node = dn->ofs_in_node; 292 unsigned int ofs_in_node = dn->ofs_in_node;
220 293
221 f2fs_wait_on_page_writeback(node_page, NODE); 294 f2fs_wait_on_page_writeback(node_page, NODE, true);
222 295
223 rn = F2FS_NODE(node_page); 296 rn = F2FS_NODE(node_page);
224 297
@@ -229,6 +302,13 @@ void set_data_blkaddr(struct dnode_of_data *dn)
229 dn->node_changed = true; 302 dn->node_changed = true;
230} 303}
231 304
305void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
306{
307 dn->data_blkaddr = blkaddr;
308 set_data_blkaddr(dn);
309 f2fs_update_extent_cache(dn);
310}
311
232int reserve_new_block(struct dnode_of_data *dn) 312int reserve_new_block(struct dnode_of_data *dn)
233{ 313{
234 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); 314 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
@@ -332,7 +412,7 @@ got_it:
332 return page; 412 return page;
333 } 413 }
334 414
335 fio.blk_addr = dn.data_blkaddr; 415 fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
336 fio.page = page; 416 fio.page = page;
337 err = f2fs_submit_page_bio(&fio); 417 err = f2fs_submit_page_bio(&fio);
338 if (err) 418 if (err)
@@ -461,7 +541,6 @@ got_it:
461static int __allocate_data_block(struct dnode_of_data *dn) 541static int __allocate_data_block(struct dnode_of_data *dn)
462{ 542{
463 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); 543 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
464 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
465 struct f2fs_summary sum; 544 struct f2fs_summary sum;
466 struct node_info ni; 545 struct node_info ni;
467 int seg = CURSEG_WARM_DATA; 546 int seg = CURSEG_WARM_DATA;
@@ -489,7 +568,7 @@ alloc:
489 set_data_blkaddr(dn); 568 set_data_blkaddr(dn);
490 569
491 /* update i_size */ 570 /* update i_size */
492 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 571 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
493 dn->ofs_in_node; 572 dn->ofs_in_node;
494 if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) 573 if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
495 i_size_write(dn->inode, 574 i_size_write(dn->inode,
@@ -497,67 +576,33 @@ alloc:
497 return 0; 576 return 0;
498} 577}
499 578
500static int __allocate_data_blocks(struct inode *inode, loff_t offset, 579ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
501 size_t count)
502{ 580{
503 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 581 struct inode *inode = file_inode(iocb->ki_filp);
504 struct dnode_of_data dn; 582 struct f2fs_map_blocks map;
505 u64 start = F2FS_BYTES_TO_BLK(offset); 583 ssize_t ret = 0;
506 u64 len = F2FS_BYTES_TO_BLK(count);
507 bool allocated;
508 u64 end_offset;
509 int err = 0;
510
511 while (len) {
512 f2fs_lock_op(sbi);
513
514 /* When reading holes, we need its node page */
515 set_new_dnode(&dn, inode, NULL, NULL, 0);
516 err = get_dnode_of_data(&dn, start, ALLOC_NODE);
517 if (err)
518 goto out;
519
520 allocated = false;
521 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
522
523 while (dn.ofs_in_node < end_offset && len) {
524 block_t blkaddr;
525
526 if (unlikely(f2fs_cp_error(sbi))) {
527 err = -EIO;
528 goto sync_out;
529 }
530
531 blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
532 if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
533 err = __allocate_data_block(&dn);
534 if (err)
535 goto sync_out;
536 allocated = true;
537 }
538 len--;
539 start++;
540 dn.ofs_in_node++;
541 }
542 584
543 if (allocated) 585 map.m_lblk = F2FS_BYTES_TO_BLK(iocb->ki_pos);
544 sync_inode_page(&dn); 586 map.m_len = F2FS_BLK_ALIGN(iov_iter_count(from));
587 map.m_next_pgofs = NULL;
545 588
546 f2fs_put_dnode(&dn); 589 if (f2fs_encrypted_inode(inode))
547 f2fs_unlock_op(sbi); 590 return 0;
548 591
549 f2fs_balance_fs(sbi, dn.node_changed); 592 if (iocb->ki_flags & IOCB_DIRECT) {
593 ret = f2fs_convert_inline_inode(inode);
594 if (ret)
595 return ret;
596 return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
550 } 597 }
551 return err; 598 if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) {
552 599 ret = f2fs_convert_inline_inode(inode);
553sync_out: 600 if (ret)
554 if (allocated) 601 return ret;
555 sync_inode_page(&dn); 602 }
556 f2fs_put_dnode(&dn); 603 if (!f2fs_has_inline_data(inode))
557out: 604 return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
558 f2fs_unlock_op(sbi); 605 return ret;
559 f2fs_balance_fs(sbi, dn.node_changed);
560 return err;
561} 606}
562 607
563/* 608/*
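Review note: f2fs_preallocate_blocks() replaces the open-coded __allocate_data_blocks() with a single f2fs_map_blocks() call in PRE_DIO mode (direct I/O: allocate now) or PRE_AIO mode (buffered: just reserve). Its caller is not part of this hunk; presumably the write_iter path, something like this sketch under that assumption:

	static ssize_t sketch_write_iter(struct kiocb *iocb, struct iov_iter *from)
	{
		struct inode *inode = file_inode(iocb->ki_filp);
		ssize_t ret;

		inode_lock(inode);
		ret = f2fs_preallocate_blocks(iocb, from);	/* reserve/allocate up front */
		if (!ret)
			ret = __generic_file_write_iter(iocb, from);
		inode_unlock(inode);
		return ret;
	}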
@@ -588,13 +633,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
588 /* it only supports block size == page size */ 633 /* it only supports block size == page size */
589 pgofs = (pgoff_t)map->m_lblk; 634 pgofs = (pgoff_t)map->m_lblk;
590 635
591 if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { 636 if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
592 map->m_pblk = ei.blk + pgofs - ei.fofs; 637 map->m_pblk = ei.blk + pgofs - ei.fofs;
593 map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); 638 map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
594 map->m_flags = F2FS_MAP_MAPPED; 639 map->m_flags = F2FS_MAP_MAPPED;
595 goto out; 640 goto out;
596 } 641 }
597 642
643next_dnode:
598 if (create) 644 if (create)
599 f2fs_lock_op(sbi); 645 f2fs_lock_op(sbi);
600 646
@@ -602,120 +648,98 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
602 set_new_dnode(&dn, inode, NULL, NULL, 0); 648 set_new_dnode(&dn, inode, NULL, NULL, 0);
603 err = get_dnode_of_data(&dn, pgofs, mode); 649 err = get_dnode_of_data(&dn, pgofs, mode);
604 if (err) { 650 if (err) {
605 if (err == -ENOENT) 651 if (err == -ENOENT) {
606 err = 0; 652 err = 0;
653 if (map->m_next_pgofs)
654 *map->m_next_pgofs =
655 get_next_page_offset(&dn, pgofs);
656 }
607 goto unlock_out; 657 goto unlock_out;
608 } 658 }
609 659
610 if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { 660 end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
661
662next_block:
663 blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
664
665 if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
611 if (create) { 666 if (create) {
612 if (unlikely(f2fs_cp_error(sbi))) { 667 if (unlikely(f2fs_cp_error(sbi))) {
613 err = -EIO; 668 err = -EIO;
614 goto put_out; 669 goto sync_out;
670 }
671 if (flag == F2FS_GET_BLOCK_PRE_AIO) {
672 if (blkaddr == NULL_ADDR)
673 err = reserve_new_block(&dn);
674 } else {
675 err = __allocate_data_block(&dn);
615 } 676 }
616 err = __allocate_data_block(&dn);
617 if (err) 677 if (err)
618 goto put_out; 678 goto sync_out;
619 allocated = true; 679 allocated = true;
620 map->m_flags = F2FS_MAP_NEW; 680 map->m_flags = F2FS_MAP_NEW;
681 blkaddr = dn.data_blkaddr;
621 } else { 682 } else {
683 if (flag == F2FS_GET_BLOCK_FIEMAP &&
684 blkaddr == NULL_ADDR) {
685 if (map->m_next_pgofs)
686 *map->m_next_pgofs = pgofs + 1;
687 }
622 if (flag != F2FS_GET_BLOCK_FIEMAP || 688 if (flag != F2FS_GET_BLOCK_FIEMAP ||
623 dn.data_blkaddr != NEW_ADDR) { 689 blkaddr != NEW_ADDR) {
624 if (flag == F2FS_GET_BLOCK_BMAP) 690 if (flag == F2FS_GET_BLOCK_BMAP)
625 err = -ENOENT; 691 err = -ENOENT;
626 goto put_out; 692 goto sync_out;
627 } 693 }
628
629 /*
630 * preallocated unwritten block should be mapped
631 * for fiemap.
632 */
633 if (dn.data_blkaddr == NEW_ADDR)
634 map->m_flags = F2FS_MAP_UNWRITTEN;
635 } 694 }
636 } 695 }
637 696
638 map->m_flags |= F2FS_MAP_MAPPED; 697 if (map->m_len == 0) {
639 map->m_pblk = dn.data_blkaddr; 698 /* preallocated unwritten block should be mapped for fiemap. */
640 map->m_len = 1; 699 if (blkaddr == NEW_ADDR)
700 map->m_flags |= F2FS_MAP_UNWRITTEN;
701 map->m_flags |= F2FS_MAP_MAPPED;
702
703 map->m_pblk = blkaddr;
704 map->m_len = 1;
705 } else if ((map->m_pblk != NEW_ADDR &&
706 blkaddr == (map->m_pblk + ofs)) ||
707 (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
708 flag == F2FS_GET_BLOCK_PRE_DIO ||
709 flag == F2FS_GET_BLOCK_PRE_AIO) {
710 ofs++;
711 map->m_len++;
712 } else {
713 goto sync_out;
714 }
641 715
642 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
643 dn.ofs_in_node++; 716 dn.ofs_in_node++;
644 pgofs++; 717 pgofs++;
645 718
646get_next: 719 if (map->m_len < maxblocks) {
647 if (map->m_len >= maxblocks) 720 if (dn.ofs_in_node < end_offset)
648 goto sync_out; 721 goto next_block;
649 722
650 if (dn.ofs_in_node >= end_offset) {
651 if (allocated) 723 if (allocated)
652 sync_inode_page(&dn); 724 sync_inode_page(&dn);
653 allocated = false;
654 f2fs_put_dnode(&dn); 725 f2fs_put_dnode(&dn);
655 726
656 if (create) { 727 if (create) {
657 f2fs_unlock_op(sbi); 728 f2fs_unlock_op(sbi);
658 f2fs_balance_fs(sbi, dn.node_changed); 729 f2fs_balance_fs(sbi, allocated);
659 f2fs_lock_op(sbi);
660 }
661
662 set_new_dnode(&dn, inode, NULL, NULL, 0);
663 err = get_dnode_of_data(&dn, pgofs, mode);
664 if (err) {
665 if (err == -ENOENT)
666 err = 0;
667 goto unlock_out;
668 }
669
670 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
671 }
672
673 blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
674
675 if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
676 if (create) {
677 if (unlikely(f2fs_cp_error(sbi))) {
678 err = -EIO;
679 goto sync_out;
680 }
681 err = __allocate_data_block(&dn);
682 if (err)
683 goto sync_out;
684 allocated = true;
685 map->m_flags |= F2FS_MAP_NEW;
686 blkaddr = dn.data_blkaddr;
687 } else {
688 /*
689 * we only merge preallocated unwritten blocks
690 * for fiemap.
691 */
692 if (flag != F2FS_GET_BLOCK_FIEMAP ||
693 blkaddr != NEW_ADDR)
694 goto sync_out;
695 } 730 }
696 } 731 allocated = false;
697 732 goto next_dnode;
698 /* Give more consecutive addresses for the readahead */
699 if ((map->m_pblk != NEW_ADDR &&
700 blkaddr == (map->m_pblk + ofs)) ||
701 (map->m_pblk == NEW_ADDR &&
702 blkaddr == NEW_ADDR)) {
703 ofs++;
704 dn.ofs_in_node++;
705 pgofs++;
706 map->m_len++;
707 goto get_next;
708 } 733 }
709 734
710sync_out: 735sync_out:
711 if (allocated) 736 if (allocated)
712 sync_inode_page(&dn); 737 sync_inode_page(&dn);
713put_out:
714 f2fs_put_dnode(&dn); 738 f2fs_put_dnode(&dn);
715unlock_out: 739unlock_out:
716 if (create) { 740 if (create) {
717 f2fs_unlock_op(sbi); 741 f2fs_unlock_op(sbi);
718 f2fs_balance_fs(sbi, dn.node_changed); 742 f2fs_balance_fs(sbi, allocated);
719 } 743 }
720out: 744out:
721 trace_f2fs_map_blocks(inode, map, err); 745 trace_f2fs_map_blocks(inode, map, err);
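Review note: the rewritten loop grows map->m_len only while physical blocks stay contiguous (or while PRE_DIO/PRE_AIO forces merging) and bails to sync_out as soon as the extent breaks. A stand-alone toy of that merging rule (plain C, not kernel code):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t blks[] = { 100, 101, 102, 200, 201 };	/* physical block addrs */
		uint64_t pblk = blks[0], len = 1;

		for (size_t i = 1; i < sizeof(blks) / sizeof(blks[0]); i++) {
			if (blks[i] == pblk + len) {	/* contiguous: extend the extent */
				len++;
			} else {			/* extent breaks: emit and restart */
				printf("extent: %llu +%llu\n",
				       (unsigned long long)pblk, (unsigned long long)len);
				pblk = blks[i];
				len = 1;
			}
		}
		printf("extent: %llu +%llu\n",
		       (unsigned long long)pblk, (unsigned long long)len);
		return 0;	/* prints "extent: 100 +3" then "extent: 200 +2" */
	}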
@@ -723,13 +747,15 @@ out:
723} 747}
724 748
725static int __get_data_block(struct inode *inode, sector_t iblock, 749static int __get_data_block(struct inode *inode, sector_t iblock,
726 struct buffer_head *bh, int create, int flag) 750 struct buffer_head *bh, int create, int flag,
751 pgoff_t *next_pgofs)
727{ 752{
728 struct f2fs_map_blocks map; 753 struct f2fs_map_blocks map;
729 int ret; 754 int ret;
730 755
731 map.m_lblk = iblock; 756 map.m_lblk = iblock;
732 map.m_len = bh->b_size >> inode->i_blkbits; 757 map.m_len = bh->b_size >> inode->i_blkbits;
758 map.m_next_pgofs = next_pgofs;
733 759
734 ret = f2fs_map_blocks(inode, &map, create, flag); 760 ret = f2fs_map_blocks(inode, &map, create, flag);
735 if (!ret) { 761 if (!ret) {
@@ -741,16 +767,18 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
741} 767}
742 768
743static int get_data_block(struct inode *inode, sector_t iblock, 769static int get_data_block(struct inode *inode, sector_t iblock,
744 struct buffer_head *bh_result, int create, int flag) 770 struct buffer_head *bh_result, int create, int flag,
771 pgoff_t *next_pgofs)
745{ 772{
746 return __get_data_block(inode, iblock, bh_result, create, flag); 773 return __get_data_block(inode, iblock, bh_result, create,
774 flag, next_pgofs);
747} 775}
748 776
749static int get_data_block_dio(struct inode *inode, sector_t iblock, 777static int get_data_block_dio(struct inode *inode, sector_t iblock,
750 struct buffer_head *bh_result, int create) 778 struct buffer_head *bh_result, int create)
751{ 779{
752 return __get_data_block(inode, iblock, bh_result, create, 780 return __get_data_block(inode, iblock, bh_result, create,
753 F2FS_GET_BLOCK_DIO); 781 F2FS_GET_BLOCK_DIO, NULL);
754} 782}
755 783
756static int get_data_block_bmap(struct inode *inode, sector_t iblock, 784static int get_data_block_bmap(struct inode *inode, sector_t iblock,
@@ -761,7 +789,7 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock,
761 return -EFBIG; 789 return -EFBIG;
762 790
763 return __get_data_block(inode, iblock, bh_result, create, 791 return __get_data_block(inode, iblock, bh_result, create,
764 F2FS_GET_BLOCK_BMAP); 792 F2FS_GET_BLOCK_BMAP, NULL);
765} 793}
766 794
767static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) 795static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -779,6 +807,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
779{ 807{
780 struct buffer_head map_bh; 808 struct buffer_head map_bh;
781 sector_t start_blk, last_blk; 809 sector_t start_blk, last_blk;
810 pgoff_t next_pgofs;
782 loff_t isize; 811 loff_t isize;
783 u64 logical = 0, phys = 0, size = 0; 812 u64 logical = 0, phys = 0, size = 0;
784 u32 flags = 0; 813 u32 flags = 0;
@@ -814,14 +843,15 @@ next:
814 map_bh.b_size = len; 843 map_bh.b_size = len;
815 844
816 ret = get_data_block(inode, start_blk, &map_bh, 0, 845 ret = get_data_block(inode, start_blk, &map_bh, 0,
817 F2FS_GET_BLOCK_FIEMAP); 846 F2FS_GET_BLOCK_FIEMAP, &next_pgofs);
818 if (ret) 847 if (ret)
819 goto out; 848 goto out;
820 849
821 /* HOLE */ 850 /* HOLE */
822 if (!buffer_mapped(&map_bh)) { 851 if (!buffer_mapped(&map_bh)) {
852 start_blk = next_pgofs;
823 /* Go through holes until passing the EOF */ 853 /* Go through holes until passing the EOF */
824 if (blk_to_logical(inode, start_blk++) < isize) 854 if (blk_to_logical(inode, start_blk) < isize)
825 goto prep_next; 855 goto prep_next;
826 /* Found a hole beyond isize means no more extents. 856 /* Found a hole beyond isize means no more extents.
827 * Note that the premise is that filesystems don't 857 * Note that the premise is that filesystems don't
@@ -889,6 +919,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
889 map.m_lblk = 0; 919 map.m_lblk = 0;
890 map.m_len = 0; 920 map.m_len = 0;
891 map.m_flags = 0; 921 map.m_flags = 0;
922 map.m_next_pgofs = NULL;
892 923
893 for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { 924 for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
894 925
@@ -927,7 +958,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
927 map.m_len = last_block - block_in_file; 958 map.m_len = last_block - block_in_file;
928 959
929 if (f2fs_map_blocks(inode, &map, 0, 960 if (f2fs_map_blocks(inode, &map, 0,
930 F2FS_GET_BLOCK_READ)) 961 F2FS_GET_BLOCK_READ))
931 goto set_error_page; 962 goto set_error_page;
932 } 963 }
933got_it: 964got_it:
@@ -956,12 +987,12 @@ submit_and_realloc:
956 bio = NULL; 987 bio = NULL;
957 } 988 }
958 if (bio == NULL) { 989 if (bio == NULL) {
959 struct f2fs_crypto_ctx *ctx = NULL; 990 struct fscrypt_ctx *ctx = NULL;
960 991
961 if (f2fs_encrypted_inode(inode) && 992 if (f2fs_encrypted_inode(inode) &&
962 S_ISREG(inode->i_mode)) { 993 S_ISREG(inode->i_mode)) {
963 994
964 ctx = f2fs_get_crypto_ctx(inode); 995 ctx = fscrypt_get_ctx(inode);
965 if (IS_ERR(ctx)) 996 if (IS_ERR(ctx))
966 goto set_error_page; 997 goto set_error_page;
967 998
@@ -974,7 +1005,7 @@ submit_and_realloc:
974 min_t(int, nr_pages, BIO_MAX_PAGES)); 1005 min_t(int, nr_pages, BIO_MAX_PAGES));
975 if (!bio) { 1006 if (!bio) {
976 if (ctx) 1007 if (ctx)
977 f2fs_release_crypto_ctx(ctx); 1008 fscrypt_release_ctx(ctx);
978 goto set_error_page; 1009 goto set_error_page;
979 } 1010 }
980 bio->bi_bdev = bdev; 1011 bio->bi_bdev = bdev;
@@ -1052,10 +1083,10 @@ int do_write_data_page(struct f2fs_io_info *fio)
1052 if (err) 1083 if (err)
1053 return err; 1084 return err;
1054 1085
1055 fio->blk_addr = dn.data_blkaddr; 1086 fio->old_blkaddr = dn.data_blkaddr;
1056 1087
1057 /* This page is already truncated */ 1088 /* This page is already truncated */
1058 if (fio->blk_addr == NULL_ADDR) { 1089 if (fio->old_blkaddr == NULL_ADDR) {
1059 ClearPageUptodate(page); 1090 ClearPageUptodate(page);
1060 goto out_writepage; 1091 goto out_writepage;
1061 } 1092 }
@@ -1064,9 +1095,9 @@ int do_write_data_page(struct f2fs_io_info *fio)
1064 1095
1065 /* wait for GCed encrypted page writeback */ 1096 /* wait for GCed encrypted page writeback */
1066 f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), 1097 f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
1067 fio->blk_addr); 1098 fio->old_blkaddr);
1068 1099
1069 fio->encrypted_page = f2fs_encrypt(inode, fio->page); 1100 fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page);
1070 if (IS_ERR(fio->encrypted_page)) { 1101 if (IS_ERR(fio->encrypted_page)) {
1071 err = PTR_ERR(fio->encrypted_page); 1102 err = PTR_ERR(fio->encrypted_page);
1072 goto out_writepage; 1103 goto out_writepage;
@@ -1079,7 +1110,7 @@ int do_write_data_page(struct f2fs_io_info *fio)
1079 * If current allocation needs SSR, 1110 * If current allocation needs SSR,
1080 * it had better in-place writes for updated data. 1111 * it had better in-place writes for updated data.
1081 */ 1112 */
1082 if (unlikely(fio->blk_addr != NEW_ADDR && 1113 if (unlikely(fio->old_blkaddr != NEW_ADDR &&
1083 !is_cold_data(page) && 1114 !is_cold_data(page) &&
1084 !IS_ATOMIC_WRITTEN_PAGE(page) && 1115 !IS_ATOMIC_WRITTEN_PAGE(page) &&
1085 need_inplace_update(inode))) { 1116 need_inplace_update(inode))) {
@@ -1088,8 +1119,6 @@ int do_write_data_page(struct f2fs_io_info *fio)
1088 trace_f2fs_do_write_data_page(page, IPU); 1119 trace_f2fs_do_write_data_page(page, IPU);
1089 } else { 1120 } else {
1090 write_data_page(&dn, fio); 1121 write_data_page(&dn, fio);
1091 set_data_blkaddr(&dn);
1092 f2fs_update_extent_cache(&dn);
1093 trace_f2fs_do_write_data_page(page, OPU); 1122 trace_f2fs_do_write_data_page(page, OPU);
1094 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); 1123 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
1095 if (page->index == 0) 1124 if (page->index == 0)
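Reviewer note: struct f2fs_io_info loses its single blk_addr field in favor of an old_blkaddr/new_blkaddr pair (see the f2fs.h hunk below), making copy-on-write explicit: old_blkaddr is the pre-CoW location used for truncation and in-place-update checks, new_blkaddr is what actually gets written. The explicit set_data_blkaddr()/f2fs_update_extent_cache() pair removed from the OPU branch above has presumably moved behind the new f2fs_update_data_blkaddr() helper that this patch declares in f2fs.h. For reads the two fields simply alias, as in the write_begin hunk further down:

        struct f2fs_io_info fio = {
                .sbi = sbi,
                .type = DATA,
                .rw = READ_SYNC,
                .old_blkaddr = blkaddr,         /* where the data lives */
                .new_blkaddr = blkaddr,         /* reads: both sides identical */
                .page = page,
                .encrypted_page = NULL,
        };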
@@ -1177,12 +1206,18 @@ out:
1177 inode_dec_dirty_pages(inode); 1206 inode_dec_dirty_pages(inode);
1178 if (err) 1207 if (err)
1179 ClearPageUptodate(page); 1208 ClearPageUptodate(page);
1209
1210 if (wbc->for_reclaim) {
1211 f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE);
1212 remove_dirty_inode(inode);
1213 }
1214
1180 unlock_page(page); 1215 unlock_page(page);
1181 f2fs_balance_fs(sbi, need_balance_fs); 1216 f2fs_balance_fs(sbi, need_balance_fs);
1182 if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) { 1217
1218 if (unlikely(f2fs_cp_error(sbi)))
1183 f2fs_submit_merged_bio(sbi, DATA, WRITE); 1219 f2fs_submit_merged_bio(sbi, DATA, WRITE);
1184 remove_dirty_inode(inode); 1220
1185 }
1186 return 0; 1221 return 0;
1187 1222
1188redirty_out: 1223redirty_out:
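Reviewer note: the interleaved columns above are easier to follow as the reconstructed new-side tail of the writepage path (this appears to be f2fs_write_data_page()). Reclaim-driven writeback now flushes only the bio that contains this page, via the new f2fs_submit_merged_bio_cond(), and does so before unlocking the page; the unconditional whole-queue flush survives only for the checkpoint-error case:

        inode_dec_dirty_pages(inode);
        if (err)
                ClearPageUptodate(page);

        if (wbc->for_reclaim) {
                /* flush just the bio holding this page, not all of DATA */
                f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE);
                remove_dirty_inode(inode);
        }

        unlock_page(page);
        f2fs_balance_fs(sbi, need_balance_fs);

        if (unlikely(f2fs_cp_error(sbi)))
                f2fs_submit_merged_bio(sbi, DATA, WRITE);

        return 0;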
@@ -1282,7 +1317,8 @@ continue_unlock:
1282 1317
1283 if (PageWriteback(page)) { 1318 if (PageWriteback(page)) {
1284 if (wbc->sync_mode != WB_SYNC_NONE) 1319 if (wbc->sync_mode != WB_SYNC_NONE)
1285 f2fs_wait_on_page_writeback(page, DATA); 1320 f2fs_wait_on_page_writeback(page,
1321 DATA, true);
1286 else 1322 else
1287 goto continue_unlock; 1323 goto continue_unlock;
1288 } 1324 }
@@ -1339,8 +1375,6 @@ static int f2fs_write_data_pages(struct address_space *mapping,
1339 int ret; 1375 int ret;
1340 long diff; 1376 long diff;
1341 1377
1342 trace_f2fs_writepages(mapping->host, wbc, DATA);
1343
1344 /* deal with chardevs and other special file */ 1378 /* deal with chardevs and other special file */
1345 if (!mapping->a_ops->writepage) 1379 if (!mapping->a_ops->writepage)
1346 return 0; 1380 return 0;
@@ -1362,14 +1396,16 @@ static int f2fs_write_data_pages(struct address_space *mapping,
1362 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) 1396 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1363 goto skip_write; 1397 goto skip_write;
1364 1398
1399 trace_f2fs_writepages(mapping->host, wbc, DATA);
1400
1365 diff = nr_pages_to_write(sbi, DATA, wbc); 1401 diff = nr_pages_to_write(sbi, DATA, wbc);
1366 1402
1367 if (!S_ISDIR(inode->i_mode)) { 1403 if (!S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_ALL) {
1368 mutex_lock(&sbi->writepages); 1404 mutex_lock(&sbi->writepages);
1369 locked = true; 1405 locked = true;
1370 } 1406 }
1371 ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); 1407 ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
1372 f2fs_submit_merged_bio(sbi, DATA, WRITE); 1408 f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
1373 if (locked) 1409 if (locked)
1374 mutex_unlock(&sbi->writepages); 1410 mutex_unlock(&sbi->writepages);
1375 1411
@@ -1380,6 +1416,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
1380 1416
1381skip_write: 1417skip_write:
1382 wbc->pages_skipped += get_dirty_pages(inode); 1418 wbc->pages_skipped += get_dirty_pages(inode);
1419 trace_f2fs_writepages(mapping->host, wbc, DATA);
1383 return 0; 1420 return 0;
1384} 1421}
1385 1422
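Reviewer note: three behavioural tweaks hide in these f2fs_write_data_pages() hunks. First, trace_f2fs_writepages now fires after the early-exit checks, and again on the skip_write path, so traces reflect what is actually attempted. Second, the sbi->writepages mutex is taken only for WB_SYNC_ALL, so background (WB_SYNC_NONE) writeback no longer serializes. Third, the final flush becomes conditional on the inode. Condensed new side:

        if (!S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_ALL) {
                mutex_lock(&sbi->writepages);   /* serialize sync writeback only */
                locked = true;
        }
        ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
        /* flush only bios that carry pages of this inode */
        f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
        if (locked)
                mutex_unlock(&sbi->writepages);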
@@ -1406,6 +1443,14 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
1406 struct extent_info ei; 1443 struct extent_info ei;
1407 int err = 0; 1444 int err = 0;
1408 1445
1446 /*
1447 * the blocks were already preallocated, so there is no need to
1448 * look up block addresses when the page does not have to be filled.
1449 */
1450 if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
1451 len == PAGE_CACHE_SIZE)
1452 return 0;
1453
1409 if (f2fs_has_inline_data(inode) || 1454 if (f2fs_has_inline_data(inode) ||
1410 (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { 1455 (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
1411 f2fs_lock_op(sbi); 1456 f2fs_lock_op(sbi);
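Reviewer note: the early return added to prepare_write_begin() is a write fast path: when a full page of a plain file (no inline data, not encrypted) is being overwritten, the page never has to be read, so no old block address is needed and the dnode lookup can be skipped entirely. The stated premise is that the blocks were already allocated, presumably by the f2fs_preallocate_blocks() helper this patch declares in f2fs.h:

        /* full-page overwrite of a plain file: nothing to look up */
        if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
                        len == PAGE_CACHE_SIZE)
                return 0;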
@@ -1425,7 +1470,7 @@ restart:
1425 if (pos + len <= MAX_INLINE_DATA) { 1470 if (pos + len <= MAX_INLINE_DATA) {
1426 read_inline_data(page, ipage); 1471 read_inline_data(page, ipage);
1427 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); 1472 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
1428 sync_inode_page(&dn); 1473 set_inline_node(ipage);
1429 } else { 1474 } else {
1430 err = f2fs_convert_inline_page(&dn, page); 1475 err = f2fs_convert_inline_page(&dn, page);
1431 if (err) 1476 if (err)
@@ -1439,13 +1484,9 @@ restart:
1439 if (f2fs_lookup_extent_cache(inode, index, &ei)) { 1484 if (f2fs_lookup_extent_cache(inode, index, &ei)) {
1440 dn.data_blkaddr = ei.blk + index - ei.fofs; 1485 dn.data_blkaddr = ei.blk + index - ei.fofs;
1441 } else { 1486 } else {
1442 bool restart = false;
1443
1444 /* hole case */ 1487 /* hole case */
1445 err = get_dnode_of_data(&dn, index, LOOKUP_NODE); 1488 err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
1446 if (err || (!err && dn.data_blkaddr == NULL_ADDR)) 1489 if (err || (!err && dn.data_blkaddr == NULL_ADDR)) {
1447 restart = true;
1448 if (restart) {
1449 f2fs_put_dnode(&dn); 1490 f2fs_put_dnode(&dn);
1450 f2fs_lock_op(sbi); 1491 f2fs_lock_op(sbi);
1451 locked = true; 1492 locked = true;
@@ -1514,7 +1555,7 @@ repeat:
1514 } 1555 }
1515 } 1556 }
1516 1557
1517 f2fs_wait_on_page_writeback(page, DATA); 1558 f2fs_wait_on_page_writeback(page, DATA, false);
1518 1559
1519 /* wait for GCed encrypted page writeback */ 1560 /* wait for GCed encrypted page writeback */
1520 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) 1561 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
@@ -1541,7 +1582,8 @@ repeat:
1541 .sbi = sbi, 1582 .sbi = sbi,
1542 .type = DATA, 1583 .type = DATA,
1543 .rw = READ_SYNC, 1584 .rw = READ_SYNC,
1544 .blk_addr = blkaddr, 1585 .old_blkaddr = blkaddr,
1586 .new_blkaddr = blkaddr,
1545 .page = page, 1587 .page = page,
1546 .encrypted_page = NULL, 1588 .encrypted_page = NULL,
1547 }; 1589 };
@@ -1561,7 +1603,7 @@ repeat:
1561 1603
1562 /* avoid symlink page */ 1604 /* avoid symlink page */
1563 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { 1605 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
1564 err = f2fs_decrypt_one(inode, page); 1606 err = fscrypt_decrypt_page(page);
1565 if (err) 1607 if (err)
1566 goto fail; 1608 goto fail;
1567 } 1609 }
@@ -1592,7 +1634,6 @@ static int f2fs_write_end(struct file *file,
1592 if (pos + copied > i_size_read(inode)) { 1634 if (pos + copied > i_size_read(inode)) {
1593 i_size_write(inode, pos + copied); 1635 i_size_write(inode, pos + copied);
1594 mark_inode_dirty(inode); 1636 mark_inode_dirty(inode);
1595 update_inode_page(inode);
1596 } 1637 }
1597 1638
1598 f2fs_put_page(page, 1); 1639 f2fs_put_page(page, 1);
@@ -1617,34 +1658,21 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
1617static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 1658static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
1618 loff_t offset) 1659 loff_t offset)
1619{ 1660{
1620 struct file *file = iocb->ki_filp; 1661 struct address_space *mapping = iocb->ki_filp->f_mapping;
1621 struct address_space *mapping = file->f_mapping;
1622 struct inode *inode = mapping->host; 1662 struct inode *inode = mapping->host;
1623 size_t count = iov_iter_count(iter); 1663 size_t count = iov_iter_count(iter);
1624 int err; 1664 int err;
1625 1665
1626 /* we don't need to use inline_data strictly */ 1666 err = check_direct_IO(inode, iter, offset);
1627 err = f2fs_convert_inline_inode(inode);
1628 if (err) 1667 if (err)
1629 return err; 1668 return err;
1630 1669
1631 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) 1670 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
1632 return 0; 1671 return 0;
1633 1672
1634 err = check_direct_IO(inode, iter, offset);
1635 if (err)
1636 return err;
1637
1638 trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); 1673 trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
1639 1674
1640 if (iov_iter_rw(iter) == WRITE) {
1641 err = __allocate_data_blocks(inode, offset, count);
1642 if (err)
1643 goto out;
1644 }
1645
1646 err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); 1675 err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
1647out:
1648 if (err < 0 && iov_iter_rw(iter) == WRITE) 1676 if (err < 0 && iov_iter_rw(iter) == WRITE)
1649 f2fs_write_failed(mapping, offset + count); 1677 f2fs_write_failed(mapping, offset + count);
1650 1678
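Reviewer note: f2fs_direct_IO() reconstructed from the new side, since the two-column rendering makes the reordering hard to see. The unconditional inline-data conversion is gone, check_direct_IO() now runs first, and the up-front __allocate_data_blocks() call for writes is dropped (preallocation is expected to happen earlier, again presumably via f2fs_preallocate_blocks()); the exit tracepoint past the end of the hunk is elided here:

        static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                                loff_t offset)
        {
                struct address_space *mapping = iocb->ki_filp->f_mapping;
                struct inode *inode = mapping->host;
                size_t count = iov_iter_count(iter);
                int err;

                err = check_direct_IO(inode, iter, offset);
                if (err)
                        return err;

                /* encrypted regular files cannot go through blockdev DIO */
                if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
                        return 0;

                trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));

                err = blockdev_direct_IO(iocb, inode, iter, offset,
                                        get_data_block_dio);
                if (err < 0 && iov_iter_rw(iter) == WRITE)
                        f2fs_write_failed(mapping, offset + count);

                return err;
        }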
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index faa7495e2d7e..80641ad82745 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -77,7 +77,7 @@ static unsigned long dir_block_index(unsigned int level,
77} 77}
78 78
79static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, 79static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
80 struct f2fs_filename *fname, 80 struct fscrypt_name *fname,
81 f2fs_hash_t namehash, 81 f2fs_hash_t namehash,
82 int *max_slots, 82 int *max_slots,
83 struct page **res_page) 83 struct page **res_page)
@@ -103,15 +103,15 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
103 return de; 103 return de;
104} 104}
105 105
106struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, 106struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
107 f2fs_hash_t namehash, int *max_slots, 107 f2fs_hash_t namehash, int *max_slots,
108 struct f2fs_dentry_ptr *d) 108 struct f2fs_dentry_ptr *d)
109{ 109{
110 struct f2fs_dir_entry *de; 110 struct f2fs_dir_entry *de;
111 unsigned long bit_pos = 0; 111 unsigned long bit_pos = 0;
112 int max_len = 0; 112 int max_len = 0;
113 struct f2fs_str de_name = FSTR_INIT(NULL, 0); 113 struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
114 struct f2fs_str *name = &fname->disk_name; 114 struct fscrypt_str *name = &fname->disk_name;
115 115
116 if (max_slots) 116 if (max_slots)
117 *max_slots = 0; 117 *max_slots = 0;
@@ -157,7 +157,7 @@ found:
157 157
158static struct f2fs_dir_entry *find_in_level(struct inode *dir, 158static struct f2fs_dir_entry *find_in_level(struct inode *dir,
159 unsigned int level, 159 unsigned int level,
160 struct f2fs_filename *fname, 160 struct fscrypt_name *fname,
161 struct page **res_page) 161 struct page **res_page)
162{ 162{
163 struct qstr name = FSTR_TO_QSTR(&fname->disk_name); 163 struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
@@ -218,12 +218,12 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
218 struct f2fs_dir_entry *de = NULL; 218 struct f2fs_dir_entry *de = NULL;
219 unsigned int max_depth; 219 unsigned int max_depth;
220 unsigned int level; 220 unsigned int level;
221 struct f2fs_filename fname; 221 struct fscrypt_name fname;
222 int err; 222 int err;
223 223
224 *res_page = NULL; 224 *res_page = NULL;
225 225
226 err = f2fs_fname_setup_filename(dir, child, 1, &fname); 226 err = fscrypt_setup_filename(dir, child, 1, &fname);
227 if (err) 227 if (err)
228 return NULL; 228 return NULL;
229 229
@@ -251,7 +251,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
251 break; 251 break;
252 } 252 }
253out: 253out:
254 f2fs_fname_free_filename(&fname); 254 fscrypt_free_filename(&fname);
255 return de; 255 return de;
256} 256}
257 257
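Reviewer note: from here on the directory code drops its private f2fs_filename/f2fs_str types for the shared fscrypt_name/fscrypt_str (the old definitions are deleted from f2fs.h later in this diff), and the setup/free helpers are renamed to their fscrypt equivalents. The lookup pattern after conversion, condensed from this hunk (the third argument is the lookup flag, as used at this call site):

        struct fscrypt_name fname;
        int err;

        err = fscrypt_setup_filename(dir, child, 1, &fname);
        if (err)
                return NULL;

        /* ... walk the hash levels using fname.disk_name ... */

        fscrypt_free_filename(&fname);  /* frees any crypto name buffer */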
@@ -296,7 +296,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
296{ 296{
297 enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; 297 enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
298 lock_page(page); 298 lock_page(page);
299 f2fs_wait_on_page_writeback(page, type); 299 f2fs_wait_on_page_writeback(page, type, true);
300 de->ino = cpu_to_le32(inode->i_ino); 300 de->ino = cpu_to_le32(inode->i_ino);
301 set_de_type(de, inode->i_mode); 301 set_de_type(de, inode->i_mode);
302 f2fs_dentry_kunmap(dir, page); 302 f2fs_dentry_kunmap(dir, page);
@@ -311,7 +311,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
311{ 311{
312 struct f2fs_inode *ri; 312 struct f2fs_inode *ri;
313 313
314 f2fs_wait_on_page_writeback(ipage, NODE); 314 f2fs_wait_on_page_writeback(ipage, NODE, true);
315 315
316 /* copy name info. to this inode page */ 316 /* copy name info. to this inode page */
317 ri = F2FS_INODE(ipage); 317 ri = F2FS_INODE(ipage);
@@ -341,24 +341,14 @@ int update_dent_inode(struct inode *inode, struct inode *to,
341void do_make_empty_dir(struct inode *inode, struct inode *parent, 341void do_make_empty_dir(struct inode *inode, struct inode *parent,
342 struct f2fs_dentry_ptr *d) 342 struct f2fs_dentry_ptr *d)
343{ 343{
344 struct f2fs_dir_entry *de; 344 struct qstr dot = QSTR_INIT(".", 1);
345 345 struct qstr dotdot = QSTR_INIT("..", 2);
346 de = &d->dentry[0];
347 de->name_len = cpu_to_le16(1);
348 de->hash_code = 0;
349 de->ino = cpu_to_le32(inode->i_ino);
350 memcpy(d->filename[0], ".", 1);
351 set_de_type(de, inode->i_mode);
352 346
353 de = &d->dentry[1]; 347 /* update dirent of "." */
354 de->hash_code = 0; 348 f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0);
355 de->name_len = cpu_to_le16(2);
356 de->ino = cpu_to_le32(parent->i_ino);
357 memcpy(d->filename[1], "..", 2);
358 set_de_type(de, parent->i_mode);
359 349
360 test_and_set_bit_le(0, (void *)d->bitmap); 350 /* update dirent of ".." */
361 test_and_set_bit_le(1, (void *)d->bitmap); 351 f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1);
362} 352}
363 353
364static int make_empty_dir(struct inode *inode, 354static int make_empty_dir(struct inode *inode,
@@ -413,7 +403,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
413 goto put_error; 403 goto put_error;
414 404
415 if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { 405 if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) {
416 err = f2fs_inherit_context(dir, inode, page); 406 err = fscrypt_inherit_context(dir, inode, page, false);
417 if (err) 407 if (err)
418 goto put_error; 408 goto put_error;
419 } 409 }
@@ -511,8 +501,12 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
511 memcpy(d->filename[bit_pos], name->name, name->len); 501 memcpy(d->filename[bit_pos], name->name, name->len);
512 de->ino = cpu_to_le32(ino); 502 de->ino = cpu_to_le32(ino);
513 set_de_type(de, mode); 503 set_de_type(de, mode);
514 for (i = 0; i < slots; i++) 504 for (i = 0; i < slots; i++) {
515 test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); 505 test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
506 /* avoid exposing garbage data to readdir */
507 if (i)
508 (de + i)->name_len = 0;
509 }
516} 510}
517 511
518/* 512/*
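Reviewer note: f2fs_update_dentry() now zeroes name_len in every continuation slot of a multi-slot entry. This pairs with the f2fs_fill_dentries() hunk below, which skips slots whose name_len is zero, so readdir can no longer emit garbage from a slot whose bitmap bit is set but whose dentry payload is stale or partially written. The two sides together:

        /* writer side, f2fs_update_dentry(): */
        for (i = 0; i < slots; i++) {
                test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
                if (i)
                        (de + i)->name_len = 0; /* mark continuation slot */
        }

        /* reader side, f2fs_fill_dentries(): */
        de = &d->dentry[bit_pos];
        if (de->name_len == 0) {                /* continuation or stale slot */
                bit_pos++;
                ctx->pos = start_pos + bit_pos;
                continue;
        }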
@@ -532,11 +526,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
532 struct f2fs_dentry_block *dentry_blk = NULL; 526 struct f2fs_dentry_block *dentry_blk = NULL;
533 struct f2fs_dentry_ptr d; 527 struct f2fs_dentry_ptr d;
534 struct page *page = NULL; 528 struct page *page = NULL;
535 struct f2fs_filename fname; 529 struct fscrypt_name fname;
536 struct qstr new_name; 530 struct qstr new_name;
537 int slots, err; 531 int slots, err;
538 532
539 err = f2fs_fname_setup_filename(dir, name, 0, &fname); 533 err = fscrypt_setup_filename(dir, name, 0, &fname);
540 if (err) 534 if (err)
541 return err; 535 return err;
542 536
@@ -598,7 +592,7 @@ start:
598 ++level; 592 ++level;
599 goto start; 593 goto start;
600add_dentry: 594add_dentry:
601 f2fs_wait_on_page_writeback(dentry_page, DATA); 595 f2fs_wait_on_page_writeback(dentry_page, DATA, true);
602 596
603 if (inode) { 597 if (inode) {
604 down_write(&F2FS_I(inode)->i_sem); 598 down_write(&F2FS_I(inode)->i_sem);
@@ -635,7 +629,7 @@ fail:
635 kunmap(dentry_page); 629 kunmap(dentry_page);
636 f2fs_put_page(dentry_page, 1); 630 f2fs_put_page(dentry_page, 1);
637out: 631out:
638 f2fs_fname_free_filename(&fname); 632 fscrypt_free_filename(&fname);
639 f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); 633 f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
640 return err; 634 return err;
641} 635}
@@ -709,7 +703,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
709 return f2fs_delete_inline_entry(dentry, page, dir, inode); 703 return f2fs_delete_inline_entry(dentry, page, dir, inode);
710 704
711 lock_page(page); 705 lock_page(page);
712 f2fs_wait_on_page_writeback(page, DATA); 706 f2fs_wait_on_page_writeback(page, DATA, true);
713 707
714 dentry_blk = page_address(page); 708 dentry_blk = page_address(page);
715 bit_pos = dentry - dentry_blk->dentry; 709 bit_pos = dentry - dentry_blk->dentry;
@@ -777,12 +771,12 @@ bool f2fs_empty_dir(struct inode *dir)
777} 771}
778 772
779bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, 773bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
780 unsigned int start_pos, struct f2fs_str *fstr) 774 unsigned int start_pos, struct fscrypt_str *fstr)
781{ 775{
782 unsigned char d_type = DT_UNKNOWN; 776 unsigned char d_type = DT_UNKNOWN;
783 unsigned int bit_pos; 777 unsigned int bit_pos;
784 struct f2fs_dir_entry *de = NULL; 778 struct f2fs_dir_entry *de = NULL;
785 struct f2fs_str de_name = FSTR_INIT(NULL, 0); 779 struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
786 780
787 bit_pos = ((unsigned long)ctx->pos % d->max); 781 bit_pos = ((unsigned long)ctx->pos % d->max);
788 782
@@ -792,6 +786,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
792 break; 786 break;
793 787
794 de = &d->dentry[bit_pos]; 788 de = &d->dentry[bit_pos];
789 if (de->name_len == 0) {
790 bit_pos++;
791 ctx->pos = start_pos + bit_pos;
792 continue;
793 }
794
795 if (de->file_type < F2FS_FT_MAX) 795 if (de->file_type < F2FS_FT_MAX)
796 d_type = f2fs_filetype_table[de->file_type]; 796 d_type = f2fs_filetype_table[de->file_type];
797 else 797 else
@@ -810,8 +810,9 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
810 810
811 memcpy(de_name.name, d->filename[bit_pos], de_name.len); 811 memcpy(de_name.name, d->filename[bit_pos], de_name.len);
812 812
813 ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, 813 ret = fscrypt_fname_disk_to_usr(d->inode,
814 &de_name, fstr); 814 (u32)de->hash_code, 0,
815 &de_name, fstr);
815 kfree(de_name.name); 816 kfree(de_name.name);
816 if (ret < 0) 817 if (ret < 0)
817 return true; 818 return true;
@@ -839,16 +840,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
839 struct file_ra_state *ra = &file->f_ra; 840 struct file_ra_state *ra = &file->f_ra;
840 unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); 841 unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
841 struct f2fs_dentry_ptr d; 842 struct f2fs_dentry_ptr d;
842 struct f2fs_str fstr = FSTR_INIT(NULL, 0); 843 struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
843 int err = 0; 844 int err = 0;
844 845
845 if (f2fs_encrypted_inode(inode)) { 846 if (f2fs_encrypted_inode(inode)) {
846 err = f2fs_get_encryption_info(inode); 847 err = fscrypt_get_encryption_info(inode);
847 if (err) 848 if (err && err != -ENOKEY)
848 return err; 849 return err;
849 850
850 err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN, 851 err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
851 &fstr);
852 if (err < 0) 852 if (err < 0)
853 return err; 853 return err;
854 } 854 }
@@ -888,15 +888,23 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
888 f2fs_put_page(dentry_page, 1); 888 f2fs_put_page(dentry_page, 1);
889 } 889 }
890out: 890out:
891 f2fs_fname_crypto_free_buffer(&fstr); 891 fscrypt_fname_free_buffer(&fstr);
892 return err; 892 return err;
893} 893}
894 894
895static int f2fs_dir_open(struct inode *inode, struct file *filp)
896{
897 if (f2fs_encrypted_inode(inode))
898 return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
899 return 0;
900}
901
895const struct file_operations f2fs_dir_operations = { 902const struct file_operations f2fs_dir_operations = {
896 .llseek = generic_file_llseek, 903 .llseek = generic_file_llseek,
897 .read = generic_read_dir, 904 .read = generic_read_dir,
898 .iterate = f2fs_readdir, 905 .iterate = f2fs_readdir,
899 .fsync = f2fs_sync_file, 906 .fsync = f2fs_sync_file,
907 .open = f2fs_dir_open,
900 .unlocked_ioctl = f2fs_ioctl, 908 .unlocked_ioctl = f2fs_ioctl,
901#ifdef CONFIG_COMPAT 909#ifdef CONFIG_COMPAT
902 .compat_ioctl = f2fs_compat_ioctl, 910 .compat_ioctl = f2fs_compat_ioctl,
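Reviewer note: the new f2fs_dir_open() makes the key requirement explicit at open time: if the encryption info of an encrypted directory cannot be set up, the open fails with -EACCES. f2fs_readdir(), in the same patch, is relaxed to tolerate a missing key so that listing the on-disk (cipher) names still works:

        err = fscrypt_get_encryption_info(inode);
        if (err && err != -ENOKEY)      /* a missing key is acceptable here */
                return err;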
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index ccd5c636d3fe..c859bb044728 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -33,6 +33,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
33 33
34 en->ei = *ei; 34 en->ei = *ei;
35 INIT_LIST_HEAD(&en->list); 35 INIT_LIST_HEAD(&en->list);
36 en->et = et;
36 37
37 rb_link_node(&en->rb_node, parent, p); 38 rb_link_node(&en->rb_node, parent, p);
38 rb_insert_color(&en->rb_node, &et->root); 39 rb_insert_color(&en->rb_node, &et->root);
@@ -50,6 +51,24 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi,
50 51
51 if (et->cached_en == en) 52 if (et->cached_en == en)
52 et->cached_en = NULL; 53 et->cached_en = NULL;
54 kmem_cache_free(extent_node_slab, en);
55}
56
57/*
58 * Flow to release an extent_node:
59 * 1. list_del_init
60 * 2. __detach_extent_node
61 * 3. kmem_cache_free.
62 */
63static void __release_extent_node(struct f2fs_sb_info *sbi,
64 struct extent_tree *et, struct extent_node *en)
65{
66 spin_lock(&sbi->extent_lock);
67 f2fs_bug_on(sbi, list_empty(&en->list));
68 list_del_init(&en->list);
69 spin_unlock(&sbi->extent_lock);
70
71 __detach_extent_node(sbi, et, en);
53} 72}
54 73
55static struct extent_tree *__grab_extent_tree(struct inode *inode) 74static struct extent_tree *__grab_extent_tree(struct inode *inode)
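Reviewer note: kmem_cache_free() moves inside __detach_extent_node(), so detaching a node now always frees it, and the new __release_extent_node() wrapper enforces the documented ordering: delist from the global LRU under sbi->extent_lock first, then detach from the rb-tree and free. The one caller that reaches nodes through the LRU itself, the shrinker below, does its own list_del_init() and therefore calls __detach_extent_node() directly, since __release_extent_node()'s f2fs_bug_on() would trip on an already-delisted node:

        spin_lock(&sbi->extent_lock);
        f2fs_bug_on(sbi, list_empty(&en->list)); /* must still be on the LRU */
        list_del_init(&en->list);
        spin_unlock(&sbi->extent_lock);
        __detach_extent_node(sbi, et, en);       /* rb_erase + kmem_cache_free */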
@@ -129,7 +148,7 @@ static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
129} 148}
130 149
131static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, 150static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
132 struct extent_tree *et, bool free_all) 151 struct extent_tree *et)
133{ 152{
134 struct rb_node *node, *next; 153 struct rb_node *node, *next;
135 struct extent_node *en; 154 struct extent_node *en;
@@ -139,18 +158,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
139 while (node) { 158 while (node) {
140 next = rb_next(node); 159 next = rb_next(node);
141 en = rb_entry(node, struct extent_node, rb_node); 160 en = rb_entry(node, struct extent_node, rb_node);
142 161 __release_extent_node(sbi, et, en);
143 if (free_all) {
144 spin_lock(&sbi->extent_lock);
145 if (!list_empty(&en->list))
146 list_del_init(&en->list);
147 spin_unlock(&sbi->extent_lock);
148 }
149
150 if (free_all || list_empty(&en->list)) {
151 __detach_extent_node(sbi, et, en);
152 kmem_cache_free(extent_node_slab, en);
153 }
154 node = next; 162 node = next;
155 } 163 }
156 164
@@ -232,9 +240,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
232 if (en) { 240 if (en) {
233 *ei = en->ei; 241 *ei = en->ei;
234 spin_lock(&sbi->extent_lock); 242 spin_lock(&sbi->extent_lock);
235 if (!list_empty(&en->list)) 243 if (!list_empty(&en->list)) {
236 list_move_tail(&en->list, &sbi->extent_list); 244 list_move_tail(&en->list, &sbi->extent_list);
237 et->cached_en = en; 245 et->cached_en = en;
246 }
238 spin_unlock(&sbi->extent_lock); 247 spin_unlock(&sbi->extent_lock);
239 ret = true; 248 ret = true;
240 } 249 }
@@ -329,7 +338,6 @@ lookup_neighbors:
329 338
330static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, 339static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
331 struct extent_tree *et, struct extent_info *ei, 340 struct extent_tree *et, struct extent_info *ei,
332 struct extent_node **den,
333 struct extent_node *prev_ex, 341 struct extent_node *prev_ex,
334 struct extent_node *next_ex) 342 struct extent_node *next_ex)
335{ 343{
@@ -342,20 +350,25 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
342 } 350 }
343 351
344 if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { 352 if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
345 if (en) { 353 if (en)
346 __detach_extent_node(sbi, et, prev_ex); 354 __release_extent_node(sbi, et, prev_ex);
347 *den = prev_ex;
348 }
349 next_ex->ei.fofs = ei->fofs; 355 next_ex->ei.fofs = ei->fofs;
350 next_ex->ei.blk = ei->blk; 356 next_ex->ei.blk = ei->blk;
351 next_ex->ei.len += ei->len; 357 next_ex->ei.len += ei->len;
352 en = next_ex; 358 en = next_ex;
353 } 359 }
354 360
355 if (en) { 361 if (!en)
356 __try_update_largest_extent(et, en); 362 return NULL;
363
364 __try_update_largest_extent(et, en);
365
366 spin_lock(&sbi->extent_lock);
367 if (!list_empty(&en->list)) {
368 list_move_tail(&en->list, &sbi->extent_list);
357 et->cached_en = en; 369 et->cached_en = en;
358 } 370 }
371 spin_unlock(&sbi->extent_lock);
359 return en; 372 return en;
360} 373}
361 374
@@ -391,7 +404,12 @@ do_insert:
391 return NULL; 404 return NULL;
392 405
393 __try_update_largest_extent(et, en); 406 __try_update_largest_extent(et, en);
407
408 /* update in global extent list */
409 spin_lock(&sbi->extent_lock);
410 list_add_tail(&en->list, &sbi->extent_list);
394 et->cached_en = en; 411 et->cached_en = en;
412 spin_unlock(&sbi->extent_lock);
395 return en; 413 return en;
396} 414}
397 415
@@ -479,7 +497,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
479 if (parts) 497 if (parts)
480 __try_update_largest_extent(et, en); 498 __try_update_largest_extent(et, en);
481 else 499 else
482 __detach_extent_node(sbi, et, en); 500 __release_extent_node(sbi, et, en);
483 501
484 /* 502 /*
485 * if original extent is split into zero or two parts, extent 503 * if original extent is split into zero or two parts, extent
@@ -490,31 +508,15 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
490 insert_p = NULL; 508 insert_p = NULL;
491 insert_parent = NULL; 509 insert_parent = NULL;
492 } 510 }
493
494 /* update in global extent list */
495 spin_lock(&sbi->extent_lock);
496 if (!parts && !list_empty(&en->list))
497 list_del(&en->list);
498 if (en1)
499 list_add_tail(&en1->list, &sbi->extent_list);
500 spin_unlock(&sbi->extent_lock);
501
502 /* release extent node */
503 if (!parts)
504 kmem_cache_free(extent_node_slab, en);
505
506 en = next_en; 511 en = next_en;
507 } 512 }
508 513
509 /* 3. update extent in extent cache */ 514 /* 3. update extent in extent cache */
510 if (blkaddr) { 515 if (blkaddr) {
511 struct extent_node *den = NULL;
512 516
513 set_extent_info(&ei, fofs, blkaddr, len); 517 set_extent_info(&ei, fofs, blkaddr, len);
514 en1 = __try_merge_extent_node(sbi, et, &ei, &den, 518 if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
515 prev_en, next_en); 519 __insert_extent_tree(sbi, et, &ei,
516 if (!en1)
517 en1 = __insert_extent_tree(sbi, et, &ei,
518 insert_p, insert_parent); 520 insert_p, insert_parent);
519 521
520 /* give up extent_cache, if split and small updates happen */ 522 /* give up extent_cache, if split and small updates happen */
@@ -524,24 +526,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
524 et->largest.len = 0; 526 et->largest.len = 0;
525 set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); 527 set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
526 } 528 }
527
528 spin_lock(&sbi->extent_lock);
529 if (en1) {
530 if (list_empty(&en1->list))
531 list_add_tail(&en1->list, &sbi->extent_list);
532 else
533 list_move_tail(&en1->list, &sbi->extent_list);
534 }
535 if (den && !list_empty(&den->list))
536 list_del(&den->list);
537 spin_unlock(&sbi->extent_lock);
538
539 if (den)
540 kmem_cache_free(extent_node_slab, den);
541 } 529 }
542 530
543 if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) 531 if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
544 __free_extent_tree(sbi, et, true); 532 __free_extent_tree(sbi, et);
545 533
546 write_unlock(&et->lock); 534 write_unlock(&et->lock);
547 535
@@ -550,14 +538,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
550 538
551unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) 539unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
552{ 540{
553 struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
554 struct extent_tree *et, *next; 541 struct extent_tree *et, *next;
555 struct extent_node *en, *tmp; 542 struct extent_node *en;
556 unsigned long ino = F2FS_ROOT_INO(sbi);
557 unsigned int found;
558 unsigned int node_cnt = 0, tree_cnt = 0; 543 unsigned int node_cnt = 0, tree_cnt = 0;
559 int remained; 544 int remained;
560 bool do_free = false;
561 545
562 if (!test_opt(sbi, EXTENT_CACHE)) 546 if (!test_opt(sbi, EXTENT_CACHE))
563 return 0; 547 return 0;
@@ -572,10 +556,10 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
572 list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { 556 list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
573 if (atomic_read(&et->node_cnt)) { 557 if (atomic_read(&et->node_cnt)) {
574 write_lock(&et->lock); 558 write_lock(&et->lock);
575 node_cnt += __free_extent_tree(sbi, et, true); 559 node_cnt += __free_extent_tree(sbi, et);
576 write_unlock(&et->lock); 560 write_unlock(&et->lock);
577 } 561 }
578 562 f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
579 list_del_init(&et->list); 563 list_del_init(&et->list);
580 radix_tree_delete(&sbi->extent_tree_root, et->ino); 564 radix_tree_delete(&sbi->extent_tree_root, et->ino);
581 kmem_cache_free(extent_tree_slab, et); 565 kmem_cache_free(extent_tree_slab, et);
@@ -585,6 +569,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
585 569
586 if (node_cnt + tree_cnt >= nr_shrink) 570 if (node_cnt + tree_cnt >= nr_shrink)
587 goto unlock_out; 571 goto unlock_out;
572 cond_resched();
588 } 573 }
589 up_write(&sbi->extent_tree_lock); 574 up_write(&sbi->extent_tree_lock);
590 575
@@ -596,42 +581,29 @@ free_node:
596 remained = nr_shrink - (node_cnt + tree_cnt); 581 remained = nr_shrink - (node_cnt + tree_cnt);
597 582
598 spin_lock(&sbi->extent_lock); 583 spin_lock(&sbi->extent_lock);
599 list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { 584 for (; remained > 0; remained--) {
600 if (!remained--) 585 if (list_empty(&sbi->extent_list))
601 break; 586 break;
602 list_del_init(&en->list); 587 en = list_first_entry(&sbi->extent_list,
603 do_free = true; 588 struct extent_node, list);
604 } 589 et = en->et;
605 spin_unlock(&sbi->extent_lock); 590 if (!write_trylock(&et->lock)) {
606 591 /* refresh this extent node's position in extent list */
607 if (do_free == false) 592 list_move_tail(&en->list, &sbi->extent_list);
608 goto unlock_out; 593 continue;
609 594 }
610 /*
611 * reset ino for searching victims from beginning of global extent tree.
612 */
613 ino = F2FS_ROOT_INO(sbi);
614
615 while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
616 (void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
617 unsigned i;
618
619 ino = treevec[found - 1]->ino + 1;
620 for (i = 0; i < found; i++) {
621 struct extent_tree *et = treevec[i];
622 595
623 if (!atomic_read(&et->node_cnt)) 596 list_del_init(&en->list);
624 continue; 597 spin_unlock(&sbi->extent_lock);
625 598
626 if (write_trylock(&et->lock)) { 599 __detach_extent_node(sbi, et, en);
627 node_cnt += __free_extent_tree(sbi, et, false);
628 write_unlock(&et->lock);
629 }
630 600
631 if (node_cnt + tree_cnt >= nr_shrink) 601 write_unlock(&et->lock);
632 goto unlock_out; 602 node_cnt++;
633 } 603 spin_lock(&sbi->extent_lock);
634 } 604 }
605 spin_unlock(&sbi->extent_lock);
606
635unlock_out: 607unlock_out:
636 up_write(&sbi->extent_tree_lock); 608 up_write(&sbi->extent_tree_lock);
637out: 609out:
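Reviewer note: the shrinker loses the radix-tree gang lookup and its treevec scan entirely. It now consumes the global LRU oldest-first, using the en->et back-pointer added to struct extent_node (see the f2fs.h hunk below) to reach the owning tree, and write_trylock() so it never sleeps on a busy tree; busy victims are rotated to the tail instead. The new-side loop, reconstructed:

        spin_lock(&sbi->extent_lock);
        for (; remained > 0; remained--) {
                if (list_empty(&sbi->extent_list))
                        break;
                en = list_first_entry(&sbi->extent_list,
                                        struct extent_node, list);
                et = en->et;                    /* back-pointer to owning tree */
                if (!write_trylock(&et->lock)) {
                        /* tree busy: rotate the node, try the next victim */
                        list_move_tail(&en->list, &sbi->extent_list);
                        continue;
                }

                list_del_init(&en->list);
                spin_unlock(&sbi->extent_lock);

                __detach_extent_node(sbi, et, en);      /* also frees en */

                write_unlock(&et->lock);
                node_cnt++;
                spin_lock(&sbi->extent_lock);
        }
        spin_unlock(&sbi->extent_lock);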
@@ -650,7 +622,7 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
650 return 0; 622 return 0;
651 623
652 write_lock(&et->lock); 624 write_lock(&et->lock);
653 node_cnt = __free_extent_tree(sbi, et, true); 625 node_cnt = __free_extent_tree(sbi, et);
654 write_unlock(&et->lock); 626 write_unlock(&et->lock);
655 627
656 return node_cnt; 628 return node_cnt;
@@ -701,19 +673,21 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
701 673
702void f2fs_update_extent_cache(struct dnode_of_data *dn) 674void f2fs_update_extent_cache(struct dnode_of_data *dn)
703{ 675{
704 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
705 pgoff_t fofs; 676 pgoff_t fofs;
677 block_t blkaddr;
706 678
707 if (!f2fs_may_extent_tree(dn->inode)) 679 if (!f2fs_may_extent_tree(dn->inode))
708 return; 680 return;
709 681
710 f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); 682 if (dn->data_blkaddr == NEW_ADDR)
711 683 blkaddr = NULL_ADDR;
684 else
685 blkaddr = dn->data_blkaddr;
712 686
713 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 687 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
714 dn->ofs_in_node; 688 dn->ofs_in_node;
715 689
716 if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1)) 690 if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1))
717 sync_inode_page(dn); 691 sync_inode_page(dn);
718} 692}
719 693
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ff79054c6cf6..bbe2cd1265d0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,10 +22,11 @@
22#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
23#include <linux/bio.h> 23#include <linux/bio.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/fscrypto.h>
26#include <crypto/hash.h>
25 27
26#ifdef CONFIG_F2FS_CHECK_FS 28#ifdef CONFIG_F2FS_CHECK_FS
27#define f2fs_bug_on(sbi, condition) BUG_ON(condition) 29#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
28#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
29#else 30#else
30#define f2fs_bug_on(sbi, condition) \ 31#define f2fs_bug_on(sbi, condition) \
31 do { \ 32 do { \
@@ -34,7 +35,6 @@
34 set_sbi_flag(sbi, SBI_NEED_FSCK); \ 35 set_sbi_flag(sbi, SBI_NEED_FSCK); \
35 } \ 36 } \
36 } while (0) 37 } while (0)
37#define f2fs_down_write(x, y) down_write(x)
38#endif 38#endif
39 39
40/* 40/*
@@ -84,27 +84,6 @@ struct f2fs_mount_info {
84#define F2FS_CLEAR_FEATURE(sb, mask) \ 84#define F2FS_CLEAR_FEATURE(sb, mask) \
85 F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) 85 F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)
86 86
87#define CRCPOLY_LE 0xedb88320
88
89static inline __u32 f2fs_crc32(void *buf, size_t len)
90{
91 unsigned char *p = (unsigned char *)buf;
92 __u32 crc = F2FS_SUPER_MAGIC;
93 int i;
94
95 while (len--) {
96 crc ^= *p++;
97 for (i = 0; i < 8; i++)
98 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
99 }
100 return crc;
101}
102
103static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
104{
105 return f2fs_crc32(buf, buf_size) == blk_crc;
106}
107
108/* 87/*
109 * For checkpoint manager 88 * For checkpoint manager
110 */ 89 */
@@ -183,37 +162,37 @@ struct fsync_inode_entry {
183 block_t last_inode; /* block address locating the last inode */ 162 block_t last_inode; /* block address locating the last inode */
184}; 163};
185 164
186#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) 165#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats))
187#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits)) 166#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits))
188 167
189#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne) 168#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne)
190#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid) 169#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid)
191#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) 170#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se)
192#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) 171#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno)
193 172
194#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) 173#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
195#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) 174#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
196 175
197static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) 176static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i)
198{ 177{
199 int before = nats_in_cursum(rs); 178 int before = nats_in_cursum(journal);
200 rs->n_nats = cpu_to_le16(before + i); 179 journal->n_nats = cpu_to_le16(before + i);
201 return before; 180 return before;
202} 181}
203 182
204static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) 183static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i)
205{ 184{
206 int before = sits_in_cursum(rs); 185 int before = sits_in_cursum(journal);
207 rs->n_sits = cpu_to_le16(before + i); 186 journal->n_sits = cpu_to_le16(before + i);
208 return before; 187 return before;
209} 188}
210 189
211static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, 190static inline bool __has_cursum_space(struct f2fs_journal *journal,
212 int type) 191 int size, int type)
213{ 192{
214 if (type == NAT_JOURNAL) 193 if (type == NAT_JOURNAL)
215 return size <= MAX_NAT_JENTRIES(sum); 194 return size <= MAX_NAT_JENTRIES(journal);
216 return size <= MAX_SIT_JENTRIES(sum); 195 return size <= MAX_SIT_JENTRIES(journal);
217} 196}
218 197
219/* 198/*
@@ -233,12 +212,9 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
233#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) 212#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
234#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) 213#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
235 214
236#define F2FS_IOC_SET_ENCRYPTION_POLICY \ 215#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
237 _IOR('f', 19, struct f2fs_encryption_policy) 216#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
238#define F2FS_IOC_GET_ENCRYPTION_PWSALT \ 217#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
239 _IOW('f', 20, __u8[16])
240#define F2FS_IOC_GET_ENCRYPTION_POLICY \
241 _IOW('f', 21, struct f2fs_encryption_policy)
242 218
243/* 219/*
244 * should be same as XFS_IOC_GOINGDOWN. 220 * should be same as XFS_IOC_GOINGDOWN.
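Reviewer note: the f2fs-private encryption ioctls (and their struct f2fs_encryption_policy payload) become aliases for the common FS_IOC_* definitions, so a single userspace tool can drive ext4 and f2fs encryption alike; since the old numbers already used 'f'/19..21, the binary ABI should be unchanged. Hypothetical userspace usage after this change (the shared policy struct name is an assumption, it is not shown in this diff):

        struct fscrypt_policy policy = { /* ... */ };

        if (ioctl(fd, FS_IOC_SET_ENCRYPTION_POLICY, &policy) < 0)
                perror("set encryption policy");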
@@ -268,25 +244,6 @@ struct f2fs_defragment {
268 * For INODE and NODE manager 244 * For INODE and NODE manager
269 */ 245 */
270/* for directory operations */ 246/* for directory operations */
271struct f2fs_str {
272 unsigned char *name;
273 u32 len;
274};
275
276struct f2fs_filename {
277 const struct qstr *usr_fname;
278 struct f2fs_str disk_name;
279 f2fs_hash_t hash;
280#ifdef CONFIG_F2FS_FS_ENCRYPTION
281 struct f2fs_str crypto_buf;
282#endif
283};
284
285#define FSTR_INIT(n, l) { .name = n, .len = l }
286#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len)
287#define fname_name(p) ((p)->disk_name.name)
288#define fname_len(p) ((p)->disk_name.len)
289
290struct f2fs_dentry_ptr { 247struct f2fs_dentry_ptr {
291 struct inode *inode; 248 struct inode *inode;
292 const void *bitmap; 249 const void *bitmap;
@@ -354,6 +311,7 @@ struct extent_node {
354 struct rb_node rb_node; /* rb node located in rb-tree */ 311 struct rb_node rb_node; /* rb node located in rb-tree */
355 struct list_head list; /* node in global extent list of sbi */ 312 struct list_head list; /* node in global extent list of sbi */
356 struct extent_info ei; /* extent info */ 313 struct extent_info ei; /* extent info */
314 struct extent_tree *et; /* extent tree pointer */
357}; 315};
358 316
359struct extent_tree { 317struct extent_tree {
@@ -382,6 +340,7 @@ struct f2fs_map_blocks {
382 block_t m_lblk; 340 block_t m_lblk;
383 unsigned int m_len; 341 unsigned int m_len;
384 unsigned int m_flags; 342 unsigned int m_flags;
343 pgoff_t *m_next_pgofs; /* points to the next possible non-hole pgofs */
385}; 344};
386 345
387/* for flag in get_data_block */ 346/* for flag in get_data_block */
@@ -389,6 +348,8 @@ struct f2fs_map_blocks {
389#define F2FS_GET_BLOCK_DIO 1 348#define F2FS_GET_BLOCK_DIO 1
390#define F2FS_GET_BLOCK_FIEMAP 2 349#define F2FS_GET_BLOCK_FIEMAP 2
391#define F2FS_GET_BLOCK_BMAP 3 350#define F2FS_GET_BLOCK_BMAP 3
351#define F2FS_GET_BLOCK_PRE_DIO 4
352#define F2FS_GET_BLOCK_PRE_AIO 5
392 353
393/* 354/*
394 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. 355 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
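Reviewer note: two additions to the block-mapping interface travel together here. m_next_pgofs lets f2fs_map_blocks() hand back the next candidate non-hole offset when a lookup lands in a hole, so extent walkers such as fiemap can jump ahead instead of probing block by block (the start_blk tweak at the very top of this section appears to build on this). The PRE_DIO/PRE_AIO flags distinguish preallocation requests from plain reads. A hole-aware lookup might look like this sketch (local names; treating an empty m_flags as a hole is an assumption):

        pgoff_t next_pgofs;
        struct f2fs_map_blocks map = {
                .m_lblk = start_blk,
                .m_len = len,
                .m_next_pgofs = &next_pgofs,    /* filled when a hole is hit */
        };

        err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
        if (!err && !map.m_flags)
                start_blk = next_pgofs;         /* skip over the hole */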
@@ -410,15 +371,6 @@ struct f2fs_map_blocks {
410#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) 371#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT)
411#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) 372#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
412 373
413/* Encryption algorithms */
414#define F2FS_ENCRYPTION_MODE_INVALID 0
415#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1
416#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2
417#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3
418#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4
419
420#include "f2fs_crypto.h"
421
422#define DEF_DIR_LEVEL 0 374#define DEF_DIR_LEVEL 0
423 375
424struct f2fs_inode_info { 376struct f2fs_inode_info {
@@ -442,13 +394,7 @@ struct f2fs_inode_info {
442 struct list_head dirty_list; /* linked in global dirty list */ 394 struct list_head dirty_list; /* linked in global dirty list */
443 struct list_head inmem_pages; /* inmemory pages managed by f2fs */ 395 struct list_head inmem_pages; /* inmemory pages managed by f2fs */
444 struct mutex inmem_lock; /* lock for inmemory pages */ 396 struct mutex inmem_lock; /* lock for inmemory pages */
445
446 struct extent_tree *extent_tree; /* cached extent_tree entry */ 397 struct extent_tree *extent_tree; /* cached extent_tree entry */
447
448#ifdef CONFIG_F2FS_FS_ENCRYPTION
449 /* Encryption params */
450 struct f2fs_crypt_info *i_crypt_info;
451#endif
452}; 398};
453 399
454static inline void get_extent_info(struct extent_info *ext, 400static inline void get_extent_info(struct extent_info *ext,
@@ -515,6 +461,7 @@ struct f2fs_nm_info {
515 nid_t next_scan_nid; /* the next nid to be scanned */ 461 nid_t next_scan_nid; /* the next nid to be scanned */
516 unsigned int ram_thresh; /* control the memory footprint */ 462 unsigned int ram_thresh; /* control the memory footprint */
517 unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ 463 unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
464 unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */
518 465
519 /* NAT cache management */ 466 /* NAT cache management */
520 struct radix_tree_root nat_root;/* root of the nat entry cache */ 467 struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -549,6 +496,8 @@ struct dnode_of_data {
549 unsigned int ofs_in_node; /* data offset in the node page */ 496 unsigned int ofs_in_node; /* data offset in the node page */
550 bool inode_page_locked; /* inode page is locked or not */ 497 bool inode_page_locked; /* inode page is locked or not */
551 bool node_changed; /* is node block changed */ 498 bool node_changed; /* is node block changed */
499 char cur_level; /* level of hole node page */
500 char max_level; /* level of current page located */
552 block_t data_blkaddr; /* block address of the node block */ 501 block_t data_blkaddr; /* block address of the node block */
553}; 502};
554 503
@@ -679,6 +628,7 @@ enum page_type {
679 META_FLUSH, 628 META_FLUSH,
680 INMEM, /* the below types are used by tracepoints only. */ 629 INMEM, /* the below types are used by tracepoints only. */
681 INMEM_DROP, 630 INMEM_DROP,
631 INMEM_REVOKE,
682 IPU, 632 IPU,
683 OPU, 633 OPU,
684}; 634};
@@ -687,7 +637,8 @@ struct f2fs_io_info {
687 struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ 637 struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
688 enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ 638 enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
689 int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ 639 int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
690 block_t blk_addr; /* block address to be written */ 640 block_t new_blkaddr; /* new block address to be written */
641 block_t old_blkaddr; /* old block address before CoW */
691 struct page *page; /* page to be written */ 642 struct page *page; /* page to be written */
692 struct page *encrypted_page; /* encrypted page */ 643 struct page *encrypted_page; /* encrypted page */
693}; 644};
@@ -844,8 +795,22 @@ struct f2fs_sb_info {
844 struct list_head s_list; 795 struct list_head s_list;
845 struct mutex umount_mutex; 796 struct mutex umount_mutex;
846 unsigned int shrinker_run_no; 797 unsigned int shrinker_run_no;
798
799 /* For write statistics */
800 u64 sectors_written_start;
801 u64 kbytes_written;
802
803 /* Reference to checksum algorithm driver via cryptoapi */
804 struct crypto_shash *s_chksum_driver;
847}; 805};
848 806
807/* For write statistics: assuming a sector size of 512 bytes,
808 * the return value is in kilobytes. s is a struct f2fs_sb_info pointer.
809 */
810#define BD_PART_WRITTEN(s) \
811(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \
812 s->sectors_written_start) >> 1)
813
849static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) 814static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
850{ 815{
851 sbi->last_time[type] = jiffies; 816 sbi->last_time[type] = jiffies;
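Reviewer note: the write-statistics fields let f2fs report how much was written to the underlying device. sectors_written_start is presumably sampled from the partition counters at mount time (that side is not in this diff), and BD_PART_WRITTEN() converts the delta to kilobytes, with >> 1 encoding the 512-byte-sector assumption the comment states. Sketch of both ends:

        /* at mount, hypothetically: remember where the device counter started */
        sbi->sectors_written_start =
                (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);

        /* later: kilobytes written to the device during this mount */
        u64 written_kb = BD_PART_WRITTEN(sbi);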
@@ -874,6 +839,29 @@ static inline bool is_idle(struct f2fs_sb_info *sbi)
874/* 839/*
875 * Inline functions 840 * Inline functions
876 */ 841 */
842static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
843 unsigned int length)
844{
845 SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver);
846 u32 *ctx = (u32 *)shash_desc_ctx(shash);
847 int err;
848
849 shash->tfm = sbi->s_chksum_driver;
850 shash->flags = 0;
851 *ctx = F2FS_SUPER_MAGIC;
852
853 err = crypto_shash_update(shash, address, length);
854 BUG_ON(err);
855
856 return *ctx;
857}
858
859static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
860 void *buf, size_t buf_size)
861{
862 return f2fs_crc32(sbi, buf, buf_size) == blk_crc;
863}
864
877static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) 865static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
878{ 866{
879 return container_of(inode, struct f2fs_inode_info, vfs_inode); 867 return container_of(inode, struct f2fs_inode_info, vfs_inode);
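Reviewer note: this is the replacement for the open-coded, bit-by-bit CRC32 deleted from the top of f2fs.h earlier in this diff. The checksum now goes through the kernel crypto API's shash interface, with the tfm (s_chksum_driver, added to f2fs_sb_info above) allocated once per superblock; seeding the descriptor context with F2FS_SUPER_MAGIC keeps results compatible with the old helper. Allocation sketch (the "crc32" algorithm name is an assumption, the fill_super side is not shown here):

        /* at fill_super time, hypothetically: */
        sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
        if (IS_ERR(sbi->s_chksum_driver))
                return PTR_ERR(sbi->s_chksum_driver);

        /* verification keeps its old shape: */
        if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
                return -EINVAL;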
@@ -1006,7 +994,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
1006 994
1007static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) 995static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
1008{ 996{
1009 f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); 997 down_write(&sbi->cp_rwsem);
1010} 998}
1011 999
1012static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) 1000static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
@@ -1525,9 +1513,9 @@ static inline int f2fs_has_inline_xattr(struct inode *inode)
1525 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); 1513 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
1526} 1514}
1527 1515
1528static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) 1516static inline unsigned int addrs_per_inode(struct inode *inode)
1529{ 1517{
1530 if (f2fs_has_inline_xattr(&fi->vfs_inode)) 1518 if (f2fs_has_inline_xattr(inode))
1531 return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; 1519 return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
1532 return DEF_ADDRS_PER_INODE; 1520 return DEF_ADDRS_PER_INODE;
1533} 1521}
@@ -1681,10 +1669,10 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
1681 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) 1669 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
1682 1670
1683/* get offset of first page in next direct node */ 1671/* get offset of first page in next direct node */
1684#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ 1672#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \
1685 ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \ 1673 ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \
1686 (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ 1674 (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \
1687 ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) 1675 ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode))
1688 1676
1689/* 1677/*
1690 * file.c 1678 * file.c
@@ -1723,10 +1711,10 @@ struct dentry *f2fs_get_parent(struct dentry *child);
1723extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; 1711extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
1724void set_de_type(struct f2fs_dir_entry *, umode_t); 1712void set_de_type(struct f2fs_dir_entry *, umode_t);
1725 1713
1726struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *, 1714struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
1727 f2fs_hash_t, int *, struct f2fs_dentry_ptr *); 1715 f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
1728bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, 1716bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
1729 unsigned int, struct f2fs_str *); 1717 unsigned int, struct fscrypt_str *);
1730void do_make_empty_dir(struct inode *, struct inode *, 1718void do_make_empty_dir(struct inode *, struct inode *,
1731 struct f2fs_dentry_ptr *); 1719 struct f2fs_dentry_ptr *);
1732struct page *init_inode_metadata(struct inode *, struct inode *, 1720struct page *init_inode_metadata(struct inode *, struct inode *,
@@ -1763,6 +1751,7 @@ int f2fs_commit_super(struct f2fs_sb_info *, bool);
1763int f2fs_sync_fs(struct super_block *, int); 1751int f2fs_sync_fs(struct super_block *, int);
1764extern __printf(3, 4) 1752extern __printf(3, 4)
1765void f2fs_msg(struct super_block *, const char *, const char *, ...); 1753void f2fs_msg(struct super_block *, const char *, const char *, ...);
1754int sanity_check_ckpt(struct f2fs_sb_info *sbi);
1766 1755
1767/* 1756/*
1768 * hash.c 1757 * hash.c
@@ -1780,6 +1769,7 @@ int need_dentry_mark(struct f2fs_sb_info *, nid_t);
1780bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); 1769bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
1781bool need_inode_block_update(struct f2fs_sb_info *, nid_t); 1770bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
1782void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); 1771void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
1772pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t);
1783int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); 1773int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
1784int truncate_inode_blocks(struct inode *, pgoff_t); 1774int truncate_inode_blocks(struct inode *, pgoff_t);
1785int truncate_xattr_node(struct inode *, struct page *); 1775int truncate_xattr_node(struct inode *, struct page *);
@@ -1811,7 +1801,8 @@ void destroy_node_manager_caches(void);
1811 * segment.c 1801 * segment.c
1812 */ 1802 */
1813void register_inmem_page(struct inode *, struct page *); 1803void register_inmem_page(struct inode *, struct page *);
1814int commit_inmem_pages(struct inode *, bool); 1804void drop_inmem_pages(struct inode *);
1805int commit_inmem_pages(struct inode *);
1815void f2fs_balance_fs(struct f2fs_sb_info *, bool); 1806void f2fs_balance_fs(struct f2fs_sb_info *, bool);
1816void f2fs_balance_fs_bg(struct f2fs_sb_info *); 1807void f2fs_balance_fs_bg(struct f2fs_sb_info *);
1817int f2fs_issue_flush(struct f2fs_sb_info *); 1808int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1832,16 +1823,17 @@ void write_meta_page(struct f2fs_sb_info *, struct page *);
1832void write_node_page(unsigned int, struct f2fs_io_info *); 1823void write_node_page(unsigned int, struct f2fs_io_info *);
1833void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); 1824void write_data_page(struct dnode_of_data *, struct f2fs_io_info *);
1834void rewrite_data_page(struct f2fs_io_info *); 1825void rewrite_data_page(struct f2fs_io_info *);
1826void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *,
1827 block_t, block_t, bool, bool);
1835void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, 1828void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
1836 block_t, block_t, unsigned char, bool); 1829 block_t, block_t, unsigned char, bool, bool);
1837void allocate_data_block(struct f2fs_sb_info *, struct page *, 1830void allocate_data_block(struct f2fs_sb_info *, struct page *,
1838 block_t, block_t *, struct f2fs_summary *, int); 1831 block_t, block_t *, struct f2fs_summary *, int);
1839void f2fs_wait_on_page_writeback(struct page *, enum page_type); 1832void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
1840void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); 1833void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t);
1841void write_data_summaries(struct f2fs_sb_info *, block_t); 1834void write_data_summaries(struct f2fs_sb_info *, block_t);
1842void write_node_summaries(struct f2fs_sb_info *, block_t); 1835void write_node_summaries(struct f2fs_sb_info *, block_t);
1843int lookup_journal_in_cursum(struct f2fs_summary_block *, 1836int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int);
1844 int, unsigned int, int);
1845void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); 1837void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
1846int build_segment_manager(struct f2fs_sb_info *); 1838int build_segment_manager(struct f2fs_sb_info *);
1847void destroy_segment_manager(struct f2fs_sb_info *); 1839void destroy_segment_manager(struct f2fs_sb_info *);
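Reviewer note: f2fs_wait_on_page_writeback() gains a third parameter across this whole diff. Every metadata-update site (dentry blocks, inode pages) passes true, while the write_begin path passes false; the flag evidently selects whether the caller must block until in-flight writeback has fully completed, with the exact semantics living in segment.c, outside this diff. New prototype for reference (the parameter name is a guess):

        void f2fs_wait_on_page_writeback(struct page *, enum page_type,
                                        bool ordered /* name assumed */);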
@@ -1881,11 +1873,16 @@ void destroy_checkpoint_caches(void);
1881 * data.c 1873 * data.c
1882 */ 1874 */
1883void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); 1875void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
1876void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
1877 struct page *, nid_t, enum page_type, int);
1878void f2fs_flush_merged_bios(struct f2fs_sb_info *);
1884int f2fs_submit_page_bio(struct f2fs_io_info *); 1879int f2fs_submit_page_bio(struct f2fs_io_info *);
1885void f2fs_submit_page_mbio(struct f2fs_io_info *); 1880void f2fs_submit_page_mbio(struct f2fs_io_info *);
1886void set_data_blkaddr(struct dnode_of_data *); 1881void set_data_blkaddr(struct dnode_of_data *);
1882void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
1887int reserve_new_block(struct dnode_of_data *); 1883int reserve_new_block(struct dnode_of_data *);
1888int f2fs_get_block(struct dnode_of_data *, pgoff_t); 1884int f2fs_get_block(struct dnode_of_data *, pgoff_t);
1885ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
1889int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); 1886int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
1890struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); 1887struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
1891struct page *find_data_page(struct inode *, pgoff_t); 1888struct page *find_data_page(struct inode *, pgoff_t);
@@ -1902,7 +1899,7 @@ int f2fs_release_page(struct page *, gfp_t);
1902 */ 1899 */
1903int start_gc_thread(struct f2fs_sb_info *); 1900int start_gc_thread(struct f2fs_sb_info *);
1904void stop_gc_thread(struct f2fs_sb_info *); 1901void stop_gc_thread(struct f2fs_sb_info *);
1905block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); 1902block_t start_bidx_of_node(unsigned int, struct inode *);
1906int f2fs_gc(struct f2fs_sb_info *, bool); 1903int f2fs_gc(struct f2fs_sb_info *, bool);
1907void build_gc_manager(struct f2fs_sb_info *); 1904void build_gc_manager(struct f2fs_sb_info *);
1908 1905
@@ -2093,7 +2090,7 @@ int f2fs_convert_inline_inode(struct inode *);
2093int f2fs_write_inline_data(struct inode *, struct page *); 2090int f2fs_write_inline_data(struct inode *, struct page *);
2094bool recover_inline_data(struct inode *, struct page *); 2091bool recover_inline_data(struct inode *, struct page *);
2095struct f2fs_dir_entry *find_in_inline_dir(struct inode *, 2092struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
2096 struct f2fs_filename *, struct page **); 2093 struct fscrypt_name *, struct page **);
2097struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); 2094struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
2098int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); 2095int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
2099int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, 2096int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
@@ -2102,7 +2099,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
2102 struct inode *, struct inode *); 2099 struct inode *, struct inode *);
2103bool f2fs_empty_inline_dir(struct inode *); 2100bool f2fs_empty_inline_dir(struct inode *);
2104int f2fs_read_inline_dir(struct file *, struct dir_context *, 2101int f2fs_read_inline_dir(struct file *, struct dir_context *,
2105 struct f2fs_str *); 2102 struct fscrypt_str *);
2106int f2fs_inline_data_fiemap(struct inode *, 2103int f2fs_inline_data_fiemap(struct inode *,
2107 struct fiemap_extent_info *, __u64, __u64); 2104 struct fiemap_extent_info *, __u64, __u64);
2108 2105
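find_in_inline_dir() and f2fs_read_inline_dir() now take the shared fscrypt types in place of the f2fs-private f2fs_filename/f2fs_str. For orientation, the shared structures introduced by this series look roughly like this (shape shown only to help read the prototypes):

struct fscrypt_str {
	unsigned char *name;	/* on-disk (possibly encrypted) bytes */
	u32 len;
};

struct fscrypt_name {
	const struct qstr *usr_fname;	/* name as the user passed it */
	struct fscrypt_str disk_name;	/* name as stored in the directory */
	u32 hash;
	u32 minor_hash;
	struct fscrypt_str crypto_buf;	/* scratch buffer for en/decryption */
};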
@@ -2132,13 +2129,9 @@ void destroy_extent_cache(void);
2132/* 2129/*
2133 * crypto support 2130 * crypto support
2134 */ 2131 */
2135static inline int f2fs_encrypted_inode(struct inode *inode) 2132static inline bool f2fs_encrypted_inode(struct inode *inode)
2136{ 2133{
2137#ifdef CONFIG_F2FS_FS_ENCRYPTION
2138 return file_is_encrypt(inode); 2134 return file_is_encrypt(inode);
2139#else
2140 return 0;
2141#endif
2142} 2135}
2143 2136
2144static inline void f2fs_set_encrypted_inode(struct inode *inode) 2137static inline void f2fs_set_encrypted_inode(struct inode *inode)
@@ -2150,20 +2143,12 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
2150 2143
2151static inline bool f2fs_bio_encrypted(struct bio *bio) 2144static inline bool f2fs_bio_encrypted(struct bio *bio)
2152{ 2145{
2153#ifdef CONFIG_F2FS_FS_ENCRYPTION 2146 return bio->bi_private != NULL;
2154 return unlikely(bio->bi_private != NULL);
2155#else
2156 return false;
2157#endif
2158} 2147}
2159 2148
2160static inline int f2fs_sb_has_crypto(struct super_block *sb) 2149static inline int f2fs_sb_has_crypto(struct super_block *sb)
2161{ 2150{
2162#ifdef CONFIG_F2FS_FS_ENCRYPTION
2163 return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); 2151 return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
2164#else
2165 return 0;
2166#endif
2167} 2152}
2168 2153
2169static inline bool f2fs_may_encrypt(struct inode *inode) 2154static inline bool f2fs_may_encrypt(struct inode *inode)
@@ -2177,86 +2162,28 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
2177#endif 2162#endif
2178} 2163}
2179 2164
2180/* crypto_policy.c */ 2165#ifndef CONFIG_F2FS_FS_ENCRYPTION
2181int f2fs_is_child_context_consistent_with_parent(struct inode *, 2166#define fscrypt_set_d_op(i)
2182 struct inode *); 2167#define fscrypt_get_ctx fscrypt_notsupp_get_ctx
2183int f2fs_inherit_context(struct inode *, struct inode *, struct page *); 2168#define fscrypt_release_ctx fscrypt_notsupp_release_ctx
2184int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *); 2169#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page
2185int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *); 2170#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page
2186 2171#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages
2187/* crypt.c */ 2172#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
2188extern struct kmem_cache *f2fs_crypt_info_cachep; 2173#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
2189bool f2fs_valid_contents_enc_mode(uint32_t); 2174#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
2190uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t); 2175#define fscrypt_process_policy fscrypt_notsupp_process_policy
2191struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *); 2176#define fscrypt_get_policy fscrypt_notsupp_get_policy
2192void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *); 2177#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
2193struct page *f2fs_encrypt(struct inode *, struct page *); 2178#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
2194int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *); 2179#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
2195int f2fs_decrypt_one(struct inode *, struct page *); 2180#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info
2196void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); 2181#define fscrypt_setup_filename fscrypt_notsupp_setup_filename
2197 2182#define fscrypt_free_filename fscrypt_notsupp_free_filename
2198/* crypto_key.c */ 2183#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size
2199void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); 2184#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer
2200int _f2fs_get_encryption_info(struct inode *inode); 2185#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer
2201 2186#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr
2202/* crypto_fname.c */ 2187#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk
2203bool f2fs_valid_filenames_enc_mode(uint32_t);
2204u32 f2fs_fname_crypto_round_up(u32, u32);
2205int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *);
2206int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *,
2207 const struct f2fs_str *, struct f2fs_str *);
2208int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *,
2209 struct f2fs_str *);
2210
2211#ifdef CONFIG_F2FS_FS_ENCRYPTION
2212void f2fs_restore_and_release_control_page(struct page **);
2213void f2fs_restore_control_page(struct page *);
2214
2215int __init f2fs_init_crypto(void);
2216int f2fs_crypto_initialize(void);
2217void f2fs_exit_crypto(void);
2218
2219int f2fs_has_encryption_key(struct inode *);
2220
2221static inline int f2fs_get_encryption_info(struct inode *inode)
2222{
2223 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
2224
2225 if (!ci ||
2226 (ci->ci_keyring_key &&
2227 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
2228 (1 << KEY_FLAG_REVOKED) |
2229 (1 << KEY_FLAG_DEAD)))))
2230 return _f2fs_get_encryption_info(inode);
2231 return 0;
2232}
2233
2234void f2fs_fname_crypto_free_buffer(struct f2fs_str *);
2235int f2fs_fname_setup_filename(struct inode *, const struct qstr *,
2236 int lookup, struct f2fs_filename *);
2237void f2fs_fname_free_filename(struct f2fs_filename *);
2238#else
2239static inline void f2fs_restore_and_release_control_page(struct page **p) { }
2240static inline void f2fs_restore_control_page(struct page *p) { }
2241
2242static inline int __init f2fs_init_crypto(void) { return 0; }
2243static inline void f2fs_exit_crypto(void) { }
2244
2245static inline int f2fs_has_encryption_key(struct inode *i) { return 0; }
2246static inline int f2fs_get_encryption_info(struct inode *i) { return 0; }
2247static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { }
2248
2249static inline int f2fs_fname_setup_filename(struct inode *dir,
2250 const struct qstr *iname,
2251 int lookup, struct f2fs_filename *fname)
2252{
2253 memset(fname, 0, sizeof(struct f2fs_filename));
2254 fname->usr_fname = iname;
2255 fname->disk_name.name = (unsigned char *)iname->name;
2256 fname->disk_name.len = iname->len;
2257 return 0;
2258}
2259
2260static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { }
2261#endif 2188#endif
2262#endif 2189#endif
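The macro table above is the heart of the conversion on the !CONFIG_F2FS_FS_ENCRYPTION side: instead of one #ifdef stub per helper, every fscrypt_* name is aliased to a *_notsupp stub that fails cleanly, and call sites stay #ifdef-free. A self-contained illustration of the pattern (all names hypothetical):

#include <errno.h>
#include <stdio.h>

struct inode;

/* The compiled-out variant: same signature, clean failure. */
static inline int feature_do_work_notsupp(struct inode *inode)
{
	(void)inode;
	return -EOPNOTSUPP;
}

#ifndef CONFIG_FEATURE
#define feature_do_work feature_do_work_notsupp
#endif

int main(void)
{
	/* callers compile unchanged whether the feature is built or not */
	printf("feature_do_work() -> %d\n", feature_do_work(NULL));
	return 0;
}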
diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h
deleted file mode 100644
index c2c1c2b63b25..000000000000
--- a/fs/f2fs/f2fs_crypto.h
+++ /dev/null
@@ -1,151 +0,0 @@
1/*
2 * linux/fs/f2fs/f2fs_crypto.h
3 *
4 * Copied from linux/fs/ext4/ext4_crypto.h
5 *
6 * Copyright (C) 2015, Google, Inc.
7 *
8 * This contains encryption header content for f2fs
9 *
10 * Written by Michael Halcrow, 2015.
11 * Modified by Jaegeuk Kim, 2015.
12 */
13#ifndef _F2FS_CRYPTO_H
14#define _F2FS_CRYPTO_H
15
16#include <linux/fs.h>
17
18#define F2FS_KEY_DESCRIPTOR_SIZE 8
19
20/* Policy provided via an ioctl on the topmost directory */
21struct f2fs_encryption_policy {
22 char version;
23 char contents_encryption_mode;
24 char filenames_encryption_mode;
25 char flags;
26 char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
27} __attribute__((__packed__));
28
29#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
30#define F2FS_KEY_DERIVATION_NONCE_SIZE 16
31
32#define F2FS_POLICY_FLAGS_PAD_4 0x00
33#define F2FS_POLICY_FLAGS_PAD_8 0x01
34#define F2FS_POLICY_FLAGS_PAD_16 0x02
35#define F2FS_POLICY_FLAGS_PAD_32 0x03
36#define F2FS_POLICY_FLAGS_PAD_MASK 0x03
37#define F2FS_POLICY_FLAGS_VALID 0x03
38
39/**
40 * Encryption context for inode
41 *
42 * Protector format:
43 * 1 byte: Protector format (1 = this version)
44 * 1 byte: File contents encryption mode
45 * 1 byte: File names encryption mode
46 * 1 byte: Flags
47 * 8 bytes: Master Key descriptor
48 * 16 bytes: Encryption Key derivation nonce
49 */
50struct f2fs_encryption_context {
51 char format;
52 char contents_encryption_mode;
53 char filenames_encryption_mode;
54 char flags;
55 char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
56 char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE];
57} __attribute__((__packed__));
58
59/* Encryption parameters */
60#define F2FS_XTS_TWEAK_SIZE 16
61#define F2FS_AES_128_ECB_KEY_SIZE 16
62#define F2FS_AES_256_GCM_KEY_SIZE 32
63#define F2FS_AES_256_CBC_KEY_SIZE 32
64#define F2FS_AES_256_CTS_KEY_SIZE 32
65#define F2FS_AES_256_XTS_KEY_SIZE 64
66#define F2FS_MAX_KEY_SIZE 64
67
68#define F2FS_KEY_DESC_PREFIX "f2fs:"
69#define F2FS_KEY_DESC_PREFIX_SIZE 5
70
71struct f2fs_encryption_key {
72 __u32 mode;
73 char raw[F2FS_MAX_KEY_SIZE];
74 __u32 size;
75} __attribute__((__packed__));
76
77struct f2fs_crypt_info {
78 char ci_data_mode;
79 char ci_filename_mode;
80 char ci_flags;
81 struct crypto_ablkcipher *ci_ctfm;
82 struct key *ci_keyring_key;
83 char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE];
84};
85
86#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
87#define F2FS_WRITE_PATH_FL 0x00000002
88
89struct f2fs_crypto_ctx {
90 union {
91 struct {
92 struct page *bounce_page; /* Ciphertext page */
93 struct page *control_page; /* Original page */
94 } w;
95 struct {
96 struct bio *bio;
97 struct work_struct work;
98 } r;
99 struct list_head free_list; /* Free list */
100 };
101 char flags; /* Flags */
102};
103
104struct f2fs_completion_result {
105 struct completion completion;
106 int res;
107};
108
109#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \
110 struct f2fs_completion_result ecr = { \
111 COMPLETION_INITIALIZER((ecr).completion), 0 }
112
113static inline int f2fs_encryption_key_size(int mode)
114{
115 switch (mode) {
116 case F2FS_ENCRYPTION_MODE_AES_256_XTS:
117 return F2FS_AES_256_XTS_KEY_SIZE;
118 case F2FS_ENCRYPTION_MODE_AES_256_GCM:
119 return F2FS_AES_256_GCM_KEY_SIZE;
120 case F2FS_ENCRYPTION_MODE_AES_256_CBC:
121 return F2FS_AES_256_CBC_KEY_SIZE;
122 case F2FS_ENCRYPTION_MODE_AES_256_CTS:
123 return F2FS_AES_256_CTS_KEY_SIZE;
124 default:
125 BUG();
126 }
127 return 0;
128}
129
130#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4
131#define F2FS_CRYPTO_BLOCK_SIZE 16
132#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32
133
134/**
135 * For encrypted symlinks, the ciphertext length is stored at the beginning
136 * of the string in little-endian format.
137 */
138struct f2fs_encrypted_symlink_data {
139 __le16 len;
140 char encrypted_path[1];
141} __attribute__((__packed__));
142
143/**
144 * This function is used to calculate the disk space required to
145 * store a filename of length l in encrypted symlink format.
146 */
147static inline u32 encrypted_symlink_data_len(u32 l)
148{
149 return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1);
150}
151#endif /* _F2FS_CRYPTO_H */
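The length helper at the end of the deleted header is worth a second look, since the same arithmetic lives on in the shared fscrypt code: the -1 cancels the placeholder byte already counted inside the packed struct. A standalone check of the math:

#include <stdint.h>
#include <stdio.h>

struct encrypted_symlink_data {
	uint16_t len;			/* __le16 ciphertext length on disk */
	char encrypted_path[1];		/* ciphertext bytes follow */
} __attribute__((__packed__));

static uint32_t encrypted_symlink_data_len(uint32_t l)
{
	/* 2-byte header + l payload bytes; sizeof already counts one
	 * payload byte via encrypted_path[1], hence the -1 */
	return l + sizeof(struct encrypted_symlink_data) - 1;
}

int main(void)
{
	printf("%u\n", encrypted_symlink_data_len(32));	/* 32 + 3 - 1 = 34 */
	return 0;
}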
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ea272be62677..b41c3579ea9e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -86,7 +86,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
86 trace_f2fs_vm_page_mkwrite(page, DATA); 86 trace_f2fs_vm_page_mkwrite(page, DATA);
87mapped: 87mapped:
88 /* fill the page */ 88 /* fill the page */
89 f2fs_wait_on_page_writeback(page, DATA); 89 f2fs_wait_on_page_writeback(page, DATA, false);
90 90
91 /* wait for GCed encrypted page writeback */ 91 /* wait for GCed encrypted page writeback */
92 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) 92 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
@@ -301,7 +301,7 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping,
301 pagevec_init(&pvec, 0); 301 pagevec_init(&pvec, 0);
302 nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, 302 nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
303 PAGECACHE_TAG_DIRTY, 1); 303 PAGECACHE_TAG_DIRTY, 1);
304 pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; 304 pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX;
305 pagevec_release(&pvec); 305 pagevec_release(&pvec);
306 return pgofs; 306 return pgofs;
307} 307}
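The LONG_MAX -> ULONG_MAX change above fixes the "no dirty page found" sentinel: pgoff_t is an unsigned long, so LONG_MAX marks only the halfway point of its range and a legitimate offset above it would be mistaken for dirty data. A quick standalone demonstration:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	/* pgoff_t is an unsigned long: LONG_MAX is only the halfway mark,
	 * so it is a poor "nothing found" sentinel. */
	printf("LONG_MAX  = %lu\n", (unsigned long)LONG_MAX);
	printf("ULONG_MAX = %lu\n", ULONG_MAX);
	printf("valid offsets above the old sentinel: %lu\n",
	       ULONG_MAX - (unsigned long)LONG_MAX);
	return 0;
}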
@@ -358,15 +358,14 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
358 } else if (err == -ENOENT) { 358 } else if (err == -ENOENT) {
359 /* direct node does not exist */ 359 /* direct node does not exist */
360 if (whence == SEEK_DATA) { 360 if (whence == SEEK_DATA) {
361 pgofs = PGOFS_OF_NEXT_DNODE(pgofs, 361 pgofs = get_next_page_offset(&dn, pgofs);
362 F2FS_I(inode));
363 continue; 362 continue;
364 } else { 363 } else {
365 goto found; 364 goto found;
366 } 365 }
367 } 366 }
368 367
369 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); 368 end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
370 369
371 /* find data/hole in dnode block */ 370 /* find data/hole in dnode block */
372 for (; dn.ofs_in_node < end_offset; 371 for (; dn.ofs_in_node < end_offset;
@@ -422,9 +421,11 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
422 int err; 421 int err;
423 422
424 if (f2fs_encrypted_inode(inode)) { 423 if (f2fs_encrypted_inode(inode)) {
425 err = f2fs_get_encryption_info(inode); 424 err = fscrypt_get_encryption_info(inode);
426 if (err) 425 if (err)
427 return 0; 426 return 0;
427 if (!f2fs_encrypted_inode(inode))
428 return -ENOKEY;
428 } 429 }
429 430
430 /* we don't need to use inline_data strictly */ 431 /* we don't need to use inline_data strictly */
@@ -440,12 +441,18 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
440static int f2fs_file_open(struct inode *inode, struct file *filp) 441static int f2fs_file_open(struct inode *inode, struct file *filp)
441{ 442{
442 int ret = generic_file_open(inode, filp); 443 int ret = generic_file_open(inode, filp);
444 struct inode *dir = filp->f_path.dentry->d_parent->d_inode;
443 445
444 if (!ret && f2fs_encrypted_inode(inode)) { 446 if (!ret && f2fs_encrypted_inode(inode)) {
445 ret = f2fs_get_encryption_info(inode); 447 ret = fscrypt_get_encryption_info(inode);
446 if (ret) 448 if (ret)
447 ret = -EACCES; 449 return -EACCES;
450 if (!fscrypt_has_encryption_key(inode))
451 return -ENOKEY;
448 } 452 }
453 if (f2fs_encrypted_inode(dir) &&
454 !fscrypt_has_permitted_context(dir, inode))
455 return -EPERM;
449 return ret; 456 return ret;
450} 457}
451 458
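f2fs_file_open() now enforces three things before an encrypted file may be opened: the key metadata must load (-EACCES on failure), a usable key must be present (-ENOKEY), and the file's encryption context must be consistent with its parent directory (-EPERM). The decision ladder, reduced to a self-contained sketch with stub predicates:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int  load_key_info(void)     { return 0; }	/* fscrypt_get_encryption_info() stand-in */
static bool key_present(void)       { return false; }	/* fscrypt_has_encryption_key() stand-in */
static bool context_permitted(void) { return true; }	/* fscrypt_has_permitted_context() stand-in */

static int open_check(bool encrypted_file, bool encrypted_dir)
{
	if (encrypted_file) {
		if (load_key_info())
			return -EACCES;	/* key metadata unreadable */
		if (!key_present())
			return -ENOKEY;	/* no usable key in the keyring */
	}
	if (encrypted_dir && !context_permitted())
		return -EPERM;		/* parent/child policy mismatch */
	return 0;
}

int main(void)
{
	printf("open_check() = %d (-ENOKEY is %d)\n", open_check(true, true), -ENOKEY);
	return 0;
}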
@@ -480,7 +487,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
480 * we will invalidate all blkaddr in the whole range. 487 * we will invalidate all blkaddr in the whole range.
481 */ 488 */
482 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), 489 fofs = start_bidx_of_node(ofs_of_node(dn->node_page),
483 F2FS_I(dn->inode)) + ofs; 490 dn->inode) + ofs;
484 f2fs_update_extent_cache_range(dn, fofs, 0, len); 491 f2fs_update_extent_cache_range(dn, fofs, 0, len);
485 dec_valid_block_count(sbi, dn->inode, nr_free); 492 dec_valid_block_count(sbi, dn->inode, nr_free);
486 sync_inode_page(dn); 493 sync_inode_page(dn);
@@ -521,9 +528,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
521 if (IS_ERR(page)) 528 if (IS_ERR(page))
522 return 0; 529 return 0;
523truncate_out: 530truncate_out:
524 f2fs_wait_on_page_writeback(page, DATA); 531 f2fs_wait_on_page_writeback(page, DATA, true);
525 zero_user(page, offset, PAGE_CACHE_SIZE - offset); 532 zero_user(page, offset, PAGE_CACHE_SIZE - offset);
526 if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) 533 if (!cache_only || !f2fs_encrypted_inode(inode) ||
534 !S_ISREG(inode->i_mode))
527 set_page_dirty(page); 535 set_page_dirty(page);
528 f2fs_put_page(page, 1); 536 f2fs_put_page(page, 1);
529 return 0; 537 return 0;
@@ -568,7 +576,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
568 goto out; 576 goto out;
569 } 577 }
570 578
571 count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); 579 count = ADDRS_PER_PAGE(dn.node_page, inode);
572 580
573 count -= dn.ofs_in_node; 581 count -= dn.ofs_in_node;
574 f2fs_bug_on(sbi, count < 0); 582 f2fs_bug_on(sbi, count < 0);
@@ -671,7 +679,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
671 679
672 if (attr->ia_valid & ATTR_SIZE) { 680 if (attr->ia_valid & ATTR_SIZE) {
673 if (f2fs_encrypted_inode(inode) && 681 if (f2fs_encrypted_inode(inode) &&
674 f2fs_get_encryption_info(inode)) 682 fscrypt_get_encryption_info(inode))
675 return -EACCES; 683 return -EACCES;
676 684
677 if (attr->ia_size <= i_size_read(inode)) { 685 if (attr->ia_size <= i_size_read(inode)) {
@@ -743,7 +751,7 @@ static int fill_zero(struct inode *inode, pgoff_t index,
743 if (IS_ERR(page)) 751 if (IS_ERR(page))
744 return PTR_ERR(page); 752 return PTR_ERR(page);
745 753
746 f2fs_wait_on_page_writeback(page, DATA); 754 f2fs_wait_on_page_writeback(page, DATA, true);
747 zero_user(page, start, len); 755 zero_user(page, start, len);
748 set_page_dirty(page); 756 set_page_dirty(page);
749 f2fs_put_page(page, 1); 757 f2fs_put_page(page, 1);
@@ -768,7 +776,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
768 return err; 776 return err;
769 } 777 }
770 778
771 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); 779 end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
772 count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); 780 count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
773 781
774 f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); 782 f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
@@ -854,10 +862,8 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
854 } else { 862 } else {
855 new_addr = dn.data_blkaddr; 863 new_addr = dn.data_blkaddr;
856 if (!is_checkpointed_data(sbi, new_addr)) { 864 if (!is_checkpointed_data(sbi, new_addr)) {
857 dn.data_blkaddr = NULL_ADDR;
858 /* do not invalidate this block address */ 865 /* do not invalidate this block address */
859 set_data_blkaddr(&dn); 866 f2fs_update_data_blkaddr(&dn, NULL_ADDR);
860 f2fs_update_extent_cache(&dn);
861 do_replace = true; 867 do_replace = true;
862 } 868 }
863 f2fs_put_dnode(&dn); 869 f2fs_put_dnode(&dn);
@@ -884,7 +890,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
884 890
885 get_node_info(sbi, dn.nid, &ni); 891 get_node_info(sbi, dn.nid, &ni);
886 f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, 892 f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
887 ni.version, true); 893 ni.version, true, false);
888 f2fs_put_dnode(&dn); 894 f2fs_put_dnode(&dn);
889 } else { 895 } else {
890 struct page *psrc, *pdst; 896 struct page *psrc, *pdst;
@@ -892,7 +898,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
892 psrc = get_lock_data_page(inode, src, true); 898 psrc = get_lock_data_page(inode, src, true);
893 if (IS_ERR(psrc)) 899 if (IS_ERR(psrc))
894 return PTR_ERR(psrc); 900 return PTR_ERR(psrc);
895 pdst = get_new_data_page(inode, NULL, dst, false); 901 pdst = get_new_data_page(inode, NULL, dst, true);
896 if (IS_ERR(pdst)) { 902 if (IS_ERR(pdst)) {
897 f2fs_put_page(psrc, 1); 903 f2fs_put_page(psrc, 1);
898 return PTR_ERR(pdst); 904 return PTR_ERR(pdst);
@@ -908,9 +914,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
908 914
909err_out: 915err_out:
910 if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { 916 if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) {
911 dn.data_blkaddr = new_addr; 917 f2fs_update_data_blkaddr(&dn, new_addr);
912 set_data_blkaddr(&dn);
913 f2fs_update_extent_cache(&dn);
914 f2fs_put_dnode(&dn); 918 f2fs_put_dnode(&dn);
915 } 919 }
916 return ret; 920 return ret;
@@ -1050,12 +1054,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
1050 1054
1051 if (dn.data_blkaddr != NEW_ADDR) { 1055 if (dn.data_blkaddr != NEW_ADDR) {
1052 invalidate_blocks(sbi, dn.data_blkaddr); 1056 invalidate_blocks(sbi, dn.data_blkaddr);
1053 1057 f2fs_update_data_blkaddr(&dn, NEW_ADDR);
1054 dn.data_blkaddr = NEW_ADDR;
1055 set_data_blkaddr(&dn);
1056
1057 dn.data_blkaddr = NULL_ADDR;
1058 f2fs_update_extent_cache(&dn);
1059 } 1058 }
1060 f2fs_put_dnode(&dn); 1059 f2fs_put_dnode(&dn);
1061 f2fs_unlock_op(sbi); 1060 f2fs_unlock_op(sbi);
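This hunk, the exchange-helper hunks above, and the GC path later in the diff all collapse the repeated assign + set_data_blkaddr() + f2fs_update_extent_cache() triple into the new f2fs_update_data_blkaddr(). Its plausible body, inferred from the call sites it replaces (a sketch, not a verbatim copy):

void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
	dn->data_blkaddr = blkaddr;
	set_data_blkaddr(dn);		/* write the pointer into the dnode page */
	f2fs_update_extent_cache(dn);	/* keep the extent cache coherent */
}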
@@ -1253,7 +1252,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
1253{ 1252{
1254 /* some remaining atomic pages should be discarded */ 1253 /* some remaining atomic pages should be discarded */
1255 if (f2fs_is_atomic_file(inode)) 1254 if (f2fs_is_atomic_file(inode))
1256 commit_inmem_pages(inode, true); 1255 drop_inmem_pages(inode);
1257 if (f2fs_is_volatile_file(inode)) { 1256 if (f2fs_is_volatile_file(inode)) {
1258 set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); 1257 set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
1259 filemap_fdatawrite(inode->i_mapping); 1258 filemap_fdatawrite(inode->i_mapping);
@@ -1377,7 +1376,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
1377 1376
1378 if (f2fs_is_atomic_file(inode)) { 1377 if (f2fs_is_atomic_file(inode)) {
1379 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); 1378 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
1380 ret = commit_inmem_pages(inode, false); 1379 ret = commit_inmem_pages(inode);
1381 if (ret) { 1380 if (ret) {
1382 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); 1381 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
1383 goto err_out; 1382 goto err_out;
@@ -1440,7 +1439,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
1440 1439
1441 if (f2fs_is_atomic_file(inode)) { 1440 if (f2fs_is_atomic_file(inode)) {
1442 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); 1441 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
1443 commit_inmem_pages(inode, true); 1442 drop_inmem_pages(inode);
1444 } 1443 }
1445 if (f2fs_is_volatile_file(inode)) { 1444 if (f2fs_is_volatile_file(inode)) {
1446 clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); 1445 clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
@@ -1535,39 +1534,30 @@ static bool uuid_is_nonzero(__u8 u[16])
1535 1534
1536static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) 1535static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
1537{ 1536{
1538#ifdef CONFIG_F2FS_FS_ENCRYPTION 1537 struct fscrypt_policy policy;
1539 struct f2fs_encryption_policy policy;
1540 struct inode *inode = file_inode(filp); 1538 struct inode *inode = file_inode(filp);
1541 1539
1542 if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg, 1540 if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
1543 sizeof(policy))) 1541 sizeof(policy)))
1544 return -EFAULT; 1542 return -EFAULT;
1545 1543
1546 f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); 1544 f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
1547 return f2fs_process_policy(&policy, inode); 1545 return fscrypt_process_policy(inode, &policy);
1548#else
1549 return -EOPNOTSUPP;
1550#endif
1551} 1546}
1552 1547
1553static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) 1548static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
1554{ 1549{
1555#ifdef CONFIG_F2FS_FS_ENCRYPTION 1550 struct fscrypt_policy policy;
1556 struct f2fs_encryption_policy policy;
1557 struct inode *inode = file_inode(filp); 1551 struct inode *inode = file_inode(filp);
1558 int err; 1552 int err;
1559 1553
1560 err = f2fs_get_policy(inode, &policy); 1554 err = fscrypt_get_policy(inode, &policy);
1561 if (err) 1555 if (err)
1562 return err; 1556 return err;
1563 1557
1564 if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy, 1558 if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
1565 sizeof(policy)))
1566 return -EFAULT; 1559 return -EFAULT;
1567 return 0; 1560 return 0;
1568#else
1569 return -EOPNOTSUPP;
1570#endif
1571} 1561}
1572 1562
1573static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) 1563static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
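Both ioctls now traffic in the shared struct fscrypt_policy, which carries the same five fields as the deleted f2fs_encryption_policy (version, two modes, flags, an 8-byte master-key descriptor). A minimal userspace caller, assuming your installed linux/fs.h exports FS_IOC_GET_ENCRYPTION_POLICY and the struct (older headers may not):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fscrypt_policy p;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <dir>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FS_IOC_GET_ENCRYPTION_POLICY, &p) == 0)
		printf("contents mode %u, filenames mode %u, flags %#x\n",
		       (unsigned)p.contents_encryption_mode,
		       (unsigned)p.filenames_encryption_mode,
		       (unsigned)p.flags);
	else
		perror("FS_IOC_GET_ENCRYPTION_POLICY");
	close(fd);
	return 0;
}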
@@ -1648,7 +1638,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
1648 struct f2fs_defragment *range) 1638 struct f2fs_defragment *range)
1649{ 1639{
1650 struct inode *inode = file_inode(filp); 1640 struct inode *inode = file_inode(filp);
1651 struct f2fs_map_blocks map; 1641 struct f2fs_map_blocks map = { .m_next_pgofs = NULL };
1652 struct extent_info ei; 1642 struct extent_info ei;
1653 pgoff_t pg_start, pg_end; 1643 pgoff_t pg_start, pg_end;
1654 unsigned int blk_per_seg = sbi->blocks_per_seg; 1644 unsigned int blk_per_seg = sbi->blocks_per_seg;
@@ -1874,14 +1864,32 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1874 1864
1875static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 1865static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1876{ 1866{
1877 struct inode *inode = file_inode(iocb->ki_filp); 1867 struct file *file = iocb->ki_filp;
1868 struct inode *inode = file_inode(file);
1869 ssize_t ret;
1878 1870
1879 if (f2fs_encrypted_inode(inode) && 1871 if (f2fs_encrypted_inode(inode) &&
1880 !f2fs_has_encryption_key(inode) && 1872 !fscrypt_has_encryption_key(inode) &&
1881 f2fs_get_encryption_info(inode)) 1873 fscrypt_get_encryption_info(inode))
1882 return -EACCES; 1874 return -EACCES;
1883 1875
1884 return generic_file_write_iter(iocb, from); 1876 inode_lock(inode);
1877 ret = generic_write_checks(iocb, from);
1878 if (ret > 0) {
1879 ret = f2fs_preallocate_blocks(iocb, from);
1880 if (!ret)
1881 ret = __generic_file_write_iter(iocb, from);
1882 }
1883 inode_unlock(inode);
1884
1885 if (ret > 0) {
1886 ssize_t err;
1887
1888 err = generic_write_sync(file, iocb->ki_pos - ret, ret);
1889 if (err < 0)
1890 ret = err;
1891 }
1892 return ret;
1885} 1893}
1886 1894
1887#ifdef CONFIG_COMPAT 1895#ifdef CONFIG_COMPAT
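f2fs_file_write_iter() stops delegating to generic_file_write_iter() so it can reserve blocks for the whole write up front, under the inode lock, before the page-by-page copy; the O_SYNC-style flush stays outside the lock. The ordering, as a self-contained skeleton with stubs for the generic helpers:

#include <stdio.h>

static long write_checks(void)  { return 4096; }
static int  preallocate(void)   { puts("reserve blocks for the whole range"); return 0; }
static long copy_pages(void)    { puts("copy data page by page"); return 4096; }
static int  sync_range(long n)  { printf("sync %ld bytes\n", n); return 0; }

int main(void)
{
	long ret;

	puts("inode_lock");
	ret = write_checks();			/* generic_write_checks() stand-in */
	if (ret > 0) {
		if (preallocate())		/* f2fs_preallocate_blocks() stand-in */
			ret = -1;
		else
			ret = copy_pages();	/* __generic_file_write_iter() stand-in */
	}
	puts("inode_unlock");

	if (ret > 0 && sync_range(ret) < 0)	/* generic_write_sync() stand-in */
		ret = -1;
	printf("ret = %ld\n", ret);
	return 0;
}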
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index f610c2a9bdde..b0051a97824c 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -245,6 +245,18 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
245 return get_cb_cost(sbi, segno); 245 return get_cb_cost(sbi, segno);
246} 246}
247 247
248static unsigned int count_bits(const unsigned long *addr,
249 unsigned int offset, unsigned int len)
250{
251 unsigned int end = offset + len, sum = 0;
252
253 while (offset < end) {
254 if (test_bit(offset++, addr))
255 ++sum;
256 }
257 return sum;
258}
259
248/* 260/*
249 * This function is called from two paths. 261 * This function is called from two paths.
250 * One is garbage collection and the other is SSR segment selection. 262 * One is garbage collection and the other is SSR segment selection.
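count_bits() added above is a plain population count over a bit range; the kernel version uses test_bit() so the range need not be word-aligned. A userspace equivalent with a worked check:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned int count_bits(const unsigned long *addr,
			       unsigned int offset, unsigned int len)
{
	unsigned int end = offset + len, sum = 0;

	while (offset < end) {
		if (addr[offset / BITS_PER_LONG] &
		    (1UL << (offset % BITS_PER_LONG)))
			++sum;
		++offset;
	}
	return sum;
}

int main(void)
{
	unsigned long map[1] = { 0xB4UL };	/* bits 2, 4, 5, 7 set */

	printf("%u\n", count_bits(map, 2, 4));	/* bits 2..5 -> expect 3 */
	return 0;
}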
@@ -258,9 +270,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
258{ 270{
259 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 271 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
260 struct victim_sel_policy p; 272 struct victim_sel_policy p;
261 unsigned int secno, max_cost; 273 unsigned int secno, max_cost, last_victim;
262 unsigned int last_segment = MAIN_SEGS(sbi); 274 unsigned int last_segment = MAIN_SEGS(sbi);
263 int nsearched = 0; 275 unsigned int nsearched = 0;
264 276
265 mutex_lock(&dirty_i->seglist_lock); 277 mutex_lock(&dirty_i->seglist_lock);
266 278
@@ -273,6 +285,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
273 if (p.max_search == 0) 285 if (p.max_search == 0)
274 goto out; 286 goto out;
275 287
288 last_victim = sbi->last_victim[p.gc_mode];
276 if (p.alloc_mode == LFS && gc_type == FG_GC) { 289 if (p.alloc_mode == LFS && gc_type == FG_GC) {
277 p.min_segno = check_bg_victims(sbi); 290 p.min_segno = check_bg_victims(sbi);
278 if (p.min_segno != NULL_SEGNO) 291 if (p.min_segno != NULL_SEGNO)
@@ -295,27 +308,35 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
295 } 308 }
296 309
297 p.offset = segno + p.ofs_unit; 310 p.offset = segno + p.ofs_unit;
298 if (p.ofs_unit > 1) 311 if (p.ofs_unit > 1) {
299 p.offset -= segno % p.ofs_unit; 312 p.offset -= segno % p.ofs_unit;
313 nsearched += count_bits(p.dirty_segmap,
314 p.offset - p.ofs_unit,
315 p.ofs_unit);
316 } else {
317 nsearched++;
318 }
319
300 320
301 secno = GET_SECNO(sbi, segno); 321 secno = GET_SECNO(sbi, segno);
302 322
303 if (sec_usage_check(sbi, secno)) 323 if (sec_usage_check(sbi, secno))
304 continue; 324 goto next;
305 if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) 325 if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
306 continue; 326 goto next;
307 327
308 cost = get_gc_cost(sbi, segno, &p); 328 cost = get_gc_cost(sbi, segno, &p);
309 329
310 if (p.min_cost > cost) { 330 if (p.min_cost > cost) {
311 p.min_segno = segno; 331 p.min_segno = segno;
312 p.min_cost = cost; 332 p.min_cost = cost;
313 } else if (unlikely(cost == max_cost)) {
314 continue;
315 } 333 }
316 334next:
317 if (nsearched++ >= p.max_search) { 335 if (nsearched >= p.max_search) {
318 sbi->last_victim[p.gc_mode] = segno; 336 if (!sbi->last_victim[p.gc_mode] && segno <= last_victim)
337 sbi->last_victim[p.gc_mode] = last_victim + 1;
338 else
339 sbi->last_victim[p.gc_mode] = segno + 1;
319 break; 340 break;
320 } 341 }
321 } 342 }
@@ -399,7 +420,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
399 * On validity, copy that node with cold status, otherwise (invalid node) 420 * On validity, copy that node with cold status, otherwise (invalid node)
400 * ignore that. 421 * ignore that.
401 */ 422 */
402static int gc_node_segment(struct f2fs_sb_info *sbi, 423static void gc_node_segment(struct f2fs_sb_info *sbi,
403 struct f2fs_summary *sum, unsigned int segno, int gc_type) 424 struct f2fs_summary *sum, unsigned int segno, int gc_type)
404{ 425{
405 bool initial = true; 426 bool initial = true;
@@ -419,7 +440,7 @@ next_step:
419 440
420 /* stop BG_GC if there is not enough free sections. */ 441 /* stop BG_GC if there is not enough free sections. */
421 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) 442 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
422 return 0; 443 return;
423 444
424 if (check_valid_map(sbi, segno, off) == 0) 445 if (check_valid_map(sbi, segno, off) == 0)
425 continue; 446 continue;
@@ -446,7 +467,7 @@ next_step:
446 467
447 /* set page dirty and write it */ 468 /* set page dirty and write it */
448 if (gc_type == FG_GC) { 469 if (gc_type == FG_GC) {
449 f2fs_wait_on_page_writeback(node_page, NODE); 470 f2fs_wait_on_page_writeback(node_page, NODE, true);
450 set_page_dirty(node_page); 471 set_page_dirty(node_page);
451 } else { 472 } else {
452 if (!PageWriteback(node_page)) 473 if (!PageWriteback(node_page))
@@ -460,20 +481,6 @@ next_step:
460 initial = false; 481 initial = false;
461 goto next_step; 482 goto next_step;
462 } 483 }
463
464 if (gc_type == FG_GC) {
465 struct writeback_control wbc = {
466 .sync_mode = WB_SYNC_ALL,
467 .nr_to_write = LONG_MAX,
468 .for_reclaim = 0,
469 };
470 sync_node_pages(sbi, 0, &wbc);
471
472 /* return 1 only if FG_GC successfully reclaimed one */
473 if (get_valid_blocks(sbi, segno, 1) == 0)
474 return 1;
475 }
476 return 0;
477} 484}
478 485
479/* 486/*
@@ -483,7 +490,7 @@ next_step:
483 * as indirect or double indirect node blocks, are given, it must be a caller's 490 * as indirect or double indirect node blocks, are given, it must be a caller's
484 * bug. 491 * bug.
485 */ 492 */
486block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) 493block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
487{ 494{
488 unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; 495 unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
489 unsigned int bidx; 496 unsigned int bidx;
@@ -500,7 +507,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
500 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); 507 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
501 bidx = node_ofs - 5 - dec; 508 bidx = node_ofs - 5 - dec;
502 } 509 }
503 return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); 510 return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
504} 511}
505 512
506static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, 513static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
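start_bidx_of_node() now takes the inode itself because ADDRS_PER_INODE() needs per-inode state (the in-inode pointer count shrinks when inline xattrs claim part of the block). The mapping is plain arithmetic; a standalone check for the two simple direct-node cases, using this era's 4 KiB-block constants (923 in-inode pointers, 1018 per node block; treat the numbers as illustrative):

#include <stdio.h>

#define ADDRS_PER_BLOCK	1018	/* data pointers per direct node block */
#define ADDRS_PER_INODE	 923	/* data pointers kept in the inode block */

/* Simplified start_bidx_of_node(): file block where the node at
 * node_ofs begins. Indirect offsets (the dec/NIDS_PER_BLOCK math in
 * the hunk above) are omitted from this sketch. */
static unsigned int start_bidx(unsigned int node_ofs)
{
	if (node_ofs == 0)
		return 0;	/* the inode block itself */
	return (node_ofs - 1) * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
}

int main(void)
{
	printf("direct node 1 starts at file block %u\n", start_bidx(1)); /* 923 */
	printf("direct node 2 starts at file block %u\n", start_bidx(2)); /* 1941 */
	return 0;
}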
@@ -546,6 +553,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
546 struct f2fs_summary sum; 553 struct f2fs_summary sum;
547 struct node_info ni; 554 struct node_info ni;
548 struct page *page; 555 struct page *page;
556 block_t newaddr;
549 int err; 557 int err;
550 558
551 /* do not read out */ 559 /* do not read out */
@@ -567,21 +575,24 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
567 * don't cache encrypted data into meta inode until previous dirty 575 * don't cache encrypted data into meta inode until previous dirty
568 * data were writebacked to avoid racing between GC and flush. 576 * data were writebacked to avoid racing between GC and flush.
569 */ 577 */
570 f2fs_wait_on_page_writeback(page, DATA); 578 f2fs_wait_on_page_writeback(page, DATA, true);
571 579
572 get_node_info(fio.sbi, dn.nid, &ni); 580 get_node_info(fio.sbi, dn.nid, &ni);
573 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); 581 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
574 582
575 /* read page */ 583 /* read page */
576 fio.page = page; 584 fio.page = page;
577 fio.blk_addr = dn.data_blkaddr; 585 fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
578 586
579 fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), 587 allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
580 fio.blk_addr, 588 &sum, CURSEG_COLD_DATA);
581 FGP_LOCK|FGP_CREAT, 589
582 GFP_NOFS); 590 fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr,
583 if (!fio.encrypted_page) 591 FGP_LOCK | FGP_CREAT, GFP_NOFS);
584 goto put_out; 592 if (!fio.encrypted_page) {
593 err = -ENOMEM;
594 goto recover_block;
595 }
585 596
586 err = f2fs_submit_page_bio(&fio); 597 err = f2fs_submit_page_bio(&fio);
587 if (err) 598 if (err)
@@ -590,33 +601,39 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
590 /* write page */ 601 /* write page */
591 lock_page(fio.encrypted_page); 602 lock_page(fio.encrypted_page);
592 603
593 if (unlikely(!PageUptodate(fio.encrypted_page))) 604 if (unlikely(!PageUptodate(fio.encrypted_page))) {
605 err = -EIO;
594 goto put_page_out; 606 goto put_page_out;
595 if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) 607 }
608 if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
609 err = -EIO;
596 goto put_page_out; 610 goto put_page_out;
611 }
597 612
598 set_page_dirty(fio.encrypted_page); 613 set_page_dirty(fio.encrypted_page);
599 f2fs_wait_on_page_writeback(fio.encrypted_page, DATA); 614 f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
600 if (clear_page_dirty_for_io(fio.encrypted_page)) 615 if (clear_page_dirty_for_io(fio.encrypted_page))
601 dec_page_count(fio.sbi, F2FS_DIRTY_META); 616 dec_page_count(fio.sbi, F2FS_DIRTY_META);
602 617
603 set_page_writeback(fio.encrypted_page); 618 set_page_writeback(fio.encrypted_page);
604 619
605 /* allocate block address */ 620 /* allocate block address */
606 f2fs_wait_on_page_writeback(dn.node_page, NODE); 621 f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
607 allocate_data_block(fio.sbi, NULL, fio.blk_addr, 622
608 &fio.blk_addr, &sum, CURSEG_COLD_DATA);
609 fio.rw = WRITE_SYNC; 623 fio.rw = WRITE_SYNC;
624 fio.new_blkaddr = newaddr;
610 f2fs_submit_page_mbio(&fio); 625 f2fs_submit_page_mbio(&fio);
611 626
612 dn.data_blkaddr = fio.blk_addr; 627 f2fs_update_data_blkaddr(&dn, newaddr);
613 set_data_blkaddr(&dn);
614 f2fs_update_extent_cache(&dn);
615 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); 628 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
616 if (page->index == 0) 629 if (page->index == 0)
617 set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); 630 set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
618put_page_out: 631put_page_out:
619 f2fs_put_page(fio.encrypted_page, 1); 632 f2fs_put_page(fio.encrypted_page, 1);
633recover_block:
634 if (err)
635 __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
636 true, true);
620put_out: 637put_out:
621 f2fs_put_dnode(&dn); 638 f2fs_put_dnode(&dn);
622out: 639out:
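move_encrypted_block() now allocates the destination block before any I/O and, if a later step fails, hands it back through __f2fs_replace_block(). The control shape, reduced to a standalone reserve/try/roll-back sketch (all names hypothetical):

#include <errno.h>
#include <stdio.h>

static int reserve_block(int *newaddr) { *newaddr = 1234; return 0; }
static int copy_block(int newaddr)     { (void)newaddr; return -EIO; /* simulate failure */ }
static void release_block(int newaddr) { printf("recovered block %d\n", newaddr); }

static int move_block(void)
{
	int newaddr, err;

	err = reserve_block(&newaddr);	/* allocate_data_block() analogue */
	if (err)
		return err;

	err = copy_block(newaddr);	/* read, re-encrypt, write analogue */
	if (err)
		release_block(newaddr);	/* __f2fs_replace_block(..., true, true) analogue */
	return err;
}

int main(void)
{
	printf("move_block() = %d (-EIO is %d)\n", move_block(), -EIO);
	return 0;
}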
@@ -645,7 +662,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
645 .encrypted_page = NULL, 662 .encrypted_page = NULL,
646 }; 663 };
647 set_page_dirty(page); 664 set_page_dirty(page);
648 f2fs_wait_on_page_writeback(page, DATA); 665 f2fs_wait_on_page_writeback(page, DATA, true);
649 if (clear_page_dirty_for_io(page)) 666 if (clear_page_dirty_for_io(page))
650 inode_dec_dirty_pages(inode); 667 inode_dec_dirty_pages(inode);
651 set_cold_data(page); 668 set_cold_data(page);
@@ -663,7 +680,7 @@ out:
663 * If the parent node is not valid or the data block address is different, 680 * If the parent node is not valid or the data block address is different,
664 * the victim data block is ignored. 681 * the victim data block is ignored.
665 */ 682 */
666static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, 683static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
667 struct gc_inode_list *gc_list, unsigned int segno, int gc_type) 684 struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
668{ 685{
669 struct super_block *sb = sbi->sb; 686 struct super_block *sb = sbi->sb;
@@ -686,7 +703,7 @@ next_step:
686 703
687 /* stop BG_GC if there is not enough free sections. */ 704 /* stop BG_GC if there is not enough free sections. */
688 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) 705 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
689 return 0; 706 return;
690 707
691 if (check_valid_map(sbi, segno, off) == 0) 708 if (check_valid_map(sbi, segno, off) == 0)
692 continue; 709 continue;
@@ -719,7 +736,7 @@ next_step:
719 continue; 736 continue;
720 } 737 }
721 738
722 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); 739 start_bidx = start_bidx_of_node(nofs, inode);
723 data_page = get_read_data_page(inode, 740 data_page = get_read_data_page(inode,
724 start_bidx + ofs_in_node, READA, true); 741 start_bidx + ofs_in_node, READA, true);
725 if (IS_ERR(data_page)) { 742 if (IS_ERR(data_page)) {
@@ -735,7 +752,7 @@ next_step:
735 /* phase 3 */ 752 /* phase 3 */
736 inode = find_gc_inode(gc_list, dni.ino); 753 inode = find_gc_inode(gc_list, dni.ino);
737 if (inode) { 754 if (inode) {
738 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)) 755 start_bidx = start_bidx_of_node(nofs, inode)
739 + ofs_in_node; 756 + ofs_in_node;
740 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) 757 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
741 move_encrypted_block(inode, start_bidx); 758 move_encrypted_block(inode, start_bidx);
@@ -747,15 +764,6 @@ next_step:
747 764
748 if (++phase < 4) 765 if (++phase < 4)
749 goto next_step; 766 goto next_step;
750
751 if (gc_type == FG_GC) {
752 f2fs_submit_merged_bio(sbi, DATA, WRITE);
753
754 /* return 1 only if FG_GC successfully reclaimed one */
755 if (get_valid_blocks(sbi, segno, 1) == 0)
756 return 1;
757 }
758 return 0;
759} 767}
760 768
761static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, 769static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -771,53 +779,92 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
771 return ret; 779 return ret;
772} 780}
773 781
774static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, 782static int do_garbage_collect(struct f2fs_sb_info *sbi,
783 unsigned int start_segno,
775 struct gc_inode_list *gc_list, int gc_type) 784 struct gc_inode_list *gc_list, int gc_type)
776{ 785{
777 struct page *sum_page; 786 struct page *sum_page;
778 struct f2fs_summary_block *sum; 787 struct f2fs_summary_block *sum;
779 struct blk_plug plug; 788 struct blk_plug plug;
780 int nfree = 0; 789 unsigned int segno = start_segno;
790 unsigned int end_segno = start_segno + sbi->segs_per_sec;
791 int seg_freed = 0;
792 unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
793 SUM_TYPE_DATA : SUM_TYPE_NODE;
781 794
782 /* read segment summary of victim */ 795 /* readahead multiple SSA blocks that have contiguous addresses */
783 sum_page = get_sum_page(sbi, segno); 796 if (sbi->segs_per_sec > 1)
797 ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
798 sbi->segs_per_sec, META_SSA, true);
799
800 /* reference all summary page */
801 while (segno < end_segno) {
802 sum_page = get_sum_page(sbi, segno++);
803 unlock_page(sum_page);
804 }
784 805
785 blk_start_plug(&plug); 806 blk_start_plug(&plug);
786 807
787 sum = page_address(sum_page); 808 for (segno = start_segno; segno < end_segno; segno++) {
809 /* find segment summary of victim */
810 sum_page = find_get_page(META_MAPPING(sbi),
811 GET_SUM_BLOCK(sbi, segno));
812 f2fs_bug_on(sbi, !PageUptodate(sum_page));
813 f2fs_put_page(sum_page, 0);
788 814
789 /* 815 sum = page_address(sum_page);
790 * this is to avoid deadlock: 816 f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer)));
791 * - lock_page(sum_page) - f2fs_replace_block 817
792 * - check_valid_map() - mutex_lock(sentry_lock) 818 /*
793 * - mutex_lock(sentry_lock) - change_curseg() 819 * this is to avoid deadlock:
794 * - lock_page(sum_page) 820 * - lock_page(sum_page) - f2fs_replace_block
795 */ 821 * - check_valid_map() - mutex_lock(sentry_lock)
796 unlock_page(sum_page); 822 * - mutex_lock(sentry_lock) - change_curseg()
797 823 * - lock_page(sum_page)
798 switch (GET_SUM_TYPE((&sum->footer))) { 824 */
799 case SUM_TYPE_NODE: 825
800 nfree = gc_node_segment(sbi, sum->entries, segno, gc_type); 826 if (type == SUM_TYPE_NODE)
801 break; 827 gc_node_segment(sbi, sum->entries, segno, gc_type);
802 case SUM_TYPE_DATA: 828 else
803 nfree = gc_data_segment(sbi, sum->entries, gc_list, 829 gc_data_segment(sbi, sum->entries, gc_list, segno,
804 segno, gc_type); 830 gc_type);
805 break; 831
832 stat_inc_seg_count(sbi, type, gc_type);
833
834 f2fs_put_page(sum_page, 0);
835 }
836
837 if (gc_type == FG_GC) {
838 if (type == SUM_TYPE_NODE) {
839 struct writeback_control wbc = {
840 .sync_mode = WB_SYNC_ALL,
841 .nr_to_write = LONG_MAX,
842 .for_reclaim = 0,
843 };
844 sync_node_pages(sbi, 0, &wbc);
845 } else {
846 f2fs_submit_merged_bio(sbi, DATA, WRITE);
847 }
806 } 848 }
849
807 blk_finish_plug(&plug); 850 blk_finish_plug(&plug);
808 851
809 stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); 852 if (gc_type == FG_GC) {
853 while (start_segno < end_segno)
854 if (get_valid_blocks(sbi, start_segno++, 1) == 0)
855 seg_freed++;
856 }
857
810 stat_inc_call_count(sbi->stat_info); 858 stat_inc_call_count(sbi->stat_info);
811 859
812 f2fs_put_page(sum_page, 0); 860 return seg_freed;
813 return nfree;
814} 861}
815 862
816int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) 863int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
817{ 864{
818 unsigned int segno, i; 865 unsigned int segno;
819 int gc_type = sync ? FG_GC : BG_GC; 866 int gc_type = sync ? FG_GC : BG_GC;
820 int sec_freed = 0; 867 int sec_freed = 0, seg_freed;
821 int ret = -EINVAL; 868 int ret = -EINVAL;
822 struct cp_control cpc; 869 struct cp_control cpc;
823 struct gc_inode_list gc_list = { 870 struct gc_inode_list gc_list = {
@@ -838,30 +885,24 @@ gc_more:
838 885
839 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { 886 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
840 gc_type = FG_GC; 887 gc_type = FG_GC;
888 /*
889 * If there is no victim and no prefree segment but still not
890 * enough free sections, we should flush dent/node blocks and do
891 * garbage collection.
892 */
841 if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) 893 if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
842 write_checkpoint(sbi, &cpc); 894 write_checkpoint(sbi, &cpc);
895 else if (has_not_enough_free_secs(sbi, 0))
896 write_checkpoint(sbi, &cpc);
843 } 897 }
844 898
845 if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) 899 if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
846 goto stop; 900 goto stop;
847 ret = 0; 901 ret = 0;
848 902
849 /* readahead multiple SSA blocks that have contiguous addresses */ 903 seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
850 if (sbi->segs_per_sec > 1)
851 ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
852 META_SSA, true);
853
854 for (i = 0; i < sbi->segs_per_sec; i++) {
855 /*
856 * for FG_GC case, halt gcing left segments once failed one
857 * of segments in selected section to avoid long latency.
858 */
859 if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) &&
860 gc_type == FG_GC)
861 break;
862 }
863 904
864 if (i == sbi->segs_per_sec && gc_type == FG_GC) 905 if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
865 sec_freed++; 906 sec_freed++;
866 907
867 if (gc_type == FG_GC) 908 if (gc_type == FG_GC)
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index c3f0b7d4cfca..358214e9f707 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -71,7 +71,7 @@ bool truncate_inline_inode(struct page *ipage, u64 from)
71 71
72 addr = inline_data_addr(ipage); 72 addr = inline_data_addr(ipage);
73 73
74 f2fs_wait_on_page_writeback(ipage, NODE); 74 f2fs_wait_on_page_writeback(ipage, NODE, true);
75 memset(addr + from, 0, MAX_INLINE_DATA - from); 75 memset(addr + from, 0, MAX_INLINE_DATA - from);
76 76
77 return true; 77 return true;
@@ -105,7 +105,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
105 105
106int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) 106int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
107{ 107{
108 void *src_addr, *dst_addr;
109 struct f2fs_io_info fio = { 108 struct f2fs_io_info fio = {
110 .sbi = F2FS_I_SB(dn->inode), 109 .sbi = F2FS_I_SB(dn->inode),
111 .type = DATA, 110 .type = DATA,
@@ -115,8 +114,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
115 }; 114 };
116 int dirty, err; 115 int dirty, err;
117 116
118 f2fs_bug_on(F2FS_I_SB(dn->inode), page->index);
119
120 if (!f2fs_exist_data(dn->inode)) 117 if (!f2fs_exist_data(dn->inode))
121 goto clear_out; 118 goto clear_out;
122 119
@@ -124,21 +121,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
124 if (err) 121 if (err)
125 return err; 122 return err;
126 123
127 f2fs_wait_on_page_writeback(page, DATA); 124 f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page));
128
129 if (PageUptodate(page))
130 goto no_update;
131
132 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
133 125
134 /* Copy the whole inline data block */ 126 read_inline_data(page, dn->inode_page);
135 src_addr = inline_data_addr(dn->inode_page);
136 dst_addr = kmap_atomic(page);
137 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
138 flush_dcache_page(page);
139 kunmap_atomic(dst_addr);
140 SetPageUptodate(page);
141no_update:
142 set_page_dirty(page); 127 set_page_dirty(page);
143 128
144 /* clear dirty state */ 129 /* clear dirty state */
@@ -146,11 +131,9 @@ no_update:
146 131
147 /* write data page to try to make data consistent */ 132 /* write data page to try to make data consistent */
148 set_page_writeback(page); 133 set_page_writeback(page);
149 fio.blk_addr = dn->data_blkaddr; 134 fio.old_blkaddr = dn->data_blkaddr;
150 write_data_page(dn, &fio); 135 write_data_page(dn, &fio);
151 set_data_blkaddr(dn); 136 f2fs_wait_on_page_writeback(page, DATA, true);
152 f2fs_update_extent_cache(dn);
153 f2fs_wait_on_page_writeback(page, DATA);
154 if (dirty) 137 if (dirty)
155 inode_dec_dirty_pages(dn->inode); 138 inode_dec_dirty_pages(dn->inode);
156 139
@@ -159,6 +142,7 @@ no_update:
159 142
160 /* clear inline data and flag after data writeback */ 143 /* clear inline data and flag after data writeback */
161 truncate_inline_inode(dn->inode_page, 0); 144 truncate_inline_inode(dn->inode_page, 0);
145 clear_inline_node(dn->inode_page);
162clear_out: 146clear_out:
163 stat_dec_inline_inode(dn->inode); 147 stat_dec_inline_inode(dn->inode);
164 f2fs_clear_inline_inode(dn->inode); 148 f2fs_clear_inline_inode(dn->inode);
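The conversion path now calls read_inline_data() instead of the open-coded kmap/memcpy it deletes. The amount being copied is fixed by the inode-block layout: the pointer area, minus the inline-xattr slots and one reserved slot, reinterpreted as bytes. A standalone check using this era's constants (illustrative):

#include <stdio.h>

#define DEF_ADDRS_PER_INODE	923	/* __le32 slots in the inode block */
#define INLINE_XATTR_ADDRS	 50	/* slots reserved for inline xattrs */
#define SLOT_SIZE		  4	/* each slot is one __le32 */

int main(void)
{
	unsigned int slots = DEF_ADDRS_PER_INODE - INLINE_XATTR_ADDRS - 1;

	printf("MAX_INLINE_DATA = %u bytes\n", slots * SLOT_SIZE);	/* 3488 */
	return 0;
}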
@@ -223,7 +207,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
223 207
224 f2fs_bug_on(F2FS_I_SB(inode), page->index); 208 f2fs_bug_on(F2FS_I_SB(inode), page->index);
225 209
226 f2fs_wait_on_page_writeback(dn.inode_page, NODE); 210 f2fs_wait_on_page_writeback(dn.inode_page, NODE, true);
227 src_addr = kmap_atomic(page); 211 src_addr = kmap_atomic(page);
228 dst_addr = inline_data_addr(dn.inode_page); 212 dst_addr = inline_data_addr(dn.inode_page);
229 memcpy(dst_addr, src_addr, MAX_INLINE_DATA); 213 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
@@ -233,6 +217,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
233 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); 217 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
234 218
235 sync_inode_page(&dn); 219 sync_inode_page(&dn);
220 clear_inline_node(dn.inode_page);
236 f2fs_put_dnode(&dn); 221 f2fs_put_dnode(&dn);
237 return 0; 222 return 0;
238} 223}
@@ -261,7 +246,7 @@ process_inline:
261 ipage = get_node_page(sbi, inode->i_ino); 246 ipage = get_node_page(sbi, inode->i_ino);
262 f2fs_bug_on(sbi, IS_ERR(ipage)); 247 f2fs_bug_on(sbi, IS_ERR(ipage));
263 248
264 f2fs_wait_on_page_writeback(ipage, NODE); 249 f2fs_wait_on_page_writeback(ipage, NODE, true);
265 250
266 src_addr = inline_data_addr(npage); 251 src_addr = inline_data_addr(npage);
267 dst_addr = inline_data_addr(ipage); 252 dst_addr = inline_data_addr(ipage);
@@ -292,7 +277,7 @@ process_inline:
292} 277}
293 278
294struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, 279struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
295 struct f2fs_filename *fname, struct page **res_page) 280 struct fscrypt_name *fname, struct page **res_page)
296{ 281{
297 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 282 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
298 struct f2fs_inline_dentry *inline_dentry; 283 struct f2fs_inline_dentry *inline_dentry;
@@ -389,7 +374,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
389 if (err) 374 if (err)
390 goto out; 375 goto out;
391 376
392 f2fs_wait_on_page_writeback(page, DATA); 377 f2fs_wait_on_page_writeback(page, DATA, true);
393 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); 378 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
394 379
395 dentry_blk = kmap_atomic(page); 380 dentry_blk = kmap_atomic(page);
@@ -469,7 +454,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
469 } 454 }
470 } 455 }
471 456
472 f2fs_wait_on_page_writeback(ipage, NODE); 457 f2fs_wait_on_page_writeback(ipage, NODE, true);
473 458
474 name_hash = f2fs_dentry_hash(name); 459 name_hash = f2fs_dentry_hash(name);
475 make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); 460 make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
@@ -507,7 +492,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
507 int i; 492 int i;
508 493
509 lock_page(page); 494 lock_page(page);
510 f2fs_wait_on_page_writeback(page, NODE); 495 f2fs_wait_on_page_writeback(page, NODE, true);
511 496
512 inline_dentry = inline_data_addr(page); 497 inline_dentry = inline_data_addr(page);
513 bit_pos = dentry - inline_dentry->dentry; 498 bit_pos = dentry - inline_dentry->dentry;
@@ -550,7 +535,7 @@ bool f2fs_empty_inline_dir(struct inode *dir)
550} 535}
551 536
552int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, 537int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
553 struct f2fs_str *fstr) 538 struct fscrypt_str *fstr)
554{ 539{
555 struct inode *inode = file_inode(file); 540 struct inode *inode = file_inode(file);
556 struct f2fs_inline_dentry *inline_dentry = NULL; 541 struct f2fs_inline_dentry *inline_dentry = NULL;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2adeff26be11..cb269c46ac25 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -83,7 +83,7 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage)
83 83
84 while (start < end) { 84 while (start < end) {
85 if (*start++) { 85 if (*start++) {
86 f2fs_wait_on_page_writeback(ipage, NODE); 86 f2fs_wait_on_page_writeback(ipage, NODE, true);
87 87
88 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); 88 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
89 set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); 89 set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
@@ -227,7 +227,7 @@ int update_inode(struct inode *inode, struct page *node_page)
227{ 227{
228 struct f2fs_inode *ri; 228 struct f2fs_inode *ri;
229 229
230 f2fs_wait_on_page_writeback(node_page, NODE); 230 f2fs_wait_on_page_writeback(node_page, NODE, true);
231 231
232 ri = F2FS_INODE(node_page); 232 ri = F2FS_INODE(node_page);
233 233
@@ -263,6 +263,10 @@ int update_inode(struct inode *inode, struct page *node_page)
263 set_cold_node(inode, node_page); 263 set_cold_node(inode, node_page);
264 clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); 264 clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
265 265
266 /* deleted inode */
267 if (inode->i_nlink == 0)
268 clear_inline_node(node_page);
269
266 return set_page_dirty(node_page); 270 return set_page_dirty(node_page);
267} 271}
268 272
@@ -320,7 +324,7 @@ void f2fs_evict_inode(struct inode *inode)
320 324
321 /* some remained atomic pages should discarded */ 325 /* some remained atomic pages should discarded */
322 if (f2fs_is_atomic_file(inode)) 326 if (f2fs_is_atomic_file(inode))
323 commit_inmem_pages(inode, true); 327 drop_inmem_pages(inode);
324 328
325 trace_f2fs_evict_inode(inode); 329 trace_f2fs_evict_inode(inode);
326 truncate_inode_pages_final(&inode->i_data); 330 truncate_inode_pages_final(&inode->i_data);
@@ -385,10 +389,7 @@ no_delete:
385 } 389 }
386 } 390 }
387out_clear: 391out_clear:
388#ifdef CONFIG_F2FS_FS_ENCRYPTION 392 fscrypt_put_encryption_info(inode, NULL);
389 if (fi->i_crypt_info)
390 f2fs_free_encryption_info(inode, fi->i_crypt_info);
391#endif
392 clear_inode(inode); 393 clear_inode(inode);
393} 394}
394 395
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 6f944e5eb76e..7876f1052101 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -169,7 +169,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
169 int err; 169 int err;
170 170
171 if (f2fs_encrypted_inode(dir) && 171 if (f2fs_encrypted_inode(dir) &&
172 !f2fs_is_child_context_consistent_with_parent(dir, inode)) 172 !fscrypt_has_permitted_context(dir, inode))
173 return -EPERM; 173 return -EPERM;
174 174
175 f2fs_balance_fs(sbi, true); 175 f2fs_balance_fs(sbi, true);
@@ -260,6 +260,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
260 struct page *page; 260 struct page *page;
261 nid_t ino; 261 nid_t ino;
262 int err = 0; 262 int err = 0;
263 unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
264
265 if (f2fs_encrypted_inode(dir)) {
266 int res = fscrypt_get_encryption_info(dir);
267
268 /*
269 * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
270 * created while the directory was encrypted and we
271 * have access to the key.
272 */
273 if (fscrypt_has_encryption_key(dir))
274 fscrypt_set_encrypted_dentry(dentry);
275 fscrypt_set_d_op(dentry);
276 if (res && res != -ENOKEY)
277 return ERR_PTR(res);
278 }
263 279
264 if (dentry->d_name.len > F2FS_NAME_LEN) 280 if (dentry->d_name.len > F2FS_NAME_LEN)
265 return ERR_PTR(-ENAMETOOLONG); 281 return ERR_PTR(-ENAMETOOLONG);
@@ -276,15 +292,29 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
276 if (IS_ERR(inode)) 292 if (IS_ERR(inode))
277 return ERR_CAST(inode); 293 return ERR_CAST(inode);
278 294
295 if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) {
296 err = __recover_dot_dentries(dir, root_ino);
297 if (err)
298 goto err_out;
299 }
300
279 if (f2fs_has_inline_dots(inode)) { 301 if (f2fs_has_inline_dots(inode)) {
280 err = __recover_dot_dentries(inode, dir->i_ino); 302 err = __recover_dot_dentries(inode, dir->i_ino);
281 if (err) 303 if (err)
282 goto err_out; 304 goto err_out;
283 } 305 }
306 if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) &&
307 (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
308 !fscrypt_has_permitted_context(dir, inode)) {
309 bool nokey = f2fs_encrypted_inode(inode) &&
310 !fscrypt_has_encryption_key(inode);
311 err = nokey ? -ENOKEY : -EPERM;
312 goto err_out;
313 }
284 return d_splice_alias(inode, dentry); 314 return d_splice_alias(inode, dentry);
285 315
286err_out: 316err_out:
287 iget_failed(inode); 317 iput(inode);
288 return ERR_PTR(err); 318 return ERR_PTR(err);
289} 319}
290 320
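A condensed view of the check added above, as a sketch using only the helpers visible in this hunk (the function name lookup_context_check is hypothetical, not a real f2fs symbol):

	static int lookup_context_check(struct inode *dir, struct inode *inode)
	{
		/* only encrypted parents constrain their children */
		if (!f2fs_encrypted_inode(dir))
			return 0;
		/* only directories and symlinks carry a context to compare */
		if (!S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode))
			return 0;
		if (fscrypt_has_permitted_context(dir, inode))
			return 0;
		/* report -ENOKEY when the child is encrypted but keyless,
		 * -EPERM for a genuine policy mismatch */
		return (f2fs_encrypted_inode(inode) &&
			!fscrypt_has_encryption_key(inode)) ? -ENOKEY : -EPERM;
	}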
@@ -345,13 +375,23 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
345 struct f2fs_sb_info *sbi = F2FS_I_SB(dir); 375 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
346 struct inode *inode; 376 struct inode *inode;
347 size_t len = strlen(symname); 377 size_t len = strlen(symname);
348 size_t p_len; 378 struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1);
349 char *p_str; 379 struct fscrypt_symlink_data *sd = NULL;
350 struct f2fs_str disk_link = FSTR_INIT(NULL, 0);
351 struct f2fs_encrypted_symlink_data *sd = NULL;
352 int err; 380 int err;
353 381
354 if (len > dir->i_sb->s_blocksize) 382 if (f2fs_encrypted_inode(dir)) {
383 err = fscrypt_get_encryption_info(dir);
384 if (err)
385 return err;
386
387 if (!fscrypt_has_encryption_key(dir))
388 return -EPERM;
389
390 disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
391 sizeof(struct fscrypt_symlink_data));
392 }
393
394 if (disk_link.len > dir->i_sb->s_blocksize)
355 return -ENAMETOOLONG; 395 return -ENAMETOOLONG;
356 396
357 inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); 397 inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
@@ -374,42 +414,36 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
374 f2fs_unlock_op(sbi); 414 f2fs_unlock_op(sbi);
375 alloc_nid_done(sbi, inode->i_ino); 415 alloc_nid_done(sbi, inode->i_ino);
376 416
377 if (f2fs_encrypted_inode(dir)) { 417 if (f2fs_encrypted_inode(inode)) {
378 struct qstr istr = QSTR_INIT(symname, len); 418 struct qstr istr = QSTR_INIT(symname, len);
419 struct fscrypt_str ostr;
379 420
380 err = f2fs_get_encryption_info(inode); 421 sd = kzalloc(disk_link.len, GFP_NOFS);
381 if (err) 422 if (!sd) {
423 err = -ENOMEM;
382 goto err_out; 424 goto err_out;
425 }
383 426
384 err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link); 427 err = fscrypt_get_encryption_info(inode);
385 if (err) 428 if (err)
386 goto err_out; 429 goto err_out;
387 430
388 err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link); 431 if (!fscrypt_has_encryption_key(inode)) {
389 if (err < 0) 432 err = -EPERM;
390 goto err_out;
391
392 p_len = encrypted_symlink_data_len(disk_link.len) + 1;
393
394 if (p_len > dir->i_sb->s_blocksize) {
395 err = -ENAMETOOLONG;
396 goto err_out; 433 goto err_out;
397 } 434 }
398 435
399 sd = kzalloc(p_len, GFP_NOFS); 436 ostr.name = sd->encrypted_path;
400 if (!sd) { 437 ostr.len = disk_link.len;
401 err = -ENOMEM; 438 err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
439 if (err < 0)
402 goto err_out; 440 goto err_out;
403 } 441
404 memcpy(sd->encrypted_path, disk_link.name, disk_link.len); 442 sd->len = cpu_to_le16(ostr.len);
405 sd->len = cpu_to_le16(disk_link.len); 443 disk_link.name = (char *)sd;
406 p_str = (char *)sd;
407 } else {
408 p_len = len + 1;
409 p_str = (char *)symname;
410 } 444 }
411 445
412 err = page_symlink(inode, p_str, p_len); 446 err = page_symlink(inode, disk_link.name, disk_link.len);
413 447
414err_out: 448err_out:
415 d_instantiate(dentry, inode); 449 d_instantiate(dentry, inode);
@@ -425,7 +459,8 @@ err_out:
425 * performance regression. 459 * performance regression.
426 */ 460 */
427 if (!err) { 461 if (!err) {
428 filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); 462 filemap_write_and_wait_range(inode->i_mapping, 0,
463 disk_link.len - 1);
429 464
430 if (IS_DIRSYNC(dir)) 465 if (IS_DIRSYNC(dir))
431 f2fs_sync_fs(sbi->sb, 1); 466 f2fs_sync_fs(sbi->sb, 1);
@@ -434,7 +469,6 @@ err_out:
434 } 469 }
435 470
436 kfree(sd); 471 kfree(sd);
437 f2fs_fname_crypto_free_buffer(&disk_link);
438 return err; 472 return err;
439out: 473out:
440 handle_failed_inode(inode); 474 handle_failed_inode(inode);
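For reference, the encrypted symlink payload assembled above is a length-prefixed blob; its shape, as declared in the fscrypt headers of this era (reproduced here for context, not part of this diff), is roughly:

	struct fscrypt_symlink_data {
		__le16 len;             /* bytes in encrypted_path */
		char encrypted_path[1]; /* ciphertext, variable length */
	} __packed;

This is why disk_link.len is computed as the encrypted name size plus sizeof(struct fscrypt_symlink_data), and why the whole payload must still fit in a single block.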
@@ -582,7 +616,7 @@ out:
582static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 616static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
583{ 617{
584 if (f2fs_encrypted_inode(dir)) { 618 if (f2fs_encrypted_inode(dir)) {
585 int err = f2fs_get_encryption_info(dir); 619 int err = fscrypt_get_encryption_info(dir);
586 if (err) 620 if (err)
587 return err; 621 return err;
588 } 622 }
@@ -608,11 +642,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
608 struct f2fs_dir_entry *old_dir_entry = NULL; 642 struct f2fs_dir_entry *old_dir_entry = NULL;
609 struct f2fs_dir_entry *old_entry; 643 struct f2fs_dir_entry *old_entry;
610 struct f2fs_dir_entry *new_entry; 644 struct f2fs_dir_entry *new_entry;
645 bool is_old_inline = f2fs_has_inline_dentry(old_dir);
611 int err = -ENOENT; 646 int err = -ENOENT;
612 647
613 if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && 648 if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
614 !f2fs_is_child_context_consistent_with_parent(new_dir, 649 !fscrypt_has_permitted_context(new_dir, old_inode)) {
615 old_inode)) {
616 err = -EPERM; 650 err = -EPERM;
617 goto out; 651 goto out;
618 } 652 }
@@ -654,8 +688,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
654 if (err) 688 if (err)
655 goto put_out_dir; 689 goto put_out_dir;
656 690
657 if (update_dent_inode(old_inode, new_inode, 691 err = update_dent_inode(old_inode, new_inode,
658 &new_dentry->d_name)) { 692 &new_dentry->d_name);
693 if (err) {
659 release_orphan_inode(sbi); 694 release_orphan_inode(sbi);
660 goto put_out_dir; 695 goto put_out_dir;
661 } 696 }
@@ -693,6 +728,26 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
693 inc_nlink(new_dir); 728 inc_nlink(new_dir);
694 update_inode_page(new_dir); 729 update_inode_page(new_dir);
695 } 730 }
731
732 /*
733 * The old entry and the new entry can live in the same inline
734 * dentry block; attaching the new entry may force an inline
735 * dentry conversion, after which old_entry and old_page would
736 * point to stale addresses. Redo the lookup here to avoid
737 * that.
738 */
739 if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) {
740 f2fs_put_page(old_page, 0);
741 old_page = NULL;
742
743 old_entry = f2fs_find_entry(old_dir,
744 &old_dentry->d_name, &old_page);
745 if (!old_entry) {
746 err = -EIO;
747 f2fs_unlock_op(sbi);
748 goto out_whiteout;
749 }
750 }
696 } 751 }
697 752
698 down_write(&F2FS_I(old_inode)->i_sem); 753 down_write(&F2FS_I(old_inode)->i_sem);
@@ -771,11 +826,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
771 int err = -ENOENT; 826 int err = -ENOENT;
772 827
773 if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && 828 if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) &&
774 (old_dir != new_dir) && 829 (old_dir != new_dir) &&
775 (!f2fs_is_child_context_consistent_with_parent(new_dir, 830 (!fscrypt_has_permitted_context(new_dir, old_inode) ||
776 old_inode) || 831 !fscrypt_has_permitted_context(old_dir, new_inode)))
777 !f2fs_is_child_context_consistent_with_parent(old_dir,
778 new_inode)))
779 return -EPERM; 832 return -EPERM;
780 833
781 old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); 834 old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
@@ -937,16 +990,15 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
937 return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); 990 return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
938} 991}
939 992
940#ifdef CONFIG_F2FS_FS_ENCRYPTION
941static const char *f2fs_encrypted_get_link(struct dentry *dentry, 993static const char *f2fs_encrypted_get_link(struct dentry *dentry,
942 struct inode *inode, 994 struct inode *inode,
943 struct delayed_call *done) 995 struct delayed_call *done)
944{ 996{
945 struct page *cpage = NULL; 997 struct page *cpage = NULL;
946 char *caddr, *paddr = NULL; 998 char *caddr, *paddr = NULL;
947 struct f2fs_str cstr = FSTR_INIT(NULL, 0); 999 struct fscrypt_str cstr = FSTR_INIT(NULL, 0);
948 struct f2fs_str pstr = FSTR_INIT(NULL, 0); 1000 struct fscrypt_str pstr = FSTR_INIT(NULL, 0);
949 struct f2fs_encrypted_symlink_data *sd; 1001 struct fscrypt_symlink_data *sd;
950 loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); 1002 loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
951 u32 max_size = inode->i_sb->s_blocksize; 1003 u32 max_size = inode->i_sb->s_blocksize;
952 int res; 1004 int res;
@@ -954,7 +1006,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
954 if (!dentry) 1006 if (!dentry)
955 return ERR_PTR(-ECHILD); 1007 return ERR_PTR(-ECHILD);
956 1008
957 res = f2fs_get_encryption_info(inode); 1009 res = fscrypt_get_encryption_info(inode);
958 if (res) 1010 if (res)
959 return ERR_PTR(res); 1011 return ERR_PTR(res);
960 1012
@@ -965,7 +1017,8 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
965 caddr[size] = 0; 1017 caddr[size] = 0;
966 1018
967 /* Symlink is encrypted */ 1019 /* Symlink is encrypted */
968 sd = (struct f2fs_encrypted_symlink_data *)caddr; 1020 sd = (struct fscrypt_symlink_data *)caddr;
1021 cstr.name = sd->encrypted_path;
969 cstr.len = le16_to_cpu(sd->len); 1022 cstr.len = le16_to_cpu(sd->len);
970 1023
971 /* this is broken symlink case */ 1024 /* this is broken symlink case */
@@ -973,12 +1026,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
973 res = -ENOENT; 1026 res = -ENOENT;
974 goto errout; 1027 goto errout;
975 } 1028 }
976 cstr.name = kmalloc(cstr.len, GFP_NOFS);
977 if (!cstr.name) {
978 res = -ENOMEM;
979 goto errout;
980 }
981 memcpy(cstr.name, sd->encrypted_path, cstr.len);
982 1029
983 /* this is broken symlink case */ 1030 /* this is broken symlink case */
984 if (unlikely(cstr.name[0] == 0)) { 1031 if (unlikely(cstr.name[0] == 0)) {
@@ -986,22 +1033,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
986 goto errout; 1033 goto errout;
987 } 1034 }
988 1035
989 if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) > 1036 if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) {
990 max_size) {
991 /* Symlink data on the disk is corrupted */ 1037 /* Symlink data on the disk is corrupted */
992 res = -EIO; 1038 res = -EIO;
993 goto errout; 1039 goto errout;
994 } 1040 }
995 res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr); 1041 res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
996 if (res) 1042 if (res)
997 goto errout; 1043 goto errout;
998 1044
999 res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr); 1045 res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
1000 if (res < 0) 1046 if (res < 0)
1001 goto errout; 1047 goto errout;
1002 1048
1003 kfree(cstr.name);
1004
1005 paddr = pstr.name; 1049 paddr = pstr.name;
1006 1050
1007 /* Null-terminate the name */ 1051 /* Null-terminate the name */
@@ -1011,8 +1055,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
1011 set_delayed_call(done, kfree_link, paddr); 1055 set_delayed_call(done, kfree_link, paddr);
1012 return paddr; 1056 return paddr;
1013errout: 1057errout:
1014 kfree(cstr.name); 1058 fscrypt_fname_free_buffer(&pstr);
1015 f2fs_fname_crypto_free_buffer(&pstr);
1016 page_cache_release(cpage); 1059 page_cache_release(cpage);
1017 return ERR_PTR(res); 1060 return ERR_PTR(res);
1018} 1061}
@@ -1029,7 +1072,6 @@ const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
1029 .removexattr = generic_removexattr, 1072 .removexattr = generic_removexattr,
1030#endif 1073#endif
1031}; 1074};
1032#endif
1033 1075
1034const struct inode_operations f2fs_dir_inode_operations = { 1076const struct inode_operations f2fs_dir_inode_operations = {
1035 .create = f2fs_create, 1077 .create = f2fs_create,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 342597a5897f..118321bd1a7f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -257,15 +257,20 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
257 return new; 257 return new;
258} 258}
259 259
260static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, 260static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
261 struct f2fs_nat_entry *ne) 261 struct f2fs_nat_entry *ne)
262{ 262{
263 struct f2fs_nm_info *nm_i = NM_I(sbi);
263 struct nat_entry *e; 264 struct nat_entry *e;
264 265
265 e = __lookup_nat_cache(nm_i, nid); 266 e = __lookup_nat_cache(nm_i, nid);
266 if (!e) { 267 if (!e) {
267 e = grab_nat_entry(nm_i, nid); 268 e = grab_nat_entry(nm_i, nid);
268 node_info_from_raw_nat(&e->ni, ne); 269 node_info_from_raw_nat(&e->ni, ne);
270 } else {
271 f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino ||
272 nat_get_blkaddr(e) != ne->block_addr ||
273 nat_get_version(e) != ne->version);
269 } 274 }
270} 275}
271 276
@@ -354,7 +359,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
354{ 359{
355 struct f2fs_nm_info *nm_i = NM_I(sbi); 360 struct f2fs_nm_info *nm_i = NM_I(sbi);
356 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 361 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
357 struct f2fs_summary_block *sum = curseg->sum_blk; 362 struct f2fs_journal *journal = curseg->journal;
358 nid_t start_nid = START_NID(nid); 363 nid_t start_nid = START_NID(nid);
359 struct f2fs_nat_block *nat_blk; 364 struct f2fs_nat_block *nat_blk;
360 struct page *page = NULL; 365 struct page *page = NULL;
@@ -371,23 +376,20 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
371 ni->ino = nat_get_ino(e); 376 ni->ino = nat_get_ino(e);
372 ni->blk_addr = nat_get_blkaddr(e); 377 ni->blk_addr = nat_get_blkaddr(e);
373 ni->version = nat_get_version(e); 378 ni->version = nat_get_version(e);
374 } 379 up_read(&nm_i->nat_tree_lock);
375 up_read(&nm_i->nat_tree_lock);
376 if (e)
377 return; 380 return;
381 }
378 382
379 memset(&ne, 0, sizeof(struct f2fs_nat_entry)); 383 memset(&ne, 0, sizeof(struct f2fs_nat_entry));
380 384
381 down_write(&nm_i->nat_tree_lock);
382
383 /* Check current segment summary */ 385 /* Check current segment summary */
384 mutex_lock(&curseg->curseg_mutex); 386 down_read(&curseg->journal_rwsem);
385 i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); 387 i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
386 if (i >= 0) { 388 if (i >= 0) {
387 ne = nat_in_journal(sum, i); 389 ne = nat_in_journal(journal, i);
388 node_info_from_raw_nat(ni, &ne); 390 node_info_from_raw_nat(ni, &ne);
389 } 391 }
390 mutex_unlock(&curseg->curseg_mutex); 392 up_read(&curseg->journal_rwsem);
391 if (i >= 0) 393 if (i >= 0)
392 goto cache; 394 goto cache;
393 395
@@ -398,19 +400,52 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
398 node_info_from_raw_nat(ni, &ne); 400 node_info_from_raw_nat(ni, &ne);
399 f2fs_put_page(page, 1); 401 f2fs_put_page(page, 1);
400cache: 402cache:
403 up_read(&nm_i->nat_tree_lock);
401 /* cache nat entry */ 404 /* cache nat entry */
402 cache_nat_entry(NM_I(sbi), nid, &ne); 405 down_write(&nm_i->nat_tree_lock);
406 cache_nat_entry(sbi, nid, &ne);
403 up_write(&nm_i->nat_tree_lock); 407 up_write(&nm_i->nat_tree_lock);
404} 408}
405 409
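The locking rework turns get_node_info() into a three-stage lookup that holds nat_tree_lock only for reading, and re-takes it for writing just to fill the cache. A skeletal view, where fill_from_cache() and read_nat_block() are placeholder names for code elided here, not real helpers:

	down_read(&nm_i->nat_tree_lock);
	e = __lookup_nat_cache(nm_i, nid);         /* stage 1: NAT cache */
	if (e) {
		fill_from_cache(ni, e);            /* placeholder */
		up_read(&nm_i->nat_tree_lock);
		return;
	}
	down_read(&curseg->journal_rwsem);         /* stage 2: NAT journal */
	i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
	if (i >= 0)
		ne = nat_in_journal(journal, i);
	up_read(&curseg->journal_rwsem);
	if (i < 0)
		read_nat_block(sbi, nid, &ne);     /* stage 3: on-disk NAT block */
	up_read(&nm_i->nat_tree_lock);

	down_write(&nm_i->nat_tree_lock);          /* publish to the cache */
	cache_nat_entry(sbi, nid, &ne);
	up_write(&nm_i->nat_tree_lock);

Dropping the read lock before taking the write lock leaves a window in which another task can cache the entry first; the new f2fs_bug_on() cross-check in cache_nat_entry() above verifies that such a racing insertion agrees with what we read.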
410pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
411{
412 const long direct_index = ADDRS_PER_INODE(dn->inode);
413 const long direct_blks = ADDRS_PER_BLOCK;
414 const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
415 unsigned int skipped_unit = ADDRS_PER_BLOCK;
416 int cur_level = dn->cur_level;
417 int max_level = dn->max_level;
418 pgoff_t base = 0;
419
420 if (!dn->max_level)
421 return pgofs + 1;
422
423 while (max_level-- > cur_level)
424 skipped_unit *= NIDS_PER_BLOCK;
425
426 switch (dn->max_level) {
427 case 3:
428 base += 2 * indirect_blks;
429 case 2:
430 base += 2 * direct_blks;
431 case 1:
432 base += direct_index;
433 break;
434 default:
435 f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
436 }
437
438 return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
439}
440
406/* 441/*
407 * The maximum depth is four. 442 * The maximum depth is four.
408 * Offset[0] will have raw inode offset. 443 * Offset[0] will have raw inode offset.
409 */ 444 */
410static int get_node_path(struct f2fs_inode_info *fi, long block, 445static int get_node_path(struct inode *inode, long block,
411 int offset[4], unsigned int noffset[4]) 446 int offset[4], unsigned int noffset[4])
412{ 447{
413 const long direct_index = ADDRS_PER_INODE(fi); 448 const long direct_index = ADDRS_PER_INODE(inode);
414 const long direct_blks = ADDRS_PER_BLOCK; 449 const long direct_blks = ADDRS_PER_BLOCK;
415 const long dptrs_per_blk = NIDS_PER_BLOCK; 450 const long dptrs_per_blk = NIDS_PER_BLOCK;
416 const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; 451 const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
@@ -495,10 +530,10 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
495 int offset[4]; 530 int offset[4];
496 unsigned int noffset[4]; 531 unsigned int noffset[4];
497 nid_t nids[4]; 532 nid_t nids[4];
498 int level, i; 533 int level, i = 0;
499 int err = 0; 534 int err = 0;
500 535
501 level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); 536 level = get_node_path(dn->inode, index, offset, noffset);
502 537
503 nids[0] = dn->inode->i_ino; 538 nids[0] = dn->inode->i_ino;
504 npage[0] = dn->inode_page; 539 npage[0] = dn->inode_page;
@@ -585,6 +620,10 @@ release_pages:
585release_out: 620release_out:
586 dn->inode_page = NULL; 621 dn->inode_page = NULL;
587 dn->node_page = NULL; 622 dn->node_page = NULL;
623 if (err == -ENOENT) {
624 dn->cur_level = i;
625 dn->max_level = level;
626 }
588 return err; 627 return err;
589} 628}
590 629
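Because cur_level and max_level are now stashed on -ENOENT, get_next_page_offset() can step over an entire missing subtree instead of probing offset by offset. A self-contained illustration of the arithmetic as written, with typical 4 KB-block constants (923/1018/1018 are assumed values; they vary with inline xattrs):

	#include <stdio.h>

	int main(void)
	{
		const long direct_index = 923;    /* ADDRS_PER_INODE, assumed */
		const long direct_blks = 1018;    /* ADDRS_PER_BLOCK, assumed */
		const long nids_per_blk = 1018;   /* NIDS_PER_BLOCK, assumed */
		int cur_level = 1, max_level = 2; /* as stashed by get_dnode_of_data */
		long skipped_unit = direct_blks;
		long base = 2 * direct_blks + direct_index; /* cases 2 + 1 */
		long pgofs = 500000;              /* offset that hit -ENOENT */

		while (max_level-- > cur_level)
			skipped_unit *= nids_per_blk;   /* 1018 * 1018 = 1036324 */

		/* first offset past the missing subtree: prints 1039283 */
		printf("%ld\n",
		       ((pgofs - base) / skipped_unit + 1) * skipped_unit + base);
		return 0;
	}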
@@ -792,7 +831,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
792 831
793 trace_f2fs_truncate_inode_blocks_enter(inode, from); 832 trace_f2fs_truncate_inode_blocks_enter(inode, from);
794 833
795 level = get_node_path(F2FS_I(inode), from, offset, noffset); 834 level = get_node_path(inode, from, offset, noffset);
796restart: 835restart:
797 page = get_node_page(sbi, inode->i_ino); 836 page = get_node_page(sbi, inode->i_ino);
798 if (IS_ERR(page)) { 837 if (IS_ERR(page)) {
@@ -861,7 +900,7 @@ skip_partial:
861 f2fs_put_page(page, 1); 900 f2fs_put_page(page, 1);
862 goto restart; 901 goto restart;
863 } 902 }
864 f2fs_wait_on_page_writeback(page, NODE); 903 f2fs_wait_on_page_writeback(page, NODE, true);
865 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; 904 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
866 set_page_dirty(page); 905 set_page_dirty(page);
867 unlock_page(page); 906 unlock_page(page);
@@ -976,7 +1015,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
976 new_ni.ino = dn->inode->i_ino; 1015 new_ni.ino = dn->inode->i_ino;
977 set_node_addr(sbi, &new_ni, NEW_ADDR, false); 1016 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
978 1017
979 f2fs_wait_on_page_writeback(page, NODE); 1018 f2fs_wait_on_page_writeback(page, NODE, true);
980 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); 1019 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
981 set_cold_node(dn->inode, page); 1020 set_cold_node(dn->inode, page);
982 SetPageUptodate(page); 1021 SetPageUptodate(page);
@@ -1029,7 +1068,7 @@ static int read_node_page(struct page *page, int rw)
1029 if (PageUptodate(page)) 1068 if (PageUptodate(page))
1030 return LOCKED_PAGE; 1069 return LOCKED_PAGE;
1031 1070
1032 fio.blk_addr = ni.blk_addr; 1071 fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
1033 return f2fs_submit_page_bio(&fio); 1072 return f2fs_submit_page_bio(&fio);
1034} 1073}
1035 1074
@@ -1045,12 +1084,11 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1045 return; 1084 return;
1046 f2fs_bug_on(sbi, check_nid_range(sbi, nid)); 1085 f2fs_bug_on(sbi, check_nid_range(sbi, nid));
1047 1086
1048 apage = find_get_page(NODE_MAPPING(sbi), nid); 1087 rcu_read_lock();
1049 if (apage && PageUptodate(apage)) { 1088 apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
1050 f2fs_put_page(apage, 0); 1089 rcu_read_unlock();
1090 if (apage)
1051 return; 1091 return;
1052 }
1053 f2fs_put_page(apage, 0);
1054 1092
1055 apage = grab_cache_page(NODE_MAPPING(sbi), nid); 1093 apage = grab_cache_page(NODE_MAPPING(sbi), nid);
1056 if (!apage) 1094 if (!apage)
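The readahead fast path now peeks at the page cache locklessly instead of taking and dropping a page reference. That is safe only because the result is advisory; the same calls, annotated:

	rcu_read_lock();
	/* lockless peek: no reference is taken, so the page may be freed
	 * or truncated the moment rcu_read_lock() is dropped */
	apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
	rcu_read_unlock();
	if (apage)
		return;  /* likely cached already; skipping one RA is harmless */

	/* miss (or lost race): fall back to the normal referenced lookup */
	apage = grab_cache_page(NODE_MAPPING(sbi), nid);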
@@ -1063,7 +1101,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1063/* 1101/*
1064 * readahead MAX_RA_NODE number of node pages. 1102 * readahead MAX_RA_NODE number of node pages.
1065 */ 1103 */
1066void ra_node_pages(struct page *parent, int start) 1104static void ra_node_pages(struct page *parent, int start)
1067{ 1105{
1068 struct f2fs_sb_info *sbi = F2FS_P_SB(parent); 1106 struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
1069 struct blk_plug plug; 1107 struct blk_plug plug;
@@ -1083,7 +1121,7 @@ void ra_node_pages(struct page *parent, int start)
1083 blk_finish_plug(&plug); 1121 blk_finish_plug(&plug);
1084} 1122}
1085 1123
1086struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, 1124static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
1087 struct page *parent, int start) 1125 struct page *parent, int start)
1088{ 1126{
1089 struct page *page; 1127 struct page *page;
@@ -1154,19 +1192,57 @@ void sync_inode_page(struct dnode_of_data *dn)
1154 dn->node_changed = ret ? true : false; 1192
1155} 1193}
1156 1194
1195static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
1196{
1197 struct inode *inode;
1198 struct page *page;
1199
1200 /* should flush inline_data before evict_inode */
1201 inode = ilookup(sbi->sb, ino);
1202 if (!inode)
1203 return;
1204
1205 page = pagecache_get_page(inode->i_mapping, 0, FGP_NOWAIT, 0);
1206 if (!page)
1207 goto iput_out;
1208
1209 if (!trylock_page(page))
1210 goto release_out;
1211
1212 if (!PageUptodate(page))
1213 goto page_out;
1214
1215 if (!PageDirty(page))
1216 goto page_out;
1217
1218 if (!clear_page_dirty_for_io(page))
1219 goto page_out;
1220
1221 if (!f2fs_write_inline_data(inode, page))
1222 inode_dec_dirty_pages(inode);
1223 else
1224 set_page_dirty(page);
1225page_out:
1226 unlock_page(page);
1227release_out:
1228 f2fs_put_page(page, 0);
1229iput_out:
1230 iput(inode);
1231}
1232
1157int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, 1233int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
1158 struct writeback_control *wbc) 1234 struct writeback_control *wbc)
1159{ 1235{
1160 pgoff_t index, end; 1236 pgoff_t index, end;
1161 struct pagevec pvec; 1237 struct pagevec pvec;
1162 int step = ino ? 2 : 0; 1238 int step = ino ? 2 : 0;
1163 int nwritten = 0, wrote = 0; 1239 int nwritten = 0;
1164 1240
1165 pagevec_init(&pvec, 0); 1241 pagevec_init(&pvec, 0);
1166 1242
1167next_step: 1243next_step:
1168 index = 0; 1244 index = 0;
1169 end = LONG_MAX; 1245 end = ULONG_MAX;
1170 1246
1171 while (index <= end) { 1247 while (index <= end) {
1172 int i, nr_pages; 1248 int i, nr_pages;
@@ -1203,6 +1279,7 @@ next_step:
1203 * In fsync mode, 1279
1204 * we should not skip writing node pages. 1280 * we should not skip writing node pages.
1205 */ 1281 */
1282lock_node:
1206 if (ino && ino_of_node(page) == ino) 1283 if (ino && ino_of_node(page) == ino)
1207 lock_page(page); 1284 lock_page(page);
1208 else if (!trylock_page(page)) 1285 else if (!trylock_page(page))
@@ -1221,6 +1298,17 @@ continue_unlock:
1221 goto continue_unlock; 1298 goto continue_unlock;
1222 } 1299 }
1223 1300
1301 /* flush inline_data */
1302 if (!ino && is_inline_node(page)) {
1303 clear_inline_node(page);
1304 unlock_page(page);
1305 flush_inline_data(sbi, ino_of_node(page));
1306 goto lock_node;
1307 }
1308
1309 f2fs_wait_on_page_writeback(page, NODE, true);
1310
1311 BUG_ON(PageWriteback(page));
1224 if (!clear_page_dirty_for_io(page)) 1312 if (!clear_page_dirty_for_io(page))
1225 goto continue_unlock; 1313 goto continue_unlock;
1226 1314
@@ -1238,8 +1326,6 @@ continue_unlock:
1238 1326
1239 if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) 1327 if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
1240 unlock_page(page); 1328 unlock_page(page);
1241 else
1242 wrote++;
1243 1329
1244 if (--wbc->nr_to_write == 0) 1330 if (--wbc->nr_to_write == 0)
1245 break; 1331 break;
@@ -1257,15 +1343,12 @@ continue_unlock:
1257 step++; 1343 step++;
1258 goto next_step; 1344 goto next_step;
1259 } 1345 }
1260
1261 if (wrote)
1262 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1263 return nwritten; 1346 return nwritten;
1264} 1347}
1265 1348
1266int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) 1349int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1267{ 1350{
1268 pgoff_t index = 0, end = LONG_MAX; 1351 pgoff_t index = 0, end = ULONG_MAX;
1269 struct pagevec pvec; 1352 struct pagevec pvec;
1270 int ret2 = 0, ret = 0; 1353 int ret2 = 0, ret = 0;
1271 1354
@@ -1287,7 +1370,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1287 continue; 1370 continue;
1288 1371
1289 if (ino && ino_of_node(page) == ino) { 1372 if (ino && ino_of_node(page) == ino) {
1290 f2fs_wait_on_page_writeback(page, NODE); 1373 f2fs_wait_on_page_writeback(page, NODE, true);
1291 if (TestClearPageError(page)) 1374 if (TestClearPageError(page))
1292 ret = -EIO; 1375 ret = -EIO;
1293 } 1376 }
@@ -1326,8 +1409,6 @@ static int f2fs_write_node_page(struct page *page,
1326 if (unlikely(f2fs_cp_error(sbi))) 1409 if (unlikely(f2fs_cp_error(sbi)))
1327 goto redirty_out; 1410 goto redirty_out;
1328 1411
1329 f2fs_wait_on_page_writeback(page, NODE);
1330
1331 /* get old block addr of this node page */ 1412 /* get old block addr of this node page */
1332 nid = nid_of_node(page); 1413 nid = nid_of_node(page);
1333 f2fs_bug_on(sbi, page->index != nid); 1414 f2fs_bug_on(sbi, page->index != nid);
@@ -1351,14 +1432,18 @@ static int f2fs_write_node_page(struct page *page,
1351 } 1432 }
1352 1433
1353 set_page_writeback(page); 1434 set_page_writeback(page);
1354 fio.blk_addr = ni.blk_addr; 1435 fio.old_blkaddr = ni.blk_addr;
1355 write_node_page(nid, &fio); 1436 write_node_page(nid, &fio);
1356 set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); 1437 set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
1357 dec_page_count(sbi, F2FS_DIRTY_NODES); 1438 dec_page_count(sbi, F2FS_DIRTY_NODES);
1358 up_read(&sbi->node_write); 1439 up_read(&sbi->node_write);
1440
1441 if (wbc->for_reclaim)
1442 f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE);
1443
1359 unlock_page(page); 1444 unlock_page(page);
1360 1445
1361 if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) 1446 if (unlikely(f2fs_cp_error(sbi)))
1362 f2fs_submit_merged_bio(sbi, NODE, WRITE); 1447 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1363 1448
1364 return 0; 1449 return 0;
@@ -1374,8 +1459,6 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1374 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); 1459 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
1375 long diff; 1460 long diff;
1376 1461
1377 trace_f2fs_writepages(mapping->host, wbc, NODE);
1378
1379 /* balancing f2fs's metadata in background */ 1462 /* balancing f2fs's metadata in background */
1380 f2fs_balance_fs_bg(sbi); 1463 f2fs_balance_fs_bg(sbi);
1381 1464
@@ -1383,6 +1466,8 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1383 if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) 1466 if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
1384 goto skip_write; 1467 goto skip_write;
1385 1468
1469 trace_f2fs_writepages(mapping->host, wbc, NODE);
1470
1386 diff = nr_pages_to_write(sbi, NODE, wbc); 1471 diff = nr_pages_to_write(sbi, NODE, wbc);
1387 wbc->sync_mode = WB_SYNC_NONE; 1472 wbc->sync_mode = WB_SYNC_NONE;
1388 sync_node_pages(sbi, 0, wbc); 1473 sync_node_pages(sbi, 0, wbc);
@@ -1391,6 +1476,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1391 1476
1392skip_write: 1477skip_write:
1393 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); 1478 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
1479 trace_f2fs_writepages(mapping->host, wbc, NODE);
1394 return 0; 1480 return 0;
1395} 1481}
1396 1482
@@ -1526,7 +1612,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
1526{ 1612{
1527 struct f2fs_nm_info *nm_i = NM_I(sbi); 1613 struct f2fs_nm_info *nm_i = NM_I(sbi);
1528 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1614 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1529 struct f2fs_summary_block *sum = curseg->sum_blk; 1615 struct f2fs_journal *journal = curseg->journal;
1530 int i = 0; 1616 int i = 0;
1531 nid_t nid = nm_i->next_scan_nid; 1617 nid_t nid = nm_i->next_scan_nid;
1532 1618
@@ -1558,16 +1644,18 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
1558 nm_i->next_scan_nid = nid; 1644 nm_i->next_scan_nid = nid;
1559 1645
1560 /* find free nids from current sum_pages */ 1646 /* find free nids from current sum_pages */
1561 mutex_lock(&curseg->curseg_mutex); 1647 down_read(&curseg->journal_rwsem);
1562 for (i = 0; i < nats_in_cursum(sum); i++) { 1648 for (i = 0; i < nats_in_cursum(journal); i++) {
1563 block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); 1649 block_t addr;
1564 nid = le32_to_cpu(nid_in_journal(sum, i)); 1650
1651 addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
1652 nid = le32_to_cpu(nid_in_journal(journal, i));
1565 if (addr == NULL_ADDR) 1653 if (addr == NULL_ADDR)
1566 add_free_nid(sbi, nid, true); 1654 add_free_nid(sbi, nid, true);
1567 else 1655 else
1568 remove_free_nid(nm_i, nid); 1656 remove_free_nid(nm_i, nid);
1569 } 1657 }
1570 mutex_unlock(&curseg->curseg_mutex); 1658 up_read(&curseg->journal_rwsem);
1571 up_read(&nm_i->nat_tree_lock); 1659 up_read(&nm_i->nat_tree_lock);
1572 1660
1573 ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), 1661 ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
@@ -1703,7 +1791,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
1703 src_addr = inline_xattr_addr(page); 1791 src_addr = inline_xattr_addr(page);
1704 inline_size = inline_xattr_size(inode); 1792 inline_size = inline_xattr_size(inode);
1705 1793
1706 f2fs_wait_on_page_writeback(ipage, NODE); 1794 f2fs_wait_on_page_writeback(ipage, NODE, true);
1707 memcpy(dst_addr, src_addr, inline_size); 1795 memcpy(dst_addr, src_addr, inline_size);
1708update_inode: 1796update_inode:
1709 update_inode(inode, ipage); 1797 update_inode(inode, ipage);
@@ -1831,16 +1919,16 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
1831{ 1919{
1832 struct f2fs_nm_info *nm_i = NM_I(sbi); 1920 struct f2fs_nm_info *nm_i = NM_I(sbi);
1833 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1921 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1834 struct f2fs_summary_block *sum = curseg->sum_blk; 1922 struct f2fs_journal *journal = curseg->journal;
1835 int i; 1923 int i;
1836 1924
1837 mutex_lock(&curseg->curseg_mutex); 1925 down_write(&curseg->journal_rwsem);
1838 for (i = 0; i < nats_in_cursum(sum); i++) { 1926 for (i = 0; i < nats_in_cursum(journal); i++) {
1839 struct nat_entry *ne; 1927 struct nat_entry *ne;
1840 struct f2fs_nat_entry raw_ne; 1928 struct f2fs_nat_entry raw_ne;
1841 nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); 1929 nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
1842 1930
1843 raw_ne = nat_in_journal(sum, i); 1931 raw_ne = nat_in_journal(journal, i);
1844 1932
1845 ne = __lookup_nat_cache(nm_i, nid); 1933 ne = __lookup_nat_cache(nm_i, nid);
1846 if (!ne) { 1934 if (!ne) {
@@ -1849,8 +1937,8 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
1849 } 1937 }
1850 __set_nat_cache_dirty(nm_i, ne); 1938 __set_nat_cache_dirty(nm_i, ne);
1851 } 1939 }
1852 update_nats_in_cursum(sum, -i); 1940 update_nats_in_cursum(journal, -i);
1853 mutex_unlock(&curseg->curseg_mutex); 1941 up_write(&curseg->journal_rwsem);
1854} 1942}
1855 1943
1856static void __adjust_nat_entry_set(struct nat_entry_set *nes, 1944static void __adjust_nat_entry_set(struct nat_entry_set *nes,
@@ -1875,7 +1963,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1875 struct nat_entry_set *set) 1963 struct nat_entry_set *set)
1876{ 1964{
1877 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1965 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1878 struct f2fs_summary_block *sum = curseg->sum_blk; 1966 struct f2fs_journal *journal = curseg->journal;
1879 nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; 1967 nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
1880 bool to_journal = true; 1968 bool to_journal = true;
1881 struct f2fs_nat_block *nat_blk; 1969 struct f2fs_nat_block *nat_blk;
@@ -1887,11 +1975,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1887 * #1, flush nat entries to journal in current hot data summary block. 1975 * #1, flush nat entries to journal in current hot data summary block.
1888 * #2, flush nat entries to nat page. 1976 * #2, flush nat entries to nat page.
1889 */ 1977 */
1890 if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) 1978 if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
1891 to_journal = false; 1979 to_journal = false;
1892 1980
1893 if (to_journal) { 1981 if (to_journal) {
1894 mutex_lock(&curseg->curseg_mutex); 1982 down_write(&curseg->journal_rwsem);
1895 } else { 1983 } else {
1896 page = get_next_nat_page(sbi, start_nid); 1984 page = get_next_nat_page(sbi, start_nid);
1897 nat_blk = page_address(page); 1985 nat_blk = page_address(page);
@@ -1908,11 +1996,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1908 continue; 1996 continue;
1909 1997
1910 if (to_journal) { 1998 if (to_journal) {
1911 offset = lookup_journal_in_cursum(sum, 1999 offset = lookup_journal_in_cursum(journal,
1912 NAT_JOURNAL, nid, 1); 2000 NAT_JOURNAL, nid, 1);
1913 f2fs_bug_on(sbi, offset < 0); 2001 f2fs_bug_on(sbi, offset < 0);
1914 raw_ne = &nat_in_journal(sum, offset); 2002 raw_ne = &nat_in_journal(journal, offset);
1915 nid_in_journal(sum, offset) = cpu_to_le32(nid); 2003 nid_in_journal(journal, offset) = cpu_to_le32(nid);
1916 } else { 2004 } else {
1917 raw_ne = &nat_blk->entries[nid - start_nid]; 2005 raw_ne = &nat_blk->entries[nid - start_nid];
1918 } 2006 }
@@ -1924,7 +2012,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1924 } 2012 }
1925 2013
1926 if (to_journal) 2014 if (to_journal)
1927 mutex_unlock(&curseg->curseg_mutex); 2015 up_write(&curseg->journal_rwsem);
1928 else 2016 else
1929 f2fs_put_page(page, 1); 2017 f2fs_put_page(page, 1);
1930 2018
@@ -1941,7 +2029,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1941{ 2029{
1942 struct f2fs_nm_info *nm_i = NM_I(sbi); 2030 struct f2fs_nm_info *nm_i = NM_I(sbi);
1943 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 2031 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1944 struct f2fs_summary_block *sum = curseg->sum_blk; 2032 struct f2fs_journal *journal = curseg->journal;
1945 struct nat_entry_set *setvec[SETVEC_SIZE]; 2033 struct nat_entry_set *setvec[SETVEC_SIZE];
1946 struct nat_entry_set *set, *tmp; 2034 struct nat_entry_set *set, *tmp;
1947 unsigned int found; 2035 unsigned int found;
@@ -1958,7 +2046,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1958 * entries, remove all entries from journal and merge them 2046 * entries, remove all entries from journal and merge them
1959 * into nat entry set. 2047 * into nat entry set.
1960 */ 2048 */
1961 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) 2049 if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
1962 remove_nats_in_journal(sbi); 2050 remove_nats_in_journal(sbi);
1963 2051
1964 while ((found = __gang_lookup_nat_set(nm_i, 2052 while ((found = __gang_lookup_nat_set(nm_i,
@@ -1967,7 +2055,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1967 set_idx = setvec[found - 1]->set + 1; 2055 set_idx = setvec[found - 1]->set + 1;
1968 for (idx = 0; idx < found; idx++) 2056 for (idx = 0; idx < found; idx++)
1969 __adjust_nat_entry_set(setvec[idx], &sets, 2057 __adjust_nat_entry_set(setvec[idx], &sets,
1970 MAX_NAT_JENTRIES(sum)); 2058 MAX_NAT_JENTRIES(journal));
1971 } 2059 }
1972 2060
1973 /* flush dirty nats in nat entry set */ 2061 /* flush dirty nats in nat entry set */
@@ -2000,6 +2088,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
2000 nm_i->nat_cnt = 0; 2088 nm_i->nat_cnt = 0;
2001 nm_i->ram_thresh = DEF_RAM_THRESHOLD; 2089 nm_i->ram_thresh = DEF_RAM_THRESHOLD;
2002 nm_i->ra_nid_pages = DEF_RA_NID_PAGES; 2090 nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
2091 nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
2003 2092
2004 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); 2093 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
2005 INIT_LIST_HEAD(&nm_i->free_nid_list); 2094 INIT_LIST_HEAD(&nm_i->free_nid_list);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index d4d1f636fe1c..1f4f9d4569d9 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -25,6 +25,9 @@
25/* control the memory footprint threshold (10MB per 1GB ram) */ 25/* control the memory footprint threshold (10MB per 1GB ram) */
26#define DEF_RAM_THRESHOLD 10 26#define DEF_RAM_THRESHOLD 10
27 27
28/* control dirty nats ratio threshold (default: 10% over max nid count) */
29#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10
30
28/* vector size for gang look-up from nat cache that consists of radix tree */ 31/* vector size for gang look-up from nat cache that consists of radix tree */
29#define NATVEC_SIZE 64 32#define NATVEC_SIZE 64
30#define SETVEC_SIZE 32 33#define SETVEC_SIZE 32
@@ -117,6 +120,12 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
117 raw_ne->version = ni->version; 120 raw_ne->version = ni->version;
118} 121}
119 122
123static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
124{
125 return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid *
126 NM_I(sbi)->dirty_nats_ratio / 100;
127}
128
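To get a feel for the new threshold (max_nid depends on partition size, so the numbers below are made up):

	unsigned int max_nid = 2000000;       /* illustrative only */
	unsigned int dirty_nat_cnt = 250000;  /* illustrative only */
	/* threshold = 2000000 * DEF_DIRTY_NAT_RATIO_THRESHOLD / 100 = 200000,
	 * so 250000 dirty NAT entries count as excess and f2fs_balance_fs_bg()
	 * (see the segment.c hunk below) will trigger a checkpoint */
	int excess = dirty_nat_cnt >= max_nid * DEF_DIRTY_NAT_RATIO_THRESHOLD / 100;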
120enum mem_type { 129enum mem_type {
121 FREE_NIDS, /* indicates the free nid list */ 130 FREE_NIDS, /* indicates the free nid list */
122 NAT_ENTRIES, /* indicates the cached nat entry */ 131 NAT_ENTRIES, /* indicates the cached nat entry */
@@ -321,7 +330,7 @@ static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
321{ 330{
322 struct f2fs_node *rn = F2FS_NODE(p); 331 struct f2fs_node *rn = F2FS_NODE(p);
323 332
324 f2fs_wait_on_page_writeback(p, NODE); 333 f2fs_wait_on_page_writeback(p, NODE, true);
325 334
326 if (i) 335 if (i)
327 rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); 336 rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
@@ -370,6 +379,21 @@ static inline int is_node(struct page *page, int type)
370#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) 379#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
371#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) 380#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
372 381
382static inline int is_inline_node(struct page *page)
383{
384 return PageChecked(page);
385}
386
387static inline void set_inline_node(struct page *page)
388{
389 SetPageChecked(page);
390}
391
392static inline void clear_inline_node(struct page *page)
393{
394 ClearPageChecked(page);
395}
396
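These helpers overload the page's PG_checked bit as an "inode page carries inline data" marker. The consumer is the sync_node_pages() hunk in node.c above; the producer is not part of this diff, so the tagging site below is an assumption:

	/* assumed: tagged wherever inline data is made dirty (not in this diff) */
	set_inline_node(node_page);

	/* consumer, from the sync_node_pages() hunk above: flush the inline
	 * payload before writing the node page itself */
	if (!ino && is_inline_node(page)) {
		clear_inline_node(page);
		unlock_page(page);
		flush_inline_data(sbi, ino_of_node(page));
		goto lock_node;
	}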
373static inline void set_cold_node(struct inode *inode, struct page *page) 397static inline void set_cold_node(struct inode *inode, struct page *page)
374{ 398{
375 struct f2fs_node *rn = F2FS_NODE(page); 399 struct f2fs_node *rn = F2FS_NODE(page);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 589b20b8677b..0b30cd2aeebd 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -350,8 +350,7 @@ got_it:
350 inode = dn->inode; 350 inode = dn->inode;
351 } 351 }
352 352
353 bidx = start_bidx_of_node(offset, F2FS_I(inode)) + 353 bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node);
354 le16_to_cpu(sum.ofs_in_node);
355 354
356 /* 355 /*
357 * if inode page is locked, unlock temporarily, but its reference 356 * if inode page is locked, unlock temporarily, but its reference
@@ -386,10 +385,9 @@ truncate_out:
386static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, 385static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
387 struct page *page, block_t blkaddr) 386 struct page *page, block_t blkaddr)
388{ 387{
389 struct f2fs_inode_info *fi = F2FS_I(inode);
390 unsigned int start, end;
391 struct dnode_of_data dn; 388 struct dnode_of_data dn;
392 struct node_info ni; 389 struct node_info ni;
390 unsigned int start, end;
393 int err = 0, recovered = 0; 391 int err = 0, recovered = 0;
394 392
395 /* step 1: recover xattr */ 393 /* step 1: recover xattr */
@@ -409,8 +407,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
409 goto out; 407 goto out;
410 408
411 /* step 3: recover data indices */ 409 /* step 3: recover data indices */
412 start = start_bidx_of_node(ofs_of_node(page), fi); 410 start = start_bidx_of_node(ofs_of_node(page), inode);
413 end = start + ADDRS_PER_PAGE(page, fi); 411 end = start + ADDRS_PER_PAGE(page, inode);
414 412
415 set_new_dnode(&dn, inode, NULL, NULL, 0); 413 set_new_dnode(&dn, inode, NULL, NULL, 0);
416 414
@@ -418,7 +416,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
418 if (err) 416 if (err)
419 goto out; 417 goto out;
420 418
421 f2fs_wait_on_page_writeback(dn.node_page, NODE); 419 f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
422 420
423 get_node_info(sbi, dn.nid, &ni); 421 get_node_info(sbi, dn.nid, &ni);
424 f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); 422 f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
@@ -467,7 +465,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
467 465
468 /* write dummy data page */ 466 /* write dummy data page */
469 f2fs_replace_block(sbi, &dn, src, dest, 467 f2fs_replace_block(sbi, &dn, src, dest,
470 ni.version, false); 468 ni.version, false, false);
471 recovered++; 469 recovered++;
472 } 470 }
473 } 471 }
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 5904a411c86f..6f16b39f0b52 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -191,70 +191,145 @@ void register_inmem_page(struct inode *inode, struct page *page)
191 trace_f2fs_register_inmem_page(page, INMEM); 191 trace_f2fs_register_inmem_page(page, INMEM);
192} 192}
193 193
194int commit_inmem_pages(struct inode *inode, bool abort) 194static int __revoke_inmem_pages(struct inode *inode,
195 struct list_head *head, bool drop, bool recover)
196{
197 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
198 struct inmem_pages *cur, *tmp;
199 int err = 0;
200
201 list_for_each_entry_safe(cur, tmp, head, list) {
202 struct page *page = cur->page;
203
204 if (drop)
205 trace_f2fs_commit_inmem_page(page, INMEM_DROP);
206
207 lock_page(page);
208
209 if (recover) {
210 struct dnode_of_data dn;
211 struct node_info ni;
212
213 trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
214
215 set_new_dnode(&dn, inode, NULL, NULL, 0);
216 if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
217 err = -EAGAIN;
218 goto next;
219 }
220 get_node_info(sbi, dn.nid, &ni);
221 f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
222 cur->old_addr, ni.version, true, true);
223 f2fs_put_dnode(&dn);
224 }
225next:
226 ClearPageUptodate(page);
227 set_page_private(page, 0);
228 ClearPagePrivate(page);
229 f2fs_put_page(page, 1);
230
231 list_del(&cur->list);
232 kmem_cache_free(inmem_entry_slab, cur);
233 dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
234 }
235 return err;
236}
237
238void drop_inmem_pages(struct inode *inode)
239{
240 struct f2fs_inode_info *fi = F2FS_I(inode);
241
242 mutex_lock(&fi->inmem_lock);
243 __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
244 mutex_unlock(&fi->inmem_lock);
245}
246
247static int __commit_inmem_pages(struct inode *inode,
248 struct list_head *revoke_list)
195{ 249{
196 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 250 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
197 struct f2fs_inode_info *fi = F2FS_I(inode); 251 struct f2fs_inode_info *fi = F2FS_I(inode);
198 struct inmem_pages *cur, *tmp; 252 struct inmem_pages *cur, *tmp;
199 bool submit_bio = false;
200 struct f2fs_io_info fio = { 253 struct f2fs_io_info fio = {
201 .sbi = sbi, 254 .sbi = sbi,
202 .type = DATA, 255 .type = DATA,
203 .rw = WRITE_SYNC | REQ_PRIO, 256 .rw = WRITE_SYNC | REQ_PRIO,
204 .encrypted_page = NULL, 257 .encrypted_page = NULL,
205 }; 258 };
259 bool submit_bio = false;
206 int err = 0; 260 int err = 0;
207 261
208 /*
209 * The abort is true only when f2fs_evict_inode is called.
210 * Basically, the f2fs_evict_inode doesn't produce any data writes, so
211 * that we don't need to call f2fs_balance_fs.
212 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
213 * inode becomes free by iget_locked in f2fs_iget.
214 */
215 if (!abort) {
216 f2fs_balance_fs(sbi, true);
217 f2fs_lock_op(sbi);
218 }
219
220 mutex_lock(&fi->inmem_lock);
221 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { 262 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
222 lock_page(cur->page); 263 struct page *page = cur->page;
223 if (!abort) { 264
224 if (cur->page->mapping == inode->i_mapping) { 265 lock_page(page);
225 set_page_dirty(cur->page); 266 if (page->mapping == inode->i_mapping) {
226 f2fs_wait_on_page_writeback(cur->page, DATA); 267 trace_f2fs_commit_inmem_page(page, INMEM);
227 if (clear_page_dirty_for_io(cur->page)) 268
228 inode_dec_dirty_pages(inode); 269 set_page_dirty(page);
229 trace_f2fs_commit_inmem_page(cur->page, INMEM); 270 f2fs_wait_on_page_writeback(page, DATA, true);
230 fio.page = cur->page; 271 if (clear_page_dirty_for_io(page))
231 err = do_write_data_page(&fio); 272 inode_dec_dirty_pages(inode);
232 if (err) { 273
233 unlock_page(cur->page); 274 fio.page = page;
234 break; 275 err = do_write_data_page(&fio);
235 } 276 if (err) {
236 clear_cold_data(cur->page); 277 unlock_page(page);
237 submit_bio = true; 278 break;
238 } 279 }
239 } else { 280
240 ClearPageUptodate(cur->page); 281 /* record old blkaddr for revoking */
241 trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); 282 cur->old_addr = fio.old_blkaddr;
283
284 clear_cold_data(page);
285 submit_bio = true;
242 } 286 }
243 set_page_private(cur->page, 0); 287 unlock_page(page);
244 ClearPagePrivate(cur->page); 288 list_move_tail(&cur->list, revoke_list);
245 f2fs_put_page(cur->page, 1); 289 }
246 290
247 list_del(&cur->list); 291 if (submit_bio)
248 kmem_cache_free(inmem_entry_slab, cur); 292 f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
249 dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); 293
294 if (!err)
295 __revoke_inmem_pages(inode, revoke_list, false, false);
296
297 return err;
298}
299
300int commit_inmem_pages(struct inode *inode)
301{
302 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
303 struct f2fs_inode_info *fi = F2FS_I(inode);
304 struct list_head revoke_list;
305 int err;
306
307 INIT_LIST_HEAD(&revoke_list);
308 f2fs_balance_fs(sbi, true);
309 f2fs_lock_op(sbi);
310
311 mutex_lock(&fi->inmem_lock);
312 err = __commit_inmem_pages(inode, &revoke_list);
313 if (err) {
314 int ret;
315 /*
316 * try to revoke all committed pages, but we could still fail
317 * due to lack of memory or some other reason; if that happens,
318 * -EAGAIN is returned, meaning the transaction is no longer
319 * intact and the caller should use its journal to recover, or
320 * rewrite and re-commit the last transaction. For any other
321 * error, the filesystem has already revoked the pages itself.
322 */
323 ret = __revoke_inmem_pages(inode, &revoke_list, false, true);
324 if (ret)
325 err = ret;
326
327 /* drop all uncommitted pages */
328 __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
250 } 329 }
251 mutex_unlock(&fi->inmem_lock); 330 mutex_unlock(&fi->inmem_lock);
252 331
253 if (!abort) { 332 f2fs_unlock_op(sbi);
254 f2fs_unlock_op(sbi);
255 if (submit_bio)
256 f2fs_submit_merged_bio(sbi, DATA, WRITE);
257 }
258 return err; 333 return err;
259} 334}
260 335
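From the caller's perspective, presumably the atomic-commit ioctl path (not shown in this diff), the new contract is:

	err = commit_inmem_pages(inode);
	if (err == -EAGAIN) {
		/* some already-written blocks could not be revoked: the
		 * transaction is torn, and the application must recover via
		 * its own journal or rewrite and re-commit the transaction */
	} else if (err) {
		/* a write failed but every committed page was revoked, so
		 * on-disk state is as if the commit never happened */
	}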
@@ -291,11 +366,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
291 366
292 /* checkpoint is the only way to shrink partial cached entries */ 367 /* checkpoint is the only way to shrink partial cached entries */
293 if (!available_free_memory(sbi, NAT_ENTRIES) || 368 if (!available_free_memory(sbi, NAT_ENTRIES) ||
294 excess_prefree_segs(sbi) ||
295 !available_free_memory(sbi, INO_ENTRIES) || 369 !available_free_memory(sbi, INO_ENTRIES) ||
370 excess_prefree_segs(sbi) ||
371 excess_dirty_nats(sbi) ||
296 (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { 372 (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) {
297 if (test_opt(sbi, DATA_FLUSH)) 373 if (test_opt(sbi, DATA_FLUSH)) {
374 struct blk_plug plug;
375
376 blk_start_plug(&plug);
298 sync_dirty_inodes(sbi, FILE_INODE); 377 sync_dirty_inodes(sbi, FILE_INODE);
378 blk_finish_plug(&plug);
379 }
299 f2fs_sync_fs(sbi->sb, true); 380 f2fs_sync_fs(sbi->sb, true);
300 stat_inc_bg_cp_count(sbi->stat_info); 381 stat_inc_bg_cp_count(sbi->stat_info);
301 } 382 }
@@ -502,7 +583,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
502 583
503bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) 584bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
504{ 585{
505 int err = -ENOTSUPP; 586 int err = -EOPNOTSUPP;
506 587
507 if (test_opt(sbi, DISCARD)) { 588 if (test_opt(sbi, DISCARD)) {
508 struct seg_entry *se = get_seg_entry(sbi, 589 struct seg_entry *se = get_seg_entry(sbi,
@@ -841,6 +922,31 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
841 update_meta_page(sbi, (void *)sum_blk, blk_addr); 922 update_meta_page(sbi, (void *)sum_blk, blk_addr);
842} 923}
843 924
925static void write_current_sum_page(struct f2fs_sb_info *sbi,
926 int type, block_t blk_addr)
927{
928 struct curseg_info *curseg = CURSEG_I(sbi, type);
929 struct page *page = grab_meta_page(sbi, blk_addr);
930 struct f2fs_summary_block *src = curseg->sum_blk;
931 struct f2fs_summary_block *dst;
932
933 dst = (struct f2fs_summary_block *)page_address(page);
934
935 mutex_lock(&curseg->curseg_mutex);
936
937 down_read(&curseg->journal_rwsem);
938 memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
939 up_read(&curseg->journal_rwsem);
940
941 memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
942 memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
943
944 mutex_unlock(&curseg->curseg_mutex);
945
946 set_page_dirty(page);
947 f2fs_put_page(page, 1);
948}
949
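write_current_sum_page() also fixes the lock nesting for the journal split. The f2fs.h side of the split is not in this diff; its assumed shape, inferred from the accessors used throughout these hunks:

	struct curseg_info {
		struct mutex curseg_mutex;          /* guards entries and footer */
		struct f2fs_summary_block *sum_blk; /* summary entries */
		struct rw_semaphore journal_rwsem;  /* guards *journal */
		struct f2fs_journal *journal;       /* NAT/SIT journal, split out */
		/* remaining fields unchanged */
	};

The nesting exercised above is curseg_mutex (outer) then journal_rwsem (inner); paths that touch only the journal, such as get_node_info() and build_free_nids(), take journal_rwsem alone.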
844static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) 950static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
845{ 951{
846 struct curseg_info *curseg = CURSEG_I(sbi, type); 952 struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -873,9 +979,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
873 979
874 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { 980 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
875 segno = find_next_zero_bit(free_i->free_segmap, 981 segno = find_next_zero_bit(free_i->free_segmap,
876 MAIN_SEGS(sbi), *newseg + 1); 982 (hint + 1) * sbi->segs_per_sec, *newseg + 1);
877 if (segno - *newseg < sbi->segs_per_sec - 983 if (segno < (hint + 1) * sbi->segs_per_sec)
878 (*newseg % sbi->segs_per_sec))
879 goto got_it; 984 goto got_it;
880 } 985 }
881find_other_zone: 986find_other_zone:
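The rewritten bound states the "stay inside the current section" intent directly instead of checking the distance after the fact. A tiny self-contained illustration (the geometry is made up, and hint is taken to be the section of *newseg, matching the surrounding code not shown here):

	#include <stdio.h>

	int main(void)
	{
		unsigned int segs_per_sec = 4;              /* illustrative */
		unsigned int newseg = 9;                    /* current segment */
		unsigned int hint = newseg / segs_per_sec;  /* its section: 2 */

		/* new search window: (*newseg + 1) up to the end of the section */
		printf("scan free_segmap in [%u, %u)\n",
		       newseg + 1, (hint + 1) * segs_per_sec);   /* [10, 12) */
		return 0;
	}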
@@ -1280,8 +1385,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
1280{ 1385{
1281 int type = __get_segment_type(fio->page, fio->type); 1386 int type = __get_segment_type(fio->page, fio->type);
1282 1387
1283 allocate_data_block(fio->sbi, fio->page, fio->blk_addr, 1388 allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
1284 &fio->blk_addr, sum, type); 1389 &fio->new_blkaddr, sum, type);
1285 1390
1286 /* writeout dirty page into bdev */ 1391 /* writeout dirty page into bdev */
1287 f2fs_submit_page_mbio(fio); 1392 f2fs_submit_page_mbio(fio);
@@ -1293,7 +1398,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
1293 .sbi = sbi, 1398 .sbi = sbi,
1294 .type = META, 1399 .type = META,
1295 .rw = WRITE_SYNC | REQ_META | REQ_PRIO, 1400 .rw = WRITE_SYNC | REQ_META | REQ_PRIO,
1296 .blk_addr = page->index, 1401 .old_blkaddr = page->index,
1402 .new_blkaddr = page->index,
1297 .page = page, 1403 .page = page,
1298 .encrypted_page = NULL, 1404 .encrypted_page = NULL,
1299 }; 1405 };
@@ -1323,19 +1429,19 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
1323 get_node_info(sbi, dn->nid, &ni); 1429 get_node_info(sbi, dn->nid, &ni);
1324 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 1430 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
1325 do_write_page(&sum, fio); 1431 do_write_page(&sum, fio);
1326 dn->data_blkaddr = fio->blk_addr; 1432 f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
1327} 1433}
1328 1434
1329void rewrite_data_page(struct f2fs_io_info *fio) 1435void rewrite_data_page(struct f2fs_io_info *fio)
1330{ 1436{
1437 fio->new_blkaddr = fio->old_blkaddr;
1331 stat_inc_inplace_blocks(fio->sbi); 1438 stat_inc_inplace_blocks(fio->sbi);
1332 f2fs_submit_page_mbio(fio); 1439 f2fs_submit_page_mbio(fio);
1333} 1440}
1334 1441
1335static void __f2fs_replace_block(struct f2fs_sb_info *sbi, 1442void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
1336 struct f2fs_summary *sum,
1337 block_t old_blkaddr, block_t new_blkaddr, 1443 block_t old_blkaddr, block_t new_blkaddr,
1338 bool recover_curseg) 1444 bool recover_curseg, bool recover_newaddr)
1339{ 1445{
1340 struct sit_info *sit_i = SIT_I(sbi); 1446 struct sit_info *sit_i = SIT_I(sbi);
1341 struct curseg_info *curseg; 1447 struct curseg_info *curseg;
@@ -1378,7 +1484,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
1378 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); 1484 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
1379 __add_sum_entry(sbi, type, sum); 1485 __add_sum_entry(sbi, type, sum);
1380 1486
1381 if (!recover_curseg) 1487 if (!recover_curseg || recover_newaddr)
1382 update_sit_entry(sbi, new_blkaddr, 1); 1488 update_sit_entry(sbi, new_blkaddr, 1);
1383 if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) 1489 if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
1384 update_sit_entry(sbi, old_blkaddr, -1); 1490 update_sit_entry(sbi, old_blkaddr, -1);
@@ -1402,66 +1508,30 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
1402 1508
1403void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, 1509void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
1404 block_t old_addr, block_t new_addr, 1510 block_t old_addr, block_t new_addr,
1405 unsigned char version, bool recover_curseg) 1511 unsigned char version, bool recover_curseg,
1512 bool recover_newaddr)
1406{ 1513{
1407 struct f2fs_summary sum; 1514 struct f2fs_summary sum;
1408 1515
1409 set_summary(&sum, dn->nid, dn->ofs_in_node, version); 1516 set_summary(&sum, dn->nid, dn->ofs_in_node, version);
1410 1517
1411 __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg); 1518 __f2fs_replace_block(sbi, &sum, old_addr, new_addr,
1519 recover_curseg, recover_newaddr);
1412 1520
1413 dn->data_blkaddr = new_addr; 1521 f2fs_update_data_blkaddr(dn, new_addr);
1414 set_data_blkaddr(dn);
1415 f2fs_update_extent_cache(dn);
1416}
1417
1418static inline bool is_merged_page(struct f2fs_sb_info *sbi,
1419 struct page *page, enum page_type type)
1420{
1421 enum page_type btype = PAGE_TYPE_OF_BIO(type);
1422 struct f2fs_bio_info *io = &sbi->write_io[btype];
1423 struct bio_vec *bvec;
1424 struct page *target;
1425 int i;
1426
1427 down_read(&io->io_rwsem);
1428 if (!io->bio) {
1429 up_read(&io->io_rwsem);
1430 return false;
1431 }
1432
1433 bio_for_each_segment_all(bvec, io->bio, i) {
1434
1435 if (bvec->bv_page->mapping) {
1436 target = bvec->bv_page;
1437 } else {
1438 struct f2fs_crypto_ctx *ctx;
1439
1440 /* encrypted page */
1441 ctx = (struct f2fs_crypto_ctx *)page_private(
1442 bvec->bv_page);
1443 target = ctx->w.control_page;
1444 }
1445
1446 if (page == target) {
1447 up_read(&io->io_rwsem);
1448 return true;
1449 }
1450 }
1451
1452 up_read(&io->io_rwsem);
1453 return false;
1454} 1522}
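
The widened condition above boils down to one predicate: the replacement target gets a "+1" in the SIT unless this is a curseg-only recovery that skips revalidating the new address. A userspace sketch of just that predicate (the helper name is invented, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the __f2fs_replace_block() accounting decision. */
static bool account_new_block(bool recover_curseg, bool recover_newaddr)
{
	return !recover_curseg || recover_newaddr;
}

int main(void)
{
	for (int c = 0; c <= 1; c++)
		for (int n = 0; n <= 1; n++)
			printf("recover_curseg=%d recover_newaddr=%d -> +1: %d\n",
			       c, n, account_new_block(c, n));
	return 0;
}
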
1455 1523
1456void f2fs_wait_on_page_writeback(struct page *page, 1524void f2fs_wait_on_page_writeback(struct page *page,
1457 enum page_type type) 1525 enum page_type type, bool ordered)
1458{ 1526{
1459 if (PageWriteback(page)) { 1527 if (PageWriteback(page)) {
1460 struct f2fs_sb_info *sbi = F2FS_P_SB(page); 1528 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1461 1529
1462 if (is_merged_page(sbi, page, type)) 1530 f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE);
1463 f2fs_submit_merged_bio(sbi, type, WRITE); 1531 if (ordered)
1464 wait_on_page_writeback(page); 1532 wait_on_page_writeback(page);
1533 else
1534 wait_for_stable_page(page);
1465 } 1535 }
1466} 1536}
1467 1537
@@ -1477,7 +1547,7 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
1477 1547
1478 cpage = find_lock_page(META_MAPPING(sbi), blkaddr); 1548 cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
1479 if (cpage) { 1549 if (cpage) {
1480 f2fs_wait_on_page_writeback(cpage, DATA); 1550 f2fs_wait_on_page_writeback(cpage, DATA, true);
1481 f2fs_put_page(cpage, 1); 1551 f2fs_put_page(cpage, 1);
1482 } 1552 }
1483} 1553}
@@ -1498,12 +1568,11 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
1498 1568
1499 /* Step 1: restore nat cache */ 1569 /* Step 1: restore nat cache */
1500 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); 1570 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
1501 memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); 1571 memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE);
1502 1572
1503 /* Step 2: restore sit cache */ 1573 /* Step 2: restore sit cache */
1504 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); 1574 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
1505 memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, 1575 memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE);
1506 SUM_JOURNAL_SIZE);
1507 offset = 2 * SUM_JOURNAL_SIZE; 1576 offset = 2 * SUM_JOURNAL_SIZE;
1508 1577
1509 /* Step 3: restore summary entries */ 1578 /* Step 3: restore summary entries */
@@ -1599,7 +1668,14 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1599 /* set uncompleted segment to curseg */ 1668 /* set uncompleted segment to curseg */
1600 curseg = CURSEG_I(sbi, type); 1669 curseg = CURSEG_I(sbi, type);
1601 mutex_lock(&curseg->curseg_mutex); 1670 mutex_lock(&curseg->curseg_mutex);
1602 memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); 1671
1672 /* update journal info */
1673 down_write(&curseg->journal_rwsem);
1674 memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
1675 up_write(&curseg->journal_rwsem);
1676
1677 memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
1678 memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
1603 curseg->next_segno = segno; 1679 curseg->next_segno = segno;
1604 reset_curseg(sbi, type, 0); 1680 reset_curseg(sbi, type, 0);
1605 curseg->alloc_type = ckpt->alloc_type[type]; 1681 curseg->alloc_type = ckpt->alloc_type[type];
@@ -1654,13 +1730,12 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
1654 1730
1655 /* Step 1: write nat cache */ 1731 /* Step 1: write nat cache */
1656 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); 1732 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
1657 memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); 1733 memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE);
1658 written_size += SUM_JOURNAL_SIZE; 1734 written_size += SUM_JOURNAL_SIZE;
1659 1735
1660 /* Step 2: write sit cache */ 1736 /* Step 2: write sit cache */
1661 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); 1737 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
1662 memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, 1738 memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE);
1663 SUM_JOURNAL_SIZE);
1664 written_size += SUM_JOURNAL_SIZE; 1739 written_size += SUM_JOURNAL_SIZE;
1665 1740
1666 /* Step 3: write summary entries */ 1741 /* Step 3: write summary entries */
@@ -1706,12 +1781,8 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi,
1706 else 1781 else
1707 end = type + NR_CURSEG_NODE_TYPE; 1782 end = type + NR_CURSEG_NODE_TYPE;
1708 1783
1709 for (i = type; i < end; i++) { 1784 for (i = type; i < end; i++)
1710 struct curseg_info *sum = CURSEG_I(sbi, i); 1785 write_current_sum_page(sbi, i, blkaddr + (i - type));
1711 mutex_lock(&sum->curseg_mutex);
1712 write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type));
1713 mutex_unlock(&sum->curseg_mutex);
1714 }
1715} 1786}
1716 1787
1717void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) 1788void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
@@ -1727,24 +1798,24 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1727 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); 1798 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
1728} 1799}
1729 1800
1730int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, 1801int lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
1731 unsigned int val, int alloc) 1802 unsigned int val, int alloc)
1732{ 1803{
1733 int i; 1804 int i;
1734 1805
1735 if (type == NAT_JOURNAL) { 1806 if (type == NAT_JOURNAL) {
1736 for (i = 0; i < nats_in_cursum(sum); i++) { 1807 for (i = 0; i < nats_in_cursum(journal); i++) {
1737 if (le32_to_cpu(nid_in_journal(sum, i)) == val) 1808 if (le32_to_cpu(nid_in_journal(journal, i)) == val)
1738 return i; 1809 return i;
1739 } 1810 }
1740 if (alloc && __has_cursum_space(sum, 1, NAT_JOURNAL)) 1811 if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL))
1741 return update_nats_in_cursum(sum, 1); 1812 return update_nats_in_cursum(journal, 1);
1742 } else if (type == SIT_JOURNAL) { 1813 } else if (type == SIT_JOURNAL) {
1743 for (i = 0; i < sits_in_cursum(sum); i++) 1814 for (i = 0; i < sits_in_cursum(journal); i++)
1744 if (le32_to_cpu(segno_in_journal(sum, i)) == val) 1815 if (le32_to_cpu(segno_in_journal(journal, i)) == val)
1745 return i; 1816 return i;
1746 if (alloc && __has_cursum_space(sum, 1, SIT_JOURNAL)) 1817 if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL))
1747 return update_sits_in_cursum(sum, 1); 1818 return update_sits_in_cursum(journal, 1);
1748 } 1819 }
1749 return -1; 1820 return -1;
1750} 1821}
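
The lookup helper now takes the dedicated f2fs_journal rather than the whole summary block, but its shape is unchanged: scan for the key, optionally allocate a slot, or signal the caller to fall back to on-disk blocks. A self-contained userspace sketch of that lookup-or-allocate shape (capacity and field names invented, not the on-disk format):

#include <stdio.h>

#define JOURNAL_CAP 8

struct journal {
	unsigned int keys[JOURNAL_CAP];
	int n_entries;
};

static int lookup_or_alloc(struct journal *j, unsigned int key, int alloc)
{
	for (int i = 0; i < j->n_entries; i++)
		if (j->keys[i] == key)
			return i;
	if (alloc && j->n_entries < JOURNAL_CAP) {
		j->keys[j->n_entries] = key;
		return j->n_entries++;
	}
	return -1;	/* no space: caller falls back to SIT/NAT blocks */
}

int main(void)
{
	struct journal j = { .n_entries = 0 };

	printf("miss, no alloc: %d\n", lookup_or_alloc(&j, 42, 0));
	printf("alloc:          %d\n", lookup_or_alloc(&j, 42, 1));
	printf("hit:            %d\n", lookup_or_alloc(&j, 42, 0));
	return 0;
}
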
@@ -1848,20 +1919,22 @@ static void add_sits_in_set(struct f2fs_sb_info *sbi)
1848static void remove_sits_in_journal(struct f2fs_sb_info *sbi) 1919static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
1849{ 1920{
1850 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1921 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1851 struct f2fs_summary_block *sum = curseg->sum_blk; 1922 struct f2fs_journal *journal = curseg->journal;
1852 int i; 1923 int i;
1853 1924
1854 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { 1925 down_write(&curseg->journal_rwsem);
1926 for (i = 0; i < sits_in_cursum(journal); i++) {
1855 unsigned int segno; 1927 unsigned int segno;
1856 bool dirtied; 1928 bool dirtied;
1857 1929
1858 segno = le32_to_cpu(segno_in_journal(sum, i)); 1930 segno = le32_to_cpu(segno_in_journal(journal, i));
1859 dirtied = __mark_sit_entry_dirty(sbi, segno); 1931 dirtied = __mark_sit_entry_dirty(sbi, segno);
1860 1932
1861 if (!dirtied) 1933 if (!dirtied)
1862 add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); 1934 add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
1863 } 1935 }
1864 update_sits_in_cursum(sum, -sits_in_cursum(sum)); 1936 update_sits_in_cursum(journal, -i);
1937 up_write(&curseg->journal_rwsem);
1865} 1938}
1866 1939
1867/* 1940/*
@@ -1873,13 +1946,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1873 struct sit_info *sit_i = SIT_I(sbi); 1946 struct sit_info *sit_i = SIT_I(sbi);
1874 unsigned long *bitmap = sit_i->dirty_sentries_bitmap; 1947 unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
1875 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1948 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1876 struct f2fs_summary_block *sum = curseg->sum_blk; 1949 struct f2fs_journal *journal = curseg->journal;
1877 struct sit_entry_set *ses, *tmp; 1950 struct sit_entry_set *ses, *tmp;
1878 struct list_head *head = &SM_I(sbi)->sit_entry_set; 1951 struct list_head *head = &SM_I(sbi)->sit_entry_set;
1879 bool to_journal = true; 1952 bool to_journal = true;
1880 struct seg_entry *se; 1953 struct seg_entry *se;
1881 1954
1882 mutex_lock(&curseg->curseg_mutex);
1883 mutex_lock(&sit_i->sentry_lock); 1955 mutex_lock(&sit_i->sentry_lock);
1884 1956
1885 if (!sit_i->dirty_sentries) 1957 if (!sit_i->dirty_sentries)
@@ -1896,7 +1968,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1896 * entries, remove all entries from journal and add and account 1968 * entries, remove all entries from journal and add and account
1897 * them in sit entry set. 1969 * them in sit entry set.
1898 */ 1970 */
1899 if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) 1971 if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL))
1900 remove_sits_in_journal(sbi); 1972 remove_sits_in_journal(sbi);
1901 1973
1902 /* 1974 /*
@@ -1913,10 +1985,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1913 unsigned int segno = start_segno; 1985 unsigned int segno = start_segno;
1914 1986
1915 if (to_journal && 1987 if (to_journal &&
1916 !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL)) 1988 !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL))
1917 to_journal = false; 1989 to_journal = false;
1918 1990
1919 if (!to_journal) { 1991 if (to_journal) {
1992 down_write(&curseg->journal_rwsem);
1993 } else {
1920 page = get_next_sit_page(sbi, start_segno); 1994 page = get_next_sit_page(sbi, start_segno);
1921 raw_sit = page_address(page); 1995 raw_sit = page_address(page);
1922 } 1996 }
@@ -1934,13 +2008,13 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1934 } 2008 }
1935 2009
1936 if (to_journal) { 2010 if (to_journal) {
1937 offset = lookup_journal_in_cursum(sum, 2011 offset = lookup_journal_in_cursum(journal,
1938 SIT_JOURNAL, segno, 1); 2012 SIT_JOURNAL, segno, 1);
1939 f2fs_bug_on(sbi, offset < 0); 2013 f2fs_bug_on(sbi, offset < 0);
1940 segno_in_journal(sum, offset) = 2014 segno_in_journal(journal, offset) =
1941 cpu_to_le32(segno); 2015 cpu_to_le32(segno);
1942 seg_info_to_raw_sit(se, 2016 seg_info_to_raw_sit(se,
1943 &sit_in_journal(sum, offset)); 2017 &sit_in_journal(journal, offset));
1944 } else { 2018 } else {
1945 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); 2019 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1946 seg_info_to_raw_sit(se, 2020 seg_info_to_raw_sit(se,
@@ -1952,7 +2026,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1952 ses->entry_cnt--; 2026 ses->entry_cnt--;
1953 } 2027 }
1954 2028
1955 if (!to_journal) 2029 if (to_journal)
2030 up_write(&curseg->journal_rwsem);
2031 else
1956 f2fs_put_page(page, 1); 2032 f2fs_put_page(page, 1);
1957 2033
1958 f2fs_bug_on(sbi, ses->entry_cnt); 2034 f2fs_bug_on(sbi, ses->entry_cnt);
@@ -1967,7 +2043,6 @@ out:
1967 add_discard_addrs(sbi, cpc); 2043 add_discard_addrs(sbi, cpc);
1968 } 2044 }
1969 mutex_unlock(&sit_i->sentry_lock); 2045 mutex_unlock(&sit_i->sentry_lock);
1970 mutex_unlock(&curseg->curseg_mutex);
1971 2046
1972 set_prefree_as_free_segments(sbi); 2047 set_prefree_as_free_segments(sbi);
1973} 2048}
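
flush_sit_entries() keeps journaling dirty SIT entries until a set no longer fits, then switches to raw SIT pages for the remainder of the flush. A toy model of that decision (set sizes and journal capacity invented for the demo):

#include <stdbool.h>
#include <stdio.h>

#define JOURNAL_ROOM 6

int main(void)
{
	int set_sizes[] = { 2, 3, 4 };	/* dirty entries per sit_entry_set */
	int used = 0;
	bool to_journal = true;

	for (int i = 0; i < 3; i++) {
		if (to_journal && used + set_sizes[i] > JOURNAL_ROOM)
			to_journal = false;	/* out of room: use SIT pages */
		if (to_journal)
			used += set_sizes[i];
		printf("set %d (%d entries) -> %s\n", i, set_sizes[i],
		       to_journal ? "journal" : "raw SIT page");
	}
	return 0;
}
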
@@ -2099,6 +2174,11 @@ static int build_curseg(struct f2fs_sb_info *sbi)
2099 array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); 2174 array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
2100 if (!array[i].sum_blk) 2175 if (!array[i].sum_blk)
2101 return -ENOMEM; 2176 return -ENOMEM;
2177 init_rwsem(&array[i].journal_rwsem);
2178 array[i].journal = kzalloc(sizeof(struct f2fs_journal),
2179 GFP_KERNEL);
2180 if (!array[i].journal)
2181 return -ENOMEM;
2102 array[i].segno = NULL_SEGNO; 2182 array[i].segno = NULL_SEGNO;
2103 array[i].next_blkoff = 0; 2183 array[i].next_blkoff = 0;
2104 } 2184 }
@@ -2109,11 +2189,11 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
2109{ 2189{
2110 struct sit_info *sit_i = SIT_I(sbi); 2190 struct sit_info *sit_i = SIT_I(sbi);
2111 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 2191 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
2112 struct f2fs_summary_block *sum = curseg->sum_blk; 2192 struct f2fs_journal *journal = curseg->journal;
2113 int sit_blk_cnt = SIT_BLK_CNT(sbi); 2193 int sit_blk_cnt = SIT_BLK_CNT(sbi);
2114 unsigned int i, start, end; 2194 unsigned int i, start, end;
2115 unsigned int readed, start_blk = 0; 2195 unsigned int readed, start_blk = 0;
2116 int nrpages = MAX_BIO_BLOCKS(sbi); 2196 int nrpages = MAX_BIO_BLOCKS(sbi) * 8;
2117 2197
2118 do { 2198 do {
2119 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); 2199 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
@@ -2127,16 +2207,16 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
2127 struct f2fs_sit_entry sit; 2207 struct f2fs_sit_entry sit;
2128 struct page *page; 2208 struct page *page;
2129 2209
2130 mutex_lock(&curseg->curseg_mutex); 2210 down_read(&curseg->journal_rwsem);
2131 for (i = 0; i < sits_in_cursum(sum); i++) { 2211 for (i = 0; i < sits_in_cursum(journal); i++) {
2132 if (le32_to_cpu(segno_in_journal(sum, i)) 2212 if (le32_to_cpu(segno_in_journal(journal, i))
2133 == start) { 2213 == start) {
2134 sit = sit_in_journal(sum, i); 2214 sit = sit_in_journal(journal, i);
2135 mutex_unlock(&curseg->curseg_mutex); 2215 up_read(&curseg->journal_rwsem);
2136 goto got_it; 2216 goto got_it;
2137 } 2217 }
2138 } 2218 }
2139 mutex_unlock(&curseg->curseg_mutex); 2219 up_read(&curseg->journal_rwsem);
2140 2220
2141 page = get_current_sit_page(sbi, start); 2221 page = get_current_sit_page(sbi, start);
2142 sit_blk = (struct f2fs_sit_block *)page_address(page); 2222 sit_blk = (struct f2fs_sit_block *)page_address(page);
@@ -2371,8 +2451,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi)
2371 if (!array) 2451 if (!array)
2372 return; 2452 return;
2373 SM_I(sbi)->curseg_array = NULL; 2453 SM_I(sbi)->curseg_array = NULL;
2374 for (i = 0; i < NR_CURSEG_TYPE; i++) 2454 for (i = 0; i < NR_CURSEG_TYPE; i++) {
2375 kfree(array[i].sum_blk); 2455 kfree(array[i].sum_blk);
2456 kfree(array[i].journal);
2457 }
2376 kfree(array); 2458 kfree(array);
2377} 2459}
2378 2460
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index ee44d346ea44..975c33df65c7 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -183,7 +183,7 @@ struct segment_allocation {
183 * this value is set in the page as private data, indicating that 183 * this value is set in the page as private data, indicating that
184 * the page is atomically written and is in the inmem_pages list. 184 * the page is atomically written and is in the inmem_pages list.
185 */ 185 */
186#define ATOMIC_WRITTEN_PAGE 0x0000ffff 186#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1)
187 187
188#define IS_ATOMIC_WRITTEN_PAGE(page) \ 188#define IS_ATOMIC_WRITTEN_PAGE(page) \
189 (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) 189 (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
@@ -191,6 +191,7 @@ struct segment_allocation {
191struct inmem_pages { 191struct inmem_pages {
192 struct list_head list; 192 struct list_head list;
193 struct page *page; 193 struct page *page;
194 block_t old_addr; /* for revoking when fail to commit */
194}; 195};
195 196
196struct sit_info { 197struct sit_info {
@@ -257,6 +258,8 @@ struct victim_selection {
257struct curseg_info { 258struct curseg_info {
258 struct mutex curseg_mutex; /* lock for consistency */ 259 struct mutex curseg_mutex; /* lock for consistency */
259 struct f2fs_summary_block *sum_blk; /* cached summary block */ 260 struct f2fs_summary_block *sum_blk; /* cached summary block */
261 struct rw_semaphore journal_rwsem; /* protect journal area */
262 struct f2fs_journal *journal; /* cached journal info */
260 unsigned char alloc_type; /* current allocation type */ 263 unsigned char alloc_type; /* current allocation type */
261 unsigned int segno; /* current segment number */ 264 unsigned int segno; /* current segment number */
262 unsigned short next_blkoff; /* next block offset to write */ 265 unsigned short next_blkoff; /* next block offset to write */
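
With this struct change, the cached journal gets its own rw_semaphore, so journal readers such as build_sit_entries() no longer serialize against writers like remove_sits_in_journal() on curseg_mutex. A userspace sketch of the resulting locking pattern, using pthread_rwlock_t as a stand-in for the kernel rw_semaphore (all names and sizes invented):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t journal_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static unsigned int journal_segnos[8];
static int journal_entries = 1;

static int journal_find(unsigned int segno)
{
	int found = -1;

	pthread_rwlock_rdlock(&journal_rwsem);	/* many concurrent readers */
	for (int i = 0; i < journal_entries; i++)
		if (journal_segnos[i] == segno)
			found = i;
	pthread_rwlock_unlock(&journal_rwsem);
	return found;
}

static void journal_drain(void)
{
	pthread_rwlock_wrlock(&journal_rwsem);	/* exclusive writer */
	/* move entries to the dirty SIT set, then empty the journal */
	journal_entries = 0;
	pthread_rwlock_unlock(&journal_rwsem);
}

int main(void)
{
	journal_segnos[0] = 7;
	printf("segno 7 at index %d\n", journal_find(7));
	journal_drain();
	printf("after drain: %d\n", journal_find(7));
	return 0;
}
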
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 6134832baaaf..15bb81f8dac2 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -126,6 +126,19 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
126 return NULL; 126 return NULL;
127} 127}
128 128
129static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
130 struct f2fs_sb_info *sbi, char *buf)
131{
132 struct super_block *sb = sbi->sb;
133
134 if (!sb->s_bdev->bd_part)
135 return snprintf(buf, PAGE_SIZE, "0\n");
136
137 return snprintf(buf, PAGE_SIZE, "%llu\n",
138 (unsigned long long)(sbi->kbytes_written +
139 BD_PART_WRITTEN(sbi)));
140}
141
129static ssize_t f2fs_sbi_show(struct f2fs_attr *a, 142static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
130 struct f2fs_sb_info *sbi, char *buf) 143 struct f2fs_sb_info *sbi, char *buf)
131{ 144{
@@ -204,6 +217,9 @@ static struct f2fs_attr f2fs_attr_##_name = { \
204 f2fs_sbi_show, f2fs_sbi_store, \ 217 f2fs_sbi_show, f2fs_sbi_store, \
205 offsetof(struct struct_name, elname)) 218 offsetof(struct struct_name, elname))
206 219
220#define F2FS_GENERAL_RO_ATTR(name) \
221static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
222
207F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); 223F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
208F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); 224F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
209F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); 225F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
@@ -216,10 +232,12 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
216F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); 232F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
217F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); 233F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
218F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); 234F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
235F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
219F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); 236F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
220F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); 237F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
221F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); 238F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
222F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); 239F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
240F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
223 241
224#define ATTR_LIST(name) (&f2fs_attr_##name.attr) 242#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
225static struct attribute *f2fs_attrs[] = { 243static struct attribute *f2fs_attrs[] = {
@@ -237,8 +255,10 @@ static struct attribute *f2fs_attrs[] = {
237 ATTR_LIST(dir_level), 255 ATTR_LIST(dir_level),
238 ATTR_LIST(ram_thresh), 256 ATTR_LIST(ram_thresh),
239 ATTR_LIST(ra_nid_pages), 257 ATTR_LIST(ra_nid_pages),
258 ATTR_LIST(dirty_nats_ratio),
240 ATTR_LIST(cp_interval), 259 ATTR_LIST(cp_interval),
241 ATTR_LIST(idle_interval), 260 ATTR_LIST(idle_interval),
261 ATTR_LIST(lifetime_write_kbytes),
242 NULL, 262 NULL,
243}; 263};
244 264
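
lifetime_write_kbytes combines the byte count carried over in the checkpoint with the partition's sector counter delta since mount; the BD_PART_WRITTEN() part is essentially that delta converted from 512-byte sectors to kilobytes (the halving below assumes 512-byte sectors). A worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long kbytes_written = 1024;  /* from hot node summary */
	unsigned long long sectors_start  = 2000;  /* sampled at mount time */
	unsigned long long sectors_now    = 6000;  /* part_stat_read() now */
	unsigned long long lifetime_kb =
		kbytes_written + (sectors_now - sectors_start) / 2;

	printf("%llu\n", lifetime_kb);		/* 1024 + 2000 = 3024 */
	return 0;
}
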
@@ -450,10 +470,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
450 470
451 /* Will be used by directory only */ 471 /* Will be used by directory only */
452 fi->i_dir_level = F2FS_SB(sb)->dir_level; 472 fi->i_dir_level = F2FS_SB(sb)->dir_level;
453
454#ifdef CONFIG_F2FS_FS_ENCRYPTION
455 fi->i_crypt_info = NULL;
456#endif
457 return &fi->vfs_inode; 473 return &fi->vfs_inode;
458} 474}
459 475
@@ -474,7 +490,7 @@ static int f2fs_drop_inode(struct inode *inode)
474 490
475 /* some remaining atomic pages should be discarded */ 491
476 if (f2fs_is_atomic_file(inode)) 492 if (f2fs_is_atomic_file(inode))
477 commit_inmem_pages(inode, true); 493 drop_inmem_pages(inode);
478 494
479 /* should remain fi->extent_tree for writepage */ 495 /* should remain fi->extent_tree for writepage */
480 f2fs_destroy_extent_node(inode); 496 f2fs_destroy_extent_node(inode);
@@ -487,11 +503,7 @@ static int f2fs_drop_inode(struct inode *inode)
487 503
488 sb_end_intwrite(inode->i_sb); 504 sb_end_intwrite(inode->i_sb);
489 505
490#ifdef CONFIG_F2FS_FS_ENCRYPTION 506 fscrypt_put_encryption_info(inode, NULL);
491 if (F2FS_I(inode)->i_crypt_info)
492 f2fs_free_encryption_info(inode,
493 F2FS_I(inode)->i_crypt_info);
494#endif
495 spin_lock(&inode->i_lock); 507 spin_lock(&inode->i_lock);
496 atomic_dec(&inode->i_count); 508 atomic_dec(&inode->i_count);
497 } 509 }
@@ -562,6 +574,10 @@ static void f2fs_put_super(struct super_block *sb)
562 f2fs_leave_shrinker(sbi); 574 f2fs_leave_shrinker(sbi);
563 mutex_unlock(&sbi->umount_mutex); 575 mutex_unlock(&sbi->umount_mutex);
564 576
577 /* in our cp_error case, there may still be writeback pages to flush */
578 if (get_pages(sbi, F2FS_WRITEBACK))
579 f2fs_flush_merged_bios(sbi);
580
565 iput(sbi->node_inode); 581 iput(sbi->node_inode);
566 iput(sbi->meta_inode); 582 iput(sbi->meta_inode);
567 583
@@ -574,6 +590,8 @@ static void f2fs_put_super(struct super_block *sb)
574 wait_for_completion(&sbi->s_kobj_unregister); 590 wait_for_completion(&sbi->s_kobj_unregister);
575 591
576 sb->s_fs_info = NULL; 592 sb->s_fs_info = NULL;
593 if (sbi->s_chksum_driver)
594 crypto_free_shash(sbi->s_chksum_driver);
577 kfree(sbi->raw_super); 595 kfree(sbi->raw_super);
578 kfree(sbi); 596 kfree(sbi);
579} 597}
@@ -766,8 +784,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
766 bool need_stop_gc = false; 784 bool need_stop_gc = false;
767 bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); 785 bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
768 786
769 sync_filesystem(sb);
770
771 /* 787 /*
772 * Save the old mount options in case we 788 * Save the old mount options in case we
773 * need to restore them. 789 * need to restore them.
@@ -775,6 +791,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
775 org_mount_opt = sbi->mount_opt; 791 org_mount_opt = sbi->mount_opt;
776 active_logs = sbi->active_logs; 792 active_logs = sbi->active_logs;
777 793
794 if (*flags & MS_RDONLY) {
795 set_opt(sbi, FASTBOOT);
796 set_sbi_flag(sbi, SBI_IS_DIRTY);
797 }
798
799 sync_filesystem(sb);
800
778 sbi->mount_opt.opt = 0; 801 sbi->mount_opt.opt = 0;
779 default_options(sbi); 802 default_options(sbi);
780 803
@@ -862,6 +885,41 @@ static struct super_operations f2fs_sops = {
862 .remount_fs = f2fs_remount, 885 .remount_fs = f2fs_remount,
863}; 886};
864 887
888#ifdef CONFIG_F2FS_FS_ENCRYPTION
889static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
890{
891 return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
892 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
893 ctx, len, NULL);
894}
895
896static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
897 void *fs_data)
898{
899 return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
900 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
901 ctx, len, fs_data, XATTR_CREATE);
902}
903
904static unsigned f2fs_max_namelen(struct inode *inode)
905{
906 return S_ISLNK(inode->i_mode) ?
907 inode->i_sb->s_blocksize : F2FS_NAME_LEN;
908}
909
910static struct fscrypt_operations f2fs_cryptops = {
911 .get_context = f2fs_get_context,
912 .set_context = f2fs_set_context,
913 .is_encrypted = f2fs_encrypted_inode,
914 .empty_dir = f2fs_empty_dir,
915 .max_namelen = f2fs_max_namelen,
916};
917#else
918static struct fscrypt_operations f2fs_cryptops = {
919 .is_encrypted = f2fs_encrypted_inode,
920};
921#endif
922
865static struct inode *f2fs_nfs_get_inode(struct super_block *sb, 923static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
866 u64 ino, u32 generation) 924 u64 ino, u32 generation)
867{ 925{
@@ -1074,7 +1132,7 @@ static int sanity_check_raw_super(struct super_block *sb,
1074 return 0; 1132 return 0;
1075} 1133}
1076 1134
1077static int sanity_check_ckpt(struct f2fs_sb_info *sbi) 1135int sanity_check_ckpt(struct f2fs_sb_info *sbi)
1078{ 1136{
1079 unsigned int total, fsmeta; 1137 unsigned int total, fsmeta;
1080 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); 1138 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
@@ -1134,14 +1192,15 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
1134 1192
1135/* 1193/*
1136 * Read f2fs raw super block. 1194 * Read f2fs raw super block.
1137 * Because we have two copies of super block, so read the first one at first, 1195 * Because we have two copies of the super block, read both of them
1138 * if the first one is invalid, move to read the second one. 1196 * to get the first valid one. If either of them is broken, we pass
 1197 * the recovery flag back to the caller.
1139 */ 1198 */
1140static int read_raw_super_block(struct super_block *sb, 1199static int read_raw_super_block(struct super_block *sb,
1141 struct f2fs_super_block **raw_super, 1200 struct f2fs_super_block **raw_super,
1142 int *valid_super_block, int *recovery) 1201 int *valid_super_block, int *recovery)
1143{ 1202{
1144 int block = 0; 1203 int block;
1145 struct buffer_head *bh; 1204 struct buffer_head *bh;
1146 struct f2fs_super_block *super, *buf; 1205 struct f2fs_super_block *super, *buf;
1147 int err = 0; 1206 int err = 0;
@@ -1149,50 +1208,48 @@ static int read_raw_super_block(struct super_block *sb,
1149 super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); 1208 super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL);
1150 if (!super) 1209 if (!super)
1151 return -ENOMEM; 1210 return -ENOMEM;
1152retry: 1211
1153 bh = sb_bread(sb, block); 1212 for (block = 0; block < 2; block++) {
1154 if (!bh) { 1213 bh = sb_bread(sb, block);
1155 *recovery = 1; 1214 if (!bh) {
1156 f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", 1215 f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
1157 block + 1); 1216 block + 1);
1158 err = -EIO; 1217 err = -EIO;
1159 goto next; 1218 continue;
1160 } 1219 }
1161 1220
1162 buf = (struct f2fs_super_block *)(bh->b_data + F2FS_SUPER_OFFSET); 1221 buf = (struct f2fs_super_block *)
1222 (bh->b_data + F2FS_SUPER_OFFSET);
1163 1223
1164 /* sanity checking of raw super */ 1224 /* sanity checking of raw super */
1165 if (sanity_check_raw_super(sb, buf)) { 1225 if (sanity_check_raw_super(sb, buf)) {
1166 brelse(bh); 1226 f2fs_msg(sb, KERN_ERR,
1167 *recovery = 1; 1227 "Can't find valid F2FS filesystem in %dth superblock",
1168 f2fs_msg(sb, KERN_ERR, 1228 block + 1);
1169 "Can't find valid F2FS filesystem in %dth superblock", 1229 err = -EINVAL;
1170 block + 1); 1230 brelse(bh);
1171 err = -EINVAL; 1231 continue;
1172 goto next; 1232 }
1173 }
1174 1233
1175 if (!*raw_super) { 1234 if (!*raw_super) {
1176 memcpy(super, buf, sizeof(*super)); 1235 memcpy(super, buf, sizeof(*super));
1177 *valid_super_block = block; 1236 *valid_super_block = block;
1178 *raw_super = super; 1237 *raw_super = super;
1238 }
1239 brelse(bh);
1179 } 1240 }
1180 brelse(bh);
1181 1241
1182next: 1242 /* failed to read any of the superblocks */
1183 /* check the validity of the second superblock */ 1243 if (err < 0)
1184 if (block == 0) { 1244 *recovery = 1;
1185 block++;
1186 goto retry;
1187 }
1188 1245
1189 /* No valid superblock */ 1246 /* No valid superblock */
1190 if (!*raw_super) { 1247 if (!*raw_super)
1191 kfree(super); 1248 kfree(super);
1192 return err; 1249 else
1193 } 1250 err = 0;
1194 1251
1195 return 0; 1252 return err;
1196} 1253}
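
The rewritten read_raw_super_block() scans both copies in one loop, keeps the first valid one, and flags recovery whenever any copy failed, even if the other was fine. A userspace model of the same control flow (the validity field stands in for sanity_check_raw_super(); error values mimic -EIO/-EINVAL):

#include <stdio.h>
#include <string.h>

struct sb_copy { int readable; int valid; char data[16]; };

static int read_raw_super(struct sb_copy copies[2], char *out, int *recovery)
{
	int err = 0;
	int have = 0;

	for (int block = 0; block < 2; block++) {
		if (!copies[block].readable) {
			err = -5;		/* -EIO */
			continue;
		}
		if (!copies[block].valid) {
			err = -22;		/* -EINVAL */
			continue;
		}
		if (!have) {			/* keep the first valid copy */
			memcpy(out, copies[block].data, sizeof(copies[block].data));
			have = 1;
		}
	}
	if (err < 0)
		*recovery = 1;			/* some copy needs rewriting */
	return have ? 0 : err;
}

int main(void)
{
	struct sb_copy copies[2] = {
		{ .readable = 1, .valid = 0, .data = "bad" },
		{ .readable = 1, .valid = 1, .data = "good" },
	};
	char sb[16];
	int recovery = 0;
	int ret = read_raw_super(copies, sb, &recovery);

	printf("ret=%d recovery=%d sb=%s\n", ret, recovery, sb);
	return 0;
}
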
1197 1254
1198static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block) 1255static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block)
@@ -1242,6 +1299,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1242 bool retry = true, need_fsck = false; 1299 bool retry = true, need_fsck = false;
1243 char *options = NULL; 1300 char *options = NULL;
1244 int recovery, i, valid_super_block; 1301 int recovery, i, valid_super_block;
1302 struct curseg_info *seg_i;
1245 1303
1246try_onemore: 1304try_onemore:
1247 err = -EINVAL; 1305 err = -EINVAL;
@@ -1254,6 +1312,15 @@ try_onemore:
1254 if (!sbi) 1312 if (!sbi)
1255 return -ENOMEM; 1313 return -ENOMEM;
1256 1314
1315 /* Load the checksum driver */
1316 sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
1317 if (IS_ERR(sbi->s_chksum_driver)) {
1318 f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver.");
1319 err = PTR_ERR(sbi->s_chksum_driver);
1320 sbi->s_chksum_driver = NULL;
1321 goto free_sbi;
1322 }
1323
1257 /* set a block size */ 1324 /* set a block size */
1258 if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { 1325 if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
1259 f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); 1326 f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
@@ -1285,6 +1352,7 @@ try_onemore:
1285 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 1352 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1286 1353
1287 sb->s_op = &f2fs_sops; 1354 sb->s_op = &f2fs_sops;
1355 sb->s_cop = &f2fs_cryptops;
1288 sb->s_xattr = f2fs_xattr_handlers; 1356 sb->s_xattr = f2fs_xattr_handlers;
1289 sb->s_export_op = &f2fs_export_ops; 1357 sb->s_export_op = &f2fs_export_ops;
1290 sb->s_magic = F2FS_SUPER_MAGIC; 1358 sb->s_magic = F2FS_SUPER_MAGIC;
@@ -1333,13 +1401,6 @@ try_onemore:
1333 goto free_meta_inode; 1401 goto free_meta_inode;
1334 } 1402 }
1335 1403
1336 /* sanity checking of checkpoint */
1337 err = -EINVAL;
1338 if (sanity_check_ckpt(sbi)) {
1339 f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
1340 goto free_cp;
1341 }
1342
1343 sbi->total_valid_node_count = 1404 sbi->total_valid_node_count =
1344 le32_to_cpu(sbi->ckpt->valid_node_count); 1405 le32_to_cpu(sbi->ckpt->valid_node_count);
1345 sbi->total_valid_inode_count = 1406 sbi->total_valid_inode_count =
@@ -1372,6 +1433,17 @@ try_onemore:
1372 goto free_nm; 1433 goto free_nm;
1373 } 1434 }
1374 1435
1436 /* For write statistics */
1437 if (sb->s_bdev->bd_part)
1438 sbi->sectors_written_start =
1439 (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);
1440
1441 /* Read accumulated write IO statistics if exists */
1442 seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
1443 if (__exist_node_summaries(sbi))
1444 sbi->kbytes_written =
1445 le64_to_cpu(seg_i->sum_blk->journal.info.kbytes_written);
1446
1375 build_gc_manager(sbi); 1447 build_gc_manager(sbi);
1376 1448
1377 /* get an inode for node space */ 1449 /* get an inode for node space */
@@ -1466,8 +1538,10 @@ try_onemore:
1466 1538
1467 /* recover broken superblock */ 1539 /* recover broken superblock */
1468 if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { 1540 if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) {
1469 f2fs_msg(sb, KERN_INFO, "Recover invalid superblock"); 1541 err = f2fs_commit_super(sbi, true);
1470 f2fs_commit_super(sbi, true); 1542 f2fs_msg(sb, KERN_INFO,
1543 "Try to recover %dth superblock, ret: %ld",
1544 sbi->valid_super_block ? 1 : 2, err);
1471 } 1545 }
1472 1546
1473 f2fs_update_time(sbi, CP_TIME); 1547 f2fs_update_time(sbi, CP_TIME);
@@ -1496,7 +1570,6 @@ free_nm:
1496 destroy_node_manager(sbi); 1570 destroy_node_manager(sbi);
1497free_sm: 1571free_sm:
1498 destroy_segment_manager(sbi); 1572 destroy_segment_manager(sbi);
1499free_cp:
1500 kfree(sbi->ckpt); 1573 kfree(sbi->ckpt);
1501free_meta_inode: 1574free_meta_inode:
1502 make_bad_inode(sbi->meta_inode); 1575 make_bad_inode(sbi->meta_inode);
@@ -1506,6 +1579,8 @@ free_options:
1506free_sb_buf: 1579free_sb_buf:
1507 kfree(raw_super); 1580 kfree(raw_super);
1508free_sbi: 1581free_sbi:
1582 if (sbi->s_chksum_driver)
1583 crypto_free_shash(sbi->s_chksum_driver);
1509 kfree(sbi); 1584 kfree(sbi);
1510 1585
1511 /* give only one another chance */ 1586 /* give only one another chance */
@@ -1585,13 +1660,9 @@ static int __init init_f2fs_fs(void)
1585 err = -ENOMEM; 1660 err = -ENOMEM;
1586 goto free_extent_cache; 1661 goto free_extent_cache;
1587 } 1662 }
1588 err = f2fs_init_crypto();
1589 if (err)
1590 goto free_kset;
1591
1592 err = register_shrinker(&f2fs_shrinker_info); 1663 err = register_shrinker(&f2fs_shrinker_info);
1593 if (err) 1664 if (err)
1594 goto free_crypto; 1665 goto free_kset;
1595 1666
1596 err = register_filesystem(&f2fs_fs_type); 1667 err = register_filesystem(&f2fs_fs_type);
1597 if (err) 1668 if (err)
@@ -1606,8 +1677,6 @@ free_filesystem:
1606 unregister_filesystem(&f2fs_fs_type); 1677 unregister_filesystem(&f2fs_fs_type);
1607free_shrinker: 1678free_shrinker:
1608 unregister_shrinker(&f2fs_shrinker_info); 1679 unregister_shrinker(&f2fs_shrinker_info);
1609free_crypto:
1610 f2fs_exit_crypto();
1611free_kset: 1680free_kset:
1612 kset_unregister(f2fs_kset); 1681 kset_unregister(f2fs_kset);
1613free_extent_cache: 1682free_extent_cache:
@@ -1630,7 +1699,6 @@ static void __exit exit_f2fs_fs(void)
1630 f2fs_destroy_root_stats(); 1699 f2fs_destroy_root_stats();
1631 unregister_shrinker(&f2fs_shrinker_info); 1700 unregister_shrinker(&f2fs_shrinker_info);
1632 unregister_filesystem(&f2fs_fs_type); 1701 unregister_filesystem(&f2fs_fs_type);
1633 f2fs_exit_crypto();
1634 destroy_extent_cache(); 1702 destroy_extent_cache();
1635 destroy_checkpoint_caches(); 1703 destroy_checkpoint_caches();
1636 destroy_segment_manager_caches(); 1704 destroy_segment_manager_caches();
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 145fb659ad44..562ce0821559 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -29,7 +29,8 @@ static inline void __print_last_io(void)
29 last_io.major, last_io.minor, 29 last_io.major, last_io.minor,
30 last_io.pid, "----------------", 30 last_io.pid, "----------------",
31 last_io.type, 31 last_io.type,
32 last_io.fio.rw, last_io.fio.blk_addr, 32 last_io.fio.rw,
33 last_io.fio.new_blkaddr,
33 last_io.len); 34 last_io.len);
34 memset(&last_io, 0, sizeof(last_io)); 35 memset(&last_io, 0, sizeof(last_io));
35} 36}
@@ -101,7 +102,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
101 last_io.pid == pid && 102 last_io.pid == pid &&
102 last_io.type == __file_type(inode, pid) && 103 last_io.type == __file_type(inode, pid) &&
103 last_io.fio.rw == fio->rw && 104 last_io.fio.rw == fio->rw &&
104 last_io.fio.blk_addr + last_io.len == fio->blk_addr) { 105 last_io.fio.new_blkaddr + last_io.len ==
106 fio->new_blkaddr) {
105 last_io.len++; 107 last_io.len++;
106 return; 108 return;
107 } 109 }
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 10f1e784fa23..06a72dc0191a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -300,7 +300,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
300 300
301 if (ipage) { 301 if (ipage) {
302 inline_addr = inline_xattr_addr(ipage); 302 inline_addr = inline_xattr_addr(ipage);
303 f2fs_wait_on_page_writeback(ipage, NODE); 303 f2fs_wait_on_page_writeback(ipage, NODE, true);
304 } else { 304 } else {
305 page = get_node_page(sbi, inode->i_ino); 305 page = get_node_page(sbi, inode->i_ino);
306 if (IS_ERR(page)) { 306 if (IS_ERR(page)) {
@@ -308,7 +308,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
308 return PTR_ERR(page); 308 return PTR_ERR(page);
309 } 309 }
310 inline_addr = inline_xattr_addr(page); 310 inline_addr = inline_xattr_addr(page);
311 f2fs_wait_on_page_writeback(page, NODE); 311 f2fs_wait_on_page_writeback(page, NODE, true);
312 } 312 }
313 memcpy(inline_addr, txattr_addr, inline_size); 313 memcpy(inline_addr, txattr_addr, inline_size);
314 f2fs_put_page(page, 1); 314 f2fs_put_page(page, 1);
@@ -329,7 +329,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
329 return PTR_ERR(xpage); 329 return PTR_ERR(xpage);
330 } 330 }
331 f2fs_bug_on(sbi, new_nid); 331 f2fs_bug_on(sbi, new_nid);
332 f2fs_wait_on_page_writeback(xpage, NODE); 332 f2fs_wait_on_page_writeback(xpage, NODE, true);
333 } else { 333 } else {
334 struct dnode_of_data dn; 334 struct dnode_of_data dn;
335 set_new_dnode(&dn, inode, NULL, NULL, new_nid); 335 set_new_dnode(&dn, inode, NULL, NULL, new_nid);
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 79dccc8252dd..f990de20cdcd 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -126,7 +126,8 @@ extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
126 126
127#define f2fs_xattr_handlers NULL 127#define f2fs_xattr_handlers NULL
128static inline int f2fs_setxattr(struct inode *inode, int index, 128static inline int f2fs_setxattr(struct inode *inode, int index,
129 const char *name, const void *value, size_t size, int flags) 129 const char *name, const void *value, size_t size,
130 struct page *page, int flags)
130{ 131{
131 return -EOPNOTSUPP; 132 return -EOPNOTSUPP;
132} 133}
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index 182f9ffe2b51..3ff1772f612e 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -93,8 +93,24 @@ config FAT_DEFAULT_IOCHARSET
93 that most of your FAT filesystems use, and can be overridden 93 that most of your FAT filesystems use, and can be overridden
94 with the "iocharset" mount option for FAT filesystems. 94 with the "iocharset" mount option for FAT filesystems.
95 Note that "utf8" is not recommended for FAT filesystems. 95 Note that "utf8" is not recommended for FAT filesystems.
96 If unsure, you shouldn't set "utf8" here. 96 If unsure, you shouldn't set "utf8" here - select the next option
97 instead if you would like to use UTF-8 encoded file names by default.
97 See <file:Documentation/filesystems/vfat.txt> for more information. 98 See <file:Documentation/filesystems/vfat.txt> for more information.
98 99
99 Enable any character sets you need in File Systems/Native Language 100 Enable any character sets you need in File Systems/Native Language
100 Support. 101 Support.
102
103config FAT_DEFAULT_UTF8
104 bool "Enable FAT UTF-8 option by default"
105 depends on VFAT_FS
106 default n
107 help
108 Set this if you would like to have "utf8" mount option set
109 by default when mounting FAT filesystems.
110
111 Even if you say Y here can always disable UTF-8 for
112 particular mount by adding "utf8=0" to mount options.
113
114 Say Y if you use UTF-8 encoding for file names, N otherwise.
115
116 See <file:Documentation/filesystems/vfat.txt> for more information.
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a5599052116c..226281068a46 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1127,7 +1127,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
1127 } 1127 }
1128 opts->name_check = 'n'; 1128 opts->name_check = 'n';
1129 opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; 1129 opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0;
1130 opts->utf8 = opts->unicode_xlate = 0; 1130 opts->unicode_xlate = 0;
1131 opts->numtail = 1; 1131 opts->numtail = 1;
1132 opts->usefree = opts->nocase = 0; 1132 opts->usefree = opts->nocase = 0;
1133 opts->tz_set = 0; 1133 opts->tz_set = 0;
@@ -1135,6 +1135,8 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
1135 opts->errors = FAT_ERRORS_RO; 1135 opts->errors = FAT_ERRORS_RO;
1136 *debug = 0; 1136 *debug = 0;
1137 1137
1138 opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;
1139
1138 if (!options) 1140 if (!options)
1139 goto out; 1141 goto out;
1140 1142
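
With the new Kconfig symbol, parse_options() seeds the utf8 flag from IS_ENABLED() before any mount options are parsed, and only for vfat. A toy demonstration; the IS_ENABLED() below is a simplified stand-in for the kernel macro, and the config value is hard-coded to simulate CONFIG_FAT_DEFAULT_UTF8=y:

#include <stdbool.h>
#include <stdio.h>

#define CONFIG_FAT_DEFAULT_UTF8 1	/* pretend .config set it to y */
#define IS_ENABLED(opt) (opt)		/* simplified stand-in */

int main(void)
{
	bool is_vfat = true;
	bool utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;

	printf("default utf8 = %d\n", utf8);	/* mount options may still override */
	return 0;
}
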
diff --git a/fs/fhandle.c b/fs/fhandle.c
index d59712dfa3e7..ca3c3dd01789 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd,
228 path_put(&path); 228 path_put(&path);
229 return fd; 229 return fd;
230 } 230 }
231 file = file_open_root(path.dentry, path.mnt, "", open_flag); 231 file = file_open_root(path.dentry, path.mnt, "", open_flag, 0);
232 if (IS_ERR(file)) { 232 if (IS_ERR(file)) {
233 put_unused_fd(fd); 233 put_unused_fd(fd);
234 retval = PTR_ERR(file); 234 retval = PTR_ERR(file);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5c46ed9f3e14..fee81e8768c9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -281,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
281 wb_get(wb); 281 wb_get(wb);
282 spin_unlock(&inode->i_lock); 282 spin_unlock(&inode->i_lock);
283 spin_lock(&wb->list_lock); 283 spin_lock(&wb->list_lock);
284 wb_put(wb); /* not gonna deref it anymore */
285 284
286 /* i_wb may have changed inbetween, can't use inode_to_wb() */ 285 /* i_wb may have changed inbetween, can't use inode_to_wb() */
287 if (likely(wb == inode->i_wb)) 286 if (likely(wb == inode->i_wb)) {
288 return wb; /* @inode already has ref */ 287 wb_put(wb); /* @inode already has ref */
288 return wb;
289 }
289 290
290 spin_unlock(&wb->list_lock); 291 spin_unlock(&wb->list_lock);
292 wb_put(wb);
291 cpu_relax(); 293 cpu_relax();
292 spin_lock(&inode->i_lock); 294 spin_lock(&inode->i_lock);
293 } 295 }
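
The reordering above delays the wb_put() until after the inode/wb association has been re-checked under the list lock, retrying when the inode was switched in the meantime. A userspace sketch of the take-ref/lock/re-check/retry pattern, with stand-in types and mutexes in place of the kernel primitives:

#include <pthread.h>
#include <stdio.h>

struct wb { pthread_mutex_t list_lock; int refs; };
struct inode { pthread_mutex_t i_lock; struct wb *i_wb; };

static struct wb *lock_wb_list(struct inode *inode)
{
	for (;;) {
		pthread_mutex_lock(&inode->i_lock);
		struct wb *wb = inode->i_wb;

		wb->refs++;			/* wb_get() */
		pthread_mutex_unlock(&inode->i_lock);
		pthread_mutex_lock(&wb->list_lock);
		if (wb == inode->i_wb) {	/* association unchanged? */
			wb->refs--;		/* wb_put(): inode holds a ref */
			return wb;		/* list_lock held on return */
		}
		pthread_mutex_unlock(&wb->list_lock);
		wb->refs--;			/* wb_put() before retrying */
	}
}

int main(void)
{
	struct wb w = { PTHREAD_MUTEX_INITIALIZER, 1 };
	struct inode ino = { PTHREAD_MUTEX_INITIALIZER, &w };
	struct wb *locked = lock_wb_list(&ino);

	printf("locked wb, refs=%d\n", locked->refs);
	pthread_mutex_unlock(&locked->list_lock);
	return 0;
}
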
@@ -1337,10 +1339,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1337 * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() 1339 * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
1338 * and does more profound writeback list handling in writeback_sb_inodes(). 1340 * and does more profound writeback list handling in writeback_sb_inodes().
1339 */ 1341 */
1340static int 1342static int writeback_single_inode(struct inode *inode,
1341writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, 1343 struct writeback_control *wbc)
1342 struct writeback_control *wbc)
1343{ 1344{
1345 struct bdi_writeback *wb;
1344 int ret = 0; 1346 int ret = 0;
1345 1347
1346 spin_lock(&inode->i_lock); 1348 spin_lock(&inode->i_lock);
@@ -1378,7 +1380,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
1378 ret = __writeback_single_inode(inode, wbc); 1380 ret = __writeback_single_inode(inode, wbc);
1379 1381
1380 wbc_detach_inode(wbc); 1382 wbc_detach_inode(wbc);
1381 spin_lock(&wb->list_lock); 1383
1384 wb = inode_to_wb_and_lock_list(inode);
1382 spin_lock(&inode->i_lock); 1385 spin_lock(&inode->i_lock);
1383 /* 1386 /*
1384 * If inode is clean, remove it from writeback lists. Otherwise don't 1387 * If inode is clean, remove it from writeback lists. Otherwise don't
@@ -1453,6 +1456,7 @@ static long writeback_sb_inodes(struct super_block *sb,
1453 1456
1454 while (!list_empty(&wb->b_io)) { 1457 while (!list_empty(&wb->b_io)) {
1455 struct inode *inode = wb_inode(wb->b_io.prev); 1458 struct inode *inode = wb_inode(wb->b_io.prev);
1459 struct bdi_writeback *tmp_wb;
1456 1460
1457 if (inode->i_sb != sb) { 1461 if (inode->i_sb != sb) {
1458 if (work->sb) { 1462 if (work->sb) {
@@ -1543,15 +1547,23 @@ static long writeback_sb_inodes(struct super_block *sb,
1543 cond_resched(); 1547 cond_resched();
1544 } 1548 }
1545 1549
1546 1550 /*
1547 spin_lock(&wb->list_lock); 1551 * Requeue @inode if still dirty. Be careful as @inode may
1552 * have been switched to another wb in the meantime.
1553 */
1554 tmp_wb = inode_to_wb_and_lock_list(inode);
1548 spin_lock(&inode->i_lock); 1555 spin_lock(&inode->i_lock);
1549 if (!(inode->i_state & I_DIRTY_ALL)) 1556 if (!(inode->i_state & I_DIRTY_ALL))
1550 wrote++; 1557 wrote++;
1551 requeue_inode(inode, wb, &wbc); 1558 requeue_inode(inode, tmp_wb, &wbc);
1552 inode_sync_complete(inode); 1559 inode_sync_complete(inode);
1553 spin_unlock(&inode->i_lock); 1560 spin_unlock(&inode->i_lock);
1554 1561
1562 if (unlikely(tmp_wb != wb)) {
1563 spin_unlock(&tmp_wb->list_lock);
1564 spin_lock(&wb->list_lock);
1565 }
1566
1555 /* 1567 /*
1556 * bail out to wb_writeback() often enough to check 1568 * bail out to wb_writeback() often enough to check
1557 * background threshold and other termination conditions. 1569 * background threshold and other termination conditions.
@@ -2338,7 +2350,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
2338 */ 2350 */
2339int write_inode_now(struct inode *inode, int sync) 2351int write_inode_now(struct inode *inode, int sync)
2340{ 2352{
2341 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
2342 struct writeback_control wbc = { 2353 struct writeback_control wbc = {
2343 .nr_to_write = LONG_MAX, 2354 .nr_to_write = LONG_MAX,
2344 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, 2355 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -2350,7 +2361,7 @@ int write_inode_now(struct inode *inode, int sync)
2350 wbc.nr_to_write = 0; 2361 wbc.nr_to_write = 0;
2351 2362
2352 might_sleep(); 2363 might_sleep();
2353 return writeback_single_inode(inode, wb, &wbc); 2364 return writeback_single_inode(inode, &wbc);
2354} 2365}
2355EXPORT_SYMBOL(write_inode_now); 2366EXPORT_SYMBOL(write_inode_now);
2356 2367
@@ -2367,7 +2378,7 @@ EXPORT_SYMBOL(write_inode_now);
2367 */ 2378 */
2368int sync_inode(struct inode *inode, struct writeback_control *wbc) 2379int sync_inode(struct inode *inode, struct writeback_control *wbc)
2369{ 2380{
2370 return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); 2381 return writeback_single_inode(inode, wbc);
2371} 2382}
2372EXPORT_SYMBOL(sync_inode); 2383EXPORT_SYMBOL(sync_inode);
2373 2384
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 8e3ee1936c7e..c5b6b7165489 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt)
90 90
91static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) 91static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
92{ 92{
93 struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; 93 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
94 loff_t pos = 0; 94 loff_t pos = 0;
95 95
96 return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE); 96 return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE);
@@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
98 98
99static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from) 99static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from)
100{ 100{
101 struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; 101 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
102 loff_t pos = 0; 102 loff_t pos = 0;
103 /* 103 /*
104 * No locking or generic_write_checks(), the server is 104 * No locking or generic_write_checks(), the server is
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b03d253ece15..9dde38f12c07 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -528,6 +528,11 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
528 } 528 }
529} 529}
530 530
531static void fuse_io_release(struct kref *kref)
532{
533 kfree(container_of(kref, struct fuse_io_priv, refcnt));
534}
535
531static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) 536static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
532{ 537{
533 if (io->err) 538 if (io->err)
@@ -585,8 +590,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
585 } 590 }
586 591
587 io->iocb->ki_complete(io->iocb, res, 0); 592 io->iocb->ki_complete(io->iocb, res, 0);
588 kfree(io);
589 } 593 }
594
595 kref_put(&io->refcnt, fuse_io_release);
590} 596}
591 597
592static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) 598static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
@@ -613,6 +619,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
613 size_t num_bytes, struct fuse_io_priv *io) 619 size_t num_bytes, struct fuse_io_priv *io)
614{ 620{
615 spin_lock(&io->lock); 621 spin_lock(&io->lock);
622 kref_get(&io->refcnt);
616 io->size += num_bytes; 623 io->size += num_bytes;
617 io->reqs++; 624 io->reqs++;
618 spin_unlock(&io->lock); 625 spin_unlock(&io->lock);
@@ -691,7 +698,7 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode,
691 698
692static int fuse_do_readpage(struct file *file, struct page *page) 699static int fuse_do_readpage(struct file *file, struct page *page)
693{ 700{
694 struct fuse_io_priv io = { .async = 0, .file = file }; 701 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
695 struct inode *inode = page->mapping->host; 702 struct inode *inode = page->mapping->host;
696 struct fuse_conn *fc = get_fuse_conn(inode); 703 struct fuse_conn *fc = get_fuse_conn(inode);
697 struct fuse_req *req; 704 struct fuse_req *req;
@@ -984,7 +991,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
984 size_t res; 991 size_t res;
985 unsigned offset; 992 unsigned offset;
986 unsigned i; 993 unsigned i;
987 struct fuse_io_priv io = { .async = 0, .file = file }; 994 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
988 995
989 for (i = 0; i < req->num_pages; i++) 996 for (i = 0; i < req->num_pages; i++)
990 fuse_wait_on_page_writeback(inode, req->pages[i]->index); 997 fuse_wait_on_page_writeback(inode, req->pages[i]->index);
@@ -1240,6 +1247,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1240 size_t *nbytesp, int write) 1247 size_t *nbytesp, int write)
1241{ 1248{
1242 size_t nbytes = 0; /* # bytes already packed in req */ 1249 size_t nbytes = 0; /* # bytes already packed in req */
1250 ssize_t ret = 0;
1243 1251
1244 /* Special case for kernel I/O: can copy directly into the buffer */ 1252 /* Special case for kernel I/O: can copy directly into the buffer */
1245 if (ii->type & ITER_KVEC) { 1253 if (ii->type & ITER_KVEC) {
@@ -1259,13 +1267,12 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1259 while (nbytes < *nbytesp && req->num_pages < req->max_pages) { 1267 while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
1260 unsigned npages; 1268 unsigned npages;
1261 size_t start; 1269 size_t start;
1262 ssize_t ret = iov_iter_get_pages(ii, 1270 ret = iov_iter_get_pages(ii, &req->pages[req->num_pages],
1263 &req->pages[req->num_pages],
1264 *nbytesp - nbytes, 1271 *nbytesp - nbytes,
1265 req->max_pages - req->num_pages, 1272 req->max_pages - req->num_pages,
1266 &start); 1273 &start);
1267 if (ret < 0) 1274 if (ret < 0)
1268 return ret; 1275 break;
1269 1276
1270 iov_iter_advance(ii, ret); 1277 iov_iter_advance(ii, ret);
1271 nbytes += ret; 1278 nbytes += ret;
@@ -1288,7 +1295,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1288 1295
1289 *nbytesp = nbytes; 1296 *nbytesp = nbytes;
1290 1297
1291 return 0; 1298 return ret;
1292} 1299}
1293 1300
1294static inline int fuse_iter_npages(const struct iov_iter *ii_p) 1301static inline int fuse_iter_npages(const struct iov_iter *ii_p)
@@ -1312,6 +1319,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1312 pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT; 1319 pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
1313 ssize_t res = 0; 1320 ssize_t res = 0;
1314 struct fuse_req *req; 1321 struct fuse_req *req;
1322 int err = 0;
1315 1323
1316 if (io->async) 1324 if (io->async)
1317 req = fuse_get_req_for_background(fc, fuse_iter_npages(iter)); 1325 req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
@@ -1332,11 +1340,9 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1332 size_t nres; 1340 size_t nres;
1333 fl_owner_t owner = current->files; 1341 fl_owner_t owner = current->files;
1334 size_t nbytes = min(count, nmax); 1342 size_t nbytes = min(count, nmax);
1335 int err = fuse_get_user_pages(req, iter, &nbytes, write); 1343 err = fuse_get_user_pages(req, iter, &nbytes, write);
1336 if (err) { 1344 if (err && !nbytes)
1337 res = err;
1338 break; 1345 break;
1339 }
1340 1346
1341 if (write) 1347 if (write)
1342 nres = fuse_send_write(req, io, pos, nbytes, owner); 1348 nres = fuse_send_write(req, io, pos, nbytes, owner);
@@ -1346,11 +1352,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1346 if (!io->async) 1352 if (!io->async)
1347 fuse_release_user_pages(req, !write); 1353 fuse_release_user_pages(req, !write);
1348 if (req->out.h.error) { 1354 if (req->out.h.error) {
1349 if (!res) 1355 err = req->out.h.error;
1350 res = req->out.h.error;
1351 break; 1356 break;
1352 } else if (nres > nbytes) { 1357 } else if (nres > nbytes) {
1353 res = -EIO; 1358 res = 0;
1359 err = -EIO;
1354 break; 1360 break;
1355 } 1361 }
1356 count -= nres; 1362 count -= nres;
@@ -1374,7 +1380,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1374 if (res > 0) 1380 if (res > 0)
1375 *ppos = pos; 1381 *ppos = pos;
1376 1382
1377 return res; 1383 return res > 0 ? res : err;
1378} 1384}
1379EXPORT_SYMBOL_GPL(fuse_direct_io); 1385EXPORT_SYMBOL_GPL(fuse_direct_io);
1380 1386
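
fuse_direct_io() now tracks the first error separately from the byte count and reports partial progress whenever any bytes moved. A minimal model of that return convention (the chunk-transfer helper is invented for the demo):

#include <stdio.h>

static long transfer_chunk(int i)
{
	return i < 2 ? 512 : -5;	/* third chunk fails, like -EIO */
}

static long direct_io(int nchunks)
{
	long res = 0;
	long err = 0;

	for (int i = 0; i < nchunks; i++) {
		long nres = transfer_chunk(i);

		if (nres < 0) {
			err = nres;
			break;
		}
		res += nres;
	}
	return res > 0 ? res : err;	/* partial progress wins over err */
}

int main(void)
{
	printf("2 chunks: %ld\n", direct_io(2));	/* 1024 */
	printf("4 chunks: %ld\n", direct_io(4));	/* 1024, not -5 */
	return 0;
}
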
@@ -1398,7 +1404,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
1398 1404
1399static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) 1405static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
1400{ 1406{
1401 struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp }; 1407 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp);
1402 return __fuse_direct_read(&io, to, &iocb->ki_pos); 1408 return __fuse_direct_read(&io, to, &iocb->ki_pos);
1403} 1409}
1404 1410
@@ -1406,7 +1412,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
1406{ 1412{
1407 struct file *file = iocb->ki_filp; 1413 struct file *file = iocb->ki_filp;
1408 struct inode *inode = file_inode(file); 1414 struct inode *inode = file_inode(file);
1409 struct fuse_io_priv io = { .async = 0, .file = file }; 1415 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
1410 ssize_t res; 1416 ssize_t res;
1411 1417
1412 if (is_bad_inode(inode)) 1418 if (is_bad_inode(inode))
@@ -2843,6 +2849,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2843 loff_t i_size; 2849 loff_t i_size;
2844 size_t count = iov_iter_count(iter); 2850 size_t count = iov_iter_count(iter);
2845 struct fuse_io_priv *io; 2851 struct fuse_io_priv *io;
2852 bool is_sync = is_sync_kiocb(iocb);
2846 2853
2847 pos = offset; 2854 pos = offset;
2848 inode = file->f_mapping->host; 2855 inode = file->f_mapping->host;
@@ -2863,6 +2870,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2863 if (!io) 2870 if (!io)
2864 return -ENOMEM; 2871 return -ENOMEM;
2865 spin_lock_init(&io->lock); 2872 spin_lock_init(&io->lock);
2873 kref_init(&io->refcnt);
2866 io->reqs = 1; 2874 io->reqs = 1;
2867 io->bytes = -1; 2875 io->bytes = -1;
2868 io->size = 0; 2876 io->size = 0;
@@ -2882,12 +2890,18 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2882 * to wait on real async I/O requests, so we must submit this request 2890 * to wait on real async I/O requests, so we must submit this request
2883 * synchronously. 2891 * synchronously.
2884 */ 2892 */
2885 if (!is_sync_kiocb(iocb) && (offset + count > i_size) && 2893 if (!is_sync && (offset + count > i_size) &&
2886 iov_iter_rw(iter) == WRITE) 2894 iov_iter_rw(iter) == WRITE)
2887 io->async = false; 2895 io->async = false;
2888 2896
2889 if (io->async && is_sync_kiocb(iocb)) 2897 if (io->async && is_sync) {
2898 /*
2899 * Additional reference to keep io around after
2900 * calling fuse_aio_complete()
2901 */
2902 kref_get(&io->refcnt);
2890 io->done = &wait; 2903 io->done = &wait;
2904 }
2891 2905
2892 if (iov_iter_rw(iter) == WRITE) { 2906 if (iov_iter_rw(iter) == WRITE) {
2893 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); 2907 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
@@ -2900,14 +2914,14 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2900 fuse_aio_complete(io, ret < 0 ? ret : 0, -1); 2914 fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
2901 2915
2902 /* we have a non-extending, async request, so return */ 2916 /* we have a non-extending, async request, so return */
2903 if (!is_sync_kiocb(iocb)) 2917 if (!is_sync)
2904 return -EIOCBQUEUED; 2918 return -EIOCBQUEUED;
2905 2919
2906 wait_for_completion(&wait); 2920 wait_for_completion(&wait);
2907 ret = fuse_get_res_by_io(io); 2921 ret = fuse_get_res_by_io(io);
2908 } 2922 }
2909 2923
2910 kfree(io); 2924 kref_put(&io->refcnt, fuse_io_release);
2911 2925
2912 if (iov_iter_rw(iter) == WRITE) { 2926 if (iov_iter_rw(iter) == WRITE) {
2913 if (ret > 0) 2927 if (ret > 0)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ce394b5fe6b4..eddbe02c4028 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -22,6 +22,7 @@
22#include <linux/rbtree.h> 22#include <linux/rbtree.h>
23#include <linux/poll.h> 23#include <linux/poll.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25#include <linux/kref.h>
25 26
26/** Max number of pages that can be used in a single read request */ 27/** Max number of pages that can be used in a single read request */
27#define FUSE_MAX_PAGES_PER_REQ 32 28#define FUSE_MAX_PAGES_PER_REQ 32
@@ -243,6 +244,7 @@ struct fuse_args {
243 244
244/** The request IO state (for asynchronous processing) */ 245/** The request IO state (for asynchronous processing) */
245struct fuse_io_priv { 246struct fuse_io_priv {
247 struct kref refcnt;
246 int async; 248 int async;
247 spinlock_t lock; 249 spinlock_t lock;
248 unsigned reqs; 250 unsigned reqs;
@@ -256,6 +258,13 @@ struct fuse_io_priv {
256 struct completion *done; 258 struct completion *done;
257}; 259};
258 260
261#define FUSE_IO_PRIV_SYNC(f) \
262{ \
263 .refcnt = { ATOMIC_INIT(1) }, \
264 .async = 0, \
265 .file = f, \
266}
267
259/** 268/**
260 * Request flags 269 * Request flags
261 * 270 *
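
The fuse hunks above give struct fuse_io_priv a reference count so that a synchronous submitter and the asynchronous completion path can each keep the I/O state alive, with FUSE_IO_PRIV_SYNC seeding an on-stack instance at a count of one. A minimal userspace sketch of that kref pattern, with illustrative names (io_state is not a fuse type):

/* Userspace sketch of the kref pattern the fuse patch adopts: the
 * submitter owns one reference and takes an extra one before handing
 * the state to an async completion path, so whichever side drops the
 * last reference frees it. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct io_state {
	atomic_int refcnt;	/* plays the role of struct kref */
	long result;
};

static struct io_state *io_alloc(void)
{
	struct io_state *io = malloc(sizeof(*io));

	if (!io)
		return NULL;
	atomic_init(&io->refcnt, 1);	/* kref_init() */
	io->result = 0;
	return io;
}

static void io_get(struct io_state *io)
{
	atomic_fetch_add(&io->refcnt, 1);	/* kref_get() */
}

static void io_put(struct io_state *io)		/* kref_put(..., release) */
{
	if (atomic_fetch_sub(&io->refcnt, 1) == 1) {
		printf("releasing io, result=%ld\n", io->result);
		free(io);
	}
}

int main(void)
{
	struct io_state *io = io_alloc();

	io_get(io);	/* extra ref: keep io alive past completion */
	io->result = 4096;
	io_put(io);	/* completion side drops its reference */
	io_put(io);	/* submitter drops the last one; io is freed */
	return 0;
}

Whichever side drops the last reference frees the state, which is what lets fuse_direct_IO() replace its unconditional kfree(io) with kref_put().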
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 93f07465e5a6..aa016e4b8bec 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1082,7 +1082,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
1082 * the first place, mapping->nr_pages will always be zero. 1082 * the first place, mapping->nr_pages will always be zero.
1083 */ 1083 */
1084 if (mapping->nrpages) { 1084 if (mapping->nrpages) {
1085 loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); 1085 loff_t lstart = offset & ~(PAGE_CACHE_SIZE - 1);
1086 loff_t len = iov_iter_count(iter); 1086 loff_t len = iov_iter_count(iter);
1087 loff_t end = PAGE_ALIGN(offset + len) - 1; 1087 loff_t end = PAGE_ALIGN(offset + len) - 1;
1088 1088
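
The gfs2_direct_IO() hunk above is a one-character masking fix: offset & (PAGE_CACHE_SIZE - 1) extracts the offset *within* the page, whereas the invalidation range needs the page-aligned start, offset & ~(PAGE_CACHE_SIZE - 1). A tiny demonstration of the difference:

/* With a 4096-byte page, `off & (SZ - 1)` is the offset inside the
 * page, while `off & ~(SZ - 1)` rounds down to the page boundary the
 * code actually wanted. */
#include <stdio.h>

#define PAGE_SZ 4096UL

int main(void)
{
	unsigned long off = 10000;	/* arbitrary file offset */

	printf("within-page : %lu\n", off & (PAGE_SZ - 1));	/* 1808 */
	printf("aligned down: %lu\n", off & ~(PAGE_SZ - 1));	/* 8192 */
	return 0;
}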
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 6a92592304fb..4a01f30e9995 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -798,7 +798,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index,
798 int error; 798 int error;
799 799
800 error = get_leaf_nr(dip, index, &leaf_no); 800 error = get_leaf_nr(dip, index, &leaf_no);
801 if (!error) 801 if (!IS_ERR_VALUE(error))
802 error = get_leaf(dip, leaf_no, bh_out); 802 error = get_leaf(dip, leaf_no, bh_out);
803 803
804 return error; 804 return error;
@@ -1014,7 +1014,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
1014 1014
1015 index = name->hash >> (32 - dip->i_depth); 1015 index = name->hash >> (32 - dip->i_depth);
1016 error = get_leaf_nr(dip, index, &leaf_no); 1016 error = get_leaf_nr(dip, index, &leaf_no);
1017 if (error) 1017 if (IS_ERR_VALUE(error))
1018 return error; 1018 return error;
1019 1019
1020 /* Get the old leaf block */ 1020 /* Get the old leaf block */
@@ -1660,7 +1660,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
1660 brelse(bh); 1660 brelse(bh);
1661 if (fail_on_exist) 1661 if (fail_on_exist)
1662 return ERR_PTR(-EEXIST); 1662 return ERR_PTR(-EEXIST);
1663 inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); 1663 inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino);
1664 if (!IS_ERR(inode)) 1664 if (!IS_ERR(inode))
1665 GFS2_I(inode)->i_rahead = rahead; 1665 GFS2_I(inode)->i_rahead = rahead;
1666 return inode; 1666 return inode;
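
The get_leaf_nr() call sites above switch from plain zero/non-zero tests to IS_ERR_VALUE(), which treats the returned value as an error only when it falls in the kernel's reserved errno range. A userspace model of the macro (the real one in linux/err.h casts through a pointer type; applying it to a plain int, as this hunk does, relies on the negative errno sign-extending to unsigned long):

#include <stdio.h>

#define MAX_ERRNO 4095UL
#define IS_ERR_VALUE(x) \
	((unsigned long)(long)(x) >= (unsigned long)-MAX_ERRNO)

int main(void)
{
	printf("%d\n", IS_ERR_VALUE(-5));	/* 1: looks like an errno */
	printf("%d\n", IS_ERR_VALUE(0));	/* 0: success             */
	printf("%d\n", IS_ERR_VALUE(1234));	/* 0: ordinary value      */
	return 0;
}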
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 5d15e9498b48..d5bda8513457 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -137,7 +137,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
137 struct gfs2_sbd *sdp = sb->s_fs_info; 137 struct gfs2_sbd *sdp = sb->s_fs_info;
138 struct inode *inode; 138 struct inode *inode;
139 139
140 inode = gfs2_ilookup(sb, inum->no_addr, 0); 140 inode = gfs2_ilookup(sb, inum->no_addr);
141 if (inode) { 141 if (inode) {
142 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { 142 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
143 iput(inode); 143 iput(inode);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a4ff7b56f5cd..6539131c52a2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -572,17 +572,24 @@ static void delete_work_func(struct work_struct *work)
572 struct inode *inode; 572 struct inode *inode;
573 u64 no_addr = gl->gl_name.ln_number; 573 u64 no_addr = gl->gl_name.ln_number;
574 574
575 /* If someone's using this glock to create a new dinode, the block must
576 have been freed by another node, then re-used, in which case our
577 iopen callback is too late after the fact. Ignore it. */
578 if (test_bit(GLF_INODE_CREATING, &gl->gl_flags))
579 goto out;
580
575 ip = gl->gl_object; 581 ip = gl->gl_object;
576 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ 582 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
577 583
578 if (ip) 584 if (ip)
579 inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1); 585 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
580 else 586 else
581 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); 587 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
582 if (inode && !IS_ERR(inode)) { 588 if (inode && !IS_ERR(inode)) {
583 d_prune_aliases(inode); 589 d_prune_aliases(inode);
584 iput(inode); 590 iput(inode);
585 } 591 }
592out:
586 gfs2_glock_put(gl); 593 gfs2_glock_put(gl);
587} 594}
588 595
@@ -1015,6 +1022,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1015 handle_callback(gl, LM_ST_UNLOCKED, 0, false); 1022 handle_callback(gl, LM_ST_UNLOCKED, 0, false);
1016 1023
1017 list_del_init(&gh->gh_list); 1024 list_del_init(&gh->gh_list);
1025 clear_bit(HIF_HOLDER, &gh->gh_iflags);
1018 if (find_first_holder(gl) == NULL) { 1026 if (find_first_holder(gl) == NULL) {
1019 if (glops->go_unlock) { 1027 if (glops->go_unlock) {
1020 GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); 1028 GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 845fb09cc606..a6a3389a07fc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -328,6 +328,7 @@ enum {
328 GLF_LRU = 13, 328 GLF_LRU = 13,
329 GLF_OBJECT = 14, /* Used only for tracing */ 329 GLF_OBJECT = 14, /* Used only for tracing */
330 GLF_BLOCKING = 15, 330 GLF_BLOCKING = 15,
331 GLF_INODE_CREATING = 16, /* Inode creation occurring */
331}; 332};
332 333
333struct gfs2_glock { 334struct gfs2_glock {
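
GLF_INODE_CREATING is an ordinary glock flag bit: gfs2_create_inode() sets it before enqueuing the iopen glock and clears it on every exit path, and delete_work_func() tests it so that iopen callbacks racing with inode creation are ignored. The handshake, sketched with C11 atomics standing in for the kernel's set_bit()/test_bit()/clear_bit():

#include <stdatomic.h>
#include <stdio.h>

#define FLAG_CREATING (1u << 0)

static atomic_uint flags;

static void delete_work(void)
{
	/* test_bit(GLF_INODE_CREATING, ...) analogue */
	if (atomic_load(&flags) & FLAG_CREATING) {
		puts("creation in flight, ignoring stale callback");
		return;
	}
	puts("running delete work");
}

int main(void)
{
	atomic_fetch_or(&flags, FLAG_CREATING);		/* set_bit()   */
	delete_work();					/* skipped     */
	atomic_fetch_and(&flags, ~FLAG_CREATING);	/* clear_bit() */
	delete_work();					/* runs        */
	return 0;
}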
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 352f958769e1..bb30f9a72c65 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -37,61 +37,9 @@
37#include "super.h" 37#include "super.h"
38#include "glops.h" 38#include "glops.h"
39 39
40struct gfs2_skip_data { 40struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr)
41 u64 no_addr;
42 int skipped;
43 int non_block;
44};
45
46static int iget_test(struct inode *inode, void *opaque)
47{
48 struct gfs2_inode *ip = GFS2_I(inode);
49 struct gfs2_skip_data *data = opaque;
50
51 if (ip->i_no_addr == data->no_addr) {
52 if (data->non_block &&
53 inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
54 data->skipped = 1;
55 return 0;
56 }
57 return 1;
58 }
59 return 0;
60}
61
62static int iget_set(struct inode *inode, void *opaque)
63{
64 struct gfs2_inode *ip = GFS2_I(inode);
65 struct gfs2_skip_data *data = opaque;
66
67 if (data->skipped)
68 return -ENOENT;
69 inode->i_ino = (unsigned long)(data->no_addr);
70 ip->i_no_addr = data->no_addr;
71 return 0;
72}
73
74struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block)
75{ 41{
76 unsigned long hash = (unsigned long)no_addr; 42 return ilookup(sb, (unsigned long)no_addr);
77 struct gfs2_skip_data data;
78
79 data.no_addr = no_addr;
80 data.skipped = 0;
81 data.non_block = non_block;
82 return ilookup5(sb, hash, iget_test, &data);
83}
84
85static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr,
86 int non_block)
87{
88 struct gfs2_skip_data data;
89 unsigned long hash = (unsigned long)no_addr;
90
91 data.no_addr = no_addr;
92 data.skipped = 0;
93 data.non_block = non_block;
94 return iget5_locked(sb, hash, iget_test, iget_set, &data);
95} 43}
96 44
97/** 45/**
@@ -132,21 +80,21 @@ static void gfs2_set_iop(struct inode *inode)
132 * @sb: The super block 80 * @sb: The super block
133 * @no_addr: The inode number 81 * @no_addr: The inode number
134 * @type: The type of the inode 82 * @type: The type of the inode
135 * non_block: Can we block on inodes that are being freed?
136 * 83 *
137 * Returns: A VFS inode, or an error 84 * Returns: A VFS inode, or an error
138 */ 85 */
139 86
140struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, 87struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
141 u64 no_addr, u64 no_formal_ino, int non_block) 88 u64 no_addr, u64 no_formal_ino)
142{ 89{
143 struct inode *inode; 90 struct inode *inode;
144 struct gfs2_inode *ip; 91 struct gfs2_inode *ip;
145 struct gfs2_glock *io_gl = NULL; 92 struct gfs2_glock *io_gl = NULL;
146 int error; 93 int error;
147 94
148 inode = gfs2_iget(sb, no_addr, non_block); 95 inode = iget_locked(sb, (unsigned long)no_addr);
149 ip = GFS2_I(inode); 96 ip = GFS2_I(inode);
97 ip->i_no_addr = no_addr;
150 98
151 if (!inode) 99 if (!inode)
152 return ERR_PTR(-ENOMEM); 100 return ERR_PTR(-ENOMEM);
@@ -221,7 +169,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
221 if (error) 169 if (error)
222 goto fail; 170 goto fail;
223 171
224 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1); 172 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
225 if (IS_ERR(inode)) 173 if (IS_ERR(inode))
226 goto fail; 174 goto fail;
227 175
@@ -592,7 +540,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
592 struct inode *inode = NULL; 540 struct inode *inode = NULL;
593 struct gfs2_inode *dip = GFS2_I(dir), *ip; 541 struct gfs2_inode *dip = GFS2_I(dir), *ip;
594 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 542 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
595 struct gfs2_glock *io_gl; 543 struct gfs2_glock *io_gl = NULL;
596 int error, free_vfs_inode = 1; 544 int error, free_vfs_inode = 1;
597 u32 aflags = 0; 545 u32 aflags = 0;
598 unsigned blocks = 1; 546 unsigned blocks = 1;
@@ -729,6 +677,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
729 if (error) 677 if (error)
730 goto fail_gunlock2; 678 goto fail_gunlock2;
731 679
680 BUG_ON(test_and_set_bit(GLF_INODE_CREATING, &io_gl->gl_flags));
681
732 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); 682 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
733 if (error) 683 if (error)
734 goto fail_gunlock2; 684 goto fail_gunlock2;
@@ -771,12 +721,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
771 } 721 }
772 gfs2_glock_dq_uninit(ghs); 722 gfs2_glock_dq_uninit(ghs);
773 gfs2_glock_dq_uninit(ghs + 1); 723 gfs2_glock_dq_uninit(ghs + 1);
724 clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
774 return error; 725 return error;
775 726
776fail_gunlock3: 727fail_gunlock3:
777 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 728 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
778 gfs2_glock_put(io_gl); 729 gfs2_glock_put(io_gl);
779fail_gunlock2: 730fail_gunlock2:
731 if (io_gl)
732 clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
780 gfs2_glock_dq_uninit(ghs + 1); 733 gfs2_glock_dq_uninit(ghs + 1);
781fail_free_inode: 734fail_free_inode:
782 if (ip->i_gl) 735 if (ip->i_gl)
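
With the non_block special case gone, gfs2 can drop its private iget_test()/iget_set() pair and key the inode cache directly on no_addr through ilookup() and iget_locked(). The simplification leans on iget_locked()'s get-or-create contract: look the inode up by key, and if one had to be allocated, hand it back in a "new" state the caller must initialize. A toy userspace analogue of that contract, with a fixed array standing in for the inode hash:

#include <stdbool.h>
#include <stdio.h>

#define TABLE_SZ 8

struct toy_inode {
	unsigned long no_addr;	/* lookup key, like inode->i_ino   */
	bool in_use;
	bool is_new;		/* models the kernel's I_NEW state */
};

static struct toy_inode table[TABLE_SZ];

static struct toy_inode *toy_iget_locked(unsigned long no_addr)
{
	struct toy_inode *free_slot = NULL;

	for (int i = 0; i < TABLE_SZ; i++) {
		if (table[i].in_use && table[i].no_addr == no_addr) {
			table[i].is_new = false;	/* cache hit */
			return &table[i];
		}
		if (!table[i].in_use && !free_slot)
			free_slot = &table[i];
	}
	if (!free_slot)
		return NULL;		/* table full: the -ENOMEM case */
	free_slot->no_addr = no_addr;
	free_slot->in_use = true;
	free_slot->is_new = true;	/* caller must initialize */
	return free_slot;
}

int main(void)
{
	struct toy_inode *a = toy_iget_locked(42);

	printf("first : new=%d\n", a->is_new);	/* new=1 */
	a = toy_iget_locked(42);
	printf("second: new=%d\n", a->is_new);	/* new=0, same object */
	return 0;
}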
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index ba4d9492d422..e1af0d4aa308 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -94,12 +94,11 @@ err:
94} 94}
95 95
96extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 96extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
97 u64 no_addr, u64 no_formal_ino, 97 u64 no_addr, u64 no_formal_ino);
98 int non_block);
99extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, 98extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
100 u64 *no_formal_ino, 99 u64 *no_formal_ino,
101 unsigned int blktype); 100 unsigned int blktype);
102extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock); 101extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
103 102
104extern int gfs2_inode_refresh(struct gfs2_inode *ip); 103extern int gfs2_inode_refresh(struct gfs2_inode *ip);
105 104
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index dbed9e243ea2..49b0bff18fe3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -454,7 +454,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
454 struct dentry *dentry; 454 struct dentry *dentry;
455 struct inode *inode; 455 struct inode *inode;
456 456
457 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 457 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
458 if (IS_ERR(inode)) { 458 if (IS_ERR(inode)) {
459 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); 459 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
460 return PTR_ERR(inode); 460 return PTR_ERR(inode);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 8f960a51a9a0..f8a0cd821290 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1551,12 +1551,16 @@ static void gfs2_evict_inode(struct inode *inode)
1551 goto out_truncate; 1551 goto out_truncate;
1552 } 1552 }
1553 1553
1554 ip->i_iopen_gh.gh_flags |= GL_NOCACHE; 1554 if (ip->i_iopen_gh.gh_gl &&
1555 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1555 test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
1556 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 1556 ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
1557 error = gfs2_glock_nq(&ip->i_iopen_gh); 1557 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1558 if (error) 1558 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE,
1559 goto out_truncate; 1559 &ip->i_iopen_gh);
1560 error = gfs2_glock_nq(&ip->i_iopen_gh);
1561 if (error)
1562 goto out_truncate;
1563 }
1560 1564
1561 /* Case 1 starts here */ 1565 /* Case 1 starts here */
1562 1566
@@ -1606,11 +1610,13 @@ out_unlock:
1606 if (gfs2_rs_active(&ip->i_res)) 1610 if (gfs2_rs_active(&ip->i_res))
1607 gfs2_rs_deltree(&ip->i_res); 1611 gfs2_rs_deltree(&ip->i_res);
1608 1612
1609 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { 1613 if (ip->i_iopen_gh.gh_gl) {
1610 ip->i_iopen_gh.gh_flags |= GL_NOCACHE; 1614 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
1611 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1615 ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
1616 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1617 }
1618 gfs2_holder_uninit(&ip->i_iopen_gh);
1612 } 1619 }
1613 gfs2_holder_uninit(&ip->i_iopen_gh);
1614 gfs2_glock_dq_uninit(&gh); 1620 gfs2_glock_dq_uninit(&gh);
1615 if (error && error != GLR_TRYFAILED && error != -EROFS) 1621 if (error && error != GLR_TRYFAILED && error != -EROFS)
1616 fs_warn(sdp, "gfs2_evict_inode: %d\n", error); 1622 fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
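
Both gfs2_evict_inode() hunks above add the same guard: dequeue the iopen holder only if a glock is attached and HIF_HOLDER is actually set, and uninit only a holder that was initialized. The shape of the guard, condensed into userspace with illustrative booleans in place of gh_gl and gh_iflags:

#include <stdbool.h>
#include <stdio.h>

struct holder {
	bool initialized;	/* gh_gl != NULL     */
	bool held;		/* HIF_HOLDER is set */
};

static void put_holder(struct holder *gh)
{
	/* Mirror of the guarded teardown: dequeue only if actually
	 * held, uninit only if initialized, and never assume either. */
	if (!gh->initialized)
		return;
	if (gh->held) {
		puts("dequeue holder");
		gh->held = false;
	}
	puts("uninit holder");
	gh->initialized = false;
}

int main(void)
{
	struct holder gh = { .initialized = true, .held = false };

	put_holder(&gh);	/* uninit only; no bogus dequeue */
	put_holder(&gh);	/* second call is a safe no-op   */
	return 0;
}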
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 36345fefa3ff..517f2de784cf 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -131,14 +131,12 @@ static int journal_submit_commit_record(journal_t *journal,
131 if (is_journal_aborted(journal)) 131 if (is_journal_aborted(journal))
132 return 0; 132 return 0;
133 133
134 bh = jbd2_journal_get_descriptor_buffer(journal); 134 bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
135 JBD2_COMMIT_BLOCK);
135 if (!bh) 136 if (!bh)
136 return 1; 137 return 1;
137 138
138 tmp = (struct commit_header *)bh->b_data; 139 tmp = (struct commit_header *)bh->b_data;
139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
141 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
142 tmp->h_commit_sec = cpu_to_be64(now.tv_sec); 140 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
143 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); 141 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
144 142
@@ -222,7 +220,7 @@ static int journal_submit_data_buffers(journal_t *journal,
222 spin_lock(&journal->j_list_lock); 220 spin_lock(&journal->j_list_lock);
223 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 221 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
224 mapping = jinode->i_vfs_inode->i_mapping; 222 mapping = jinode->i_vfs_inode->i_mapping;
225 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 223 jinode->i_flags |= JI_COMMIT_RUNNING;
226 spin_unlock(&journal->j_list_lock); 224 spin_unlock(&journal->j_list_lock);
227 /* 225 /*
228 * submit the inode data buffers. We use writepage 226 * submit the inode data buffers. We use writepage
@@ -236,8 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal,
236 ret = err; 234 ret = err;
237 spin_lock(&journal->j_list_lock); 235 spin_lock(&journal->j_list_lock);
238 J_ASSERT(jinode->i_transaction == commit_transaction); 236 J_ASSERT(jinode->i_transaction == commit_transaction);
239 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 237 jinode->i_flags &= ~JI_COMMIT_RUNNING;
240 smp_mb__after_atomic(); 238 smp_mb();
241 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 239 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
242 } 240 }
243 spin_unlock(&journal->j_list_lock); 241 spin_unlock(&journal->j_list_lock);
@@ -258,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
258 /* For locking, see the comment in journal_submit_data_buffers() */ 256 /* For locking, see the comment in journal_submit_data_buffers() */
259 spin_lock(&journal->j_list_lock); 257 spin_lock(&journal->j_list_lock);
260 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 258 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
261 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 259 jinode->i_flags |= JI_COMMIT_RUNNING;
262 spin_unlock(&journal->j_list_lock); 260 spin_unlock(&journal->j_list_lock);
263 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); 261 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
264 if (err) { 262 if (err) {
@@ -274,8 +272,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
274 ret = err; 272 ret = err;
275 } 273 }
276 spin_lock(&journal->j_list_lock); 274 spin_lock(&journal->j_list_lock);
277 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 275 jinode->i_flags &= ~JI_COMMIT_RUNNING;
278 smp_mb__after_atomic(); 276 smp_mb();
279 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 277 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
280 } 278 }
281 279
@@ -319,22 +317,6 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
319 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 317 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
320} 318}
321 319
322static void jbd2_descr_block_csum_set(journal_t *j,
323 struct buffer_head *bh)
324{
325 struct jbd2_journal_block_tail *tail;
326 __u32 csum;
327
328 if (!jbd2_journal_has_csum_v2or3(j))
329 return;
330
331 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
332 sizeof(struct jbd2_journal_block_tail));
333 tail->t_checksum = 0;
334 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
335 tail->t_checksum = cpu_to_be32(csum);
336}
337
338static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, 320static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
339 struct buffer_head *bh, __u32 sequence) 321 struct buffer_head *bh, __u32 sequence)
340{ 322{
@@ -379,7 +361,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
379 ktime_t start_time; 361 ktime_t start_time;
380 u64 commit_time; 362 u64 commit_time;
381 char *tagp = NULL; 363 char *tagp = NULL;
382 journal_header_t *header;
383 journal_block_tag_t *tag = NULL; 364 journal_block_tag_t *tag = NULL;
384 int space_left = 0; 365 int space_left = 0;
385 int first_tag = 0; 366 int first_tag = 0;
@@ -554,8 +535,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
554 jbd2_journal_abort(journal, err); 535 jbd2_journal_abort(journal, err);
555 536
556 blk_start_plug(&plug); 537 blk_start_plug(&plug);
557 jbd2_journal_write_revoke_records(journal, commit_transaction, 538 jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
558 &log_bufs, WRITE_SYNC);
559 539
560 jbd_debug(3, "JBD2: commit phase 2b\n"); 540 jbd_debug(3, "JBD2: commit phase 2b\n");
561 541
@@ -616,7 +596,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 596
617 jbd_debug(4, "JBD2: get descriptor\n"); 597 jbd_debug(4, "JBD2: get descriptor\n");
618 598
619 descriptor = jbd2_journal_get_descriptor_buffer(journal); 599 descriptor = jbd2_journal_get_descriptor_buffer(
600 commit_transaction,
601 JBD2_DESCRIPTOR_BLOCK);
620 if (!descriptor) { 602 if (!descriptor) {
621 jbd2_journal_abort(journal, -EIO); 603 jbd2_journal_abort(journal, -EIO);
622 continue; 604 continue;
@@ -625,11 +607,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
625 jbd_debug(4, "JBD2: got buffer %llu (%p)\n", 607 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
626 (unsigned long long)descriptor->b_blocknr, 608 (unsigned long long)descriptor->b_blocknr,
627 descriptor->b_data); 609 descriptor->b_data);
628 header = (journal_header_t *)descriptor->b_data;
629 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
630 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
631 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
632
633 tagp = &descriptor->b_data[sizeof(journal_header_t)]; 610 tagp = &descriptor->b_data[sizeof(journal_header_t)];
634 space_left = descriptor->b_size - 611 space_left = descriptor->b_size -
635 sizeof(journal_header_t); 612 sizeof(journal_header_t);
@@ -721,7 +698,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
721 698
722 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); 699 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
723 700
724 jbd2_descr_block_csum_set(journal, descriptor); 701 jbd2_descriptor_block_csum_set(journal, descriptor);
725start_journal_io: 702start_journal_io:
726 for (i = 0; i < bufs; i++) { 703 for (i = 0; i < bufs; i++) {
727 struct buffer_head *bh = wbuf[i]; 704 struct buffer_head *bh = wbuf[i];
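
The commit-path hunks above depend on jbd2_journal_get_descriptor_buffer() now stamping the common journal header itself, so the commit, descriptor, and revoke writers stop open-coding the magic/blocktype/sequence triple. A userspace sketch of that factoring, with simplified host-endian fields (on disk they are big-endian):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define JMAGIC 0xc03b3998u	/* JBD2_MAGIC_NUMBER */

struct jhdr {
	uint32_t h_magic;
	uint32_t h_blocktype;	/* descriptor / commit / revoke */
	uint32_t h_sequence;	/* committing transaction's tid */
};

static void *get_descriptor_buffer(uint32_t tid, uint32_t type, size_t bsz)
{
	void *buf = calloc(1, bsz);
	struct jhdr *h = buf;

	if (!buf)
		return NULL;
	/* One place stamps the header for every descriptor type. */
	h->h_magic = JMAGIC;
	h->h_blocktype = type;
	h->h_sequence = tid;
	return buf;
}

int main(void)
{
	struct jhdr *d = get_descriptor_buffer(7, 1 /* descriptor */, 4096);

	if (!d)
		return 1;
	printf("magic=%#x type=%u tid=%u\n",
	       d->h_magic, d->h_blocktype, d->h_sequence);
	free(d);
	return 0;
}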
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 81e622681c82..de73a9516a54 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -805,10 +805,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
805 * But we don't bother doing that, so there will be coherency problems with 805 * But we don't bother doing that, so there will be coherency problems with
806 * mmaps of blockdevs which hold live JBD-controlled filesystems. 806 * mmaps of blockdevs which hold live JBD-controlled filesystems.
807 */ 807 */
808struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) 808struct buffer_head *
809jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
809{ 810{
811 journal_t *journal = transaction->t_journal;
810 struct buffer_head *bh; 812 struct buffer_head *bh;
811 unsigned long long blocknr; 813 unsigned long long blocknr;
814 journal_header_t *header;
812 int err; 815 int err;
813 816
814 err = jbd2_journal_next_log_block(journal, &blocknr); 817 err = jbd2_journal_next_log_block(journal, &blocknr);
@@ -821,12 +824,31 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
821 return NULL; 824 return NULL;
822 lock_buffer(bh); 825 lock_buffer(bh);
823 memset(bh->b_data, 0, journal->j_blocksize); 826 memset(bh->b_data, 0, journal->j_blocksize);
827 header = (journal_header_t *)bh->b_data;
828 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
829 header->h_blocktype = cpu_to_be32(type);
830 header->h_sequence = cpu_to_be32(transaction->t_tid);
824 set_buffer_uptodate(bh); 831 set_buffer_uptodate(bh);
825 unlock_buffer(bh); 832 unlock_buffer(bh);
826 BUFFER_TRACE(bh, "return this buffer"); 833 BUFFER_TRACE(bh, "return this buffer");
827 return bh; 834 return bh;
828} 835}
829 836
837void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
838{
839 struct jbd2_journal_block_tail *tail;
840 __u32 csum;
841
842 if (!jbd2_journal_has_csum_v2or3(j))
843 return;
844
845 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
846 sizeof(struct jbd2_journal_block_tail));
847 tail->t_checksum = 0;
848 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
849 tail->t_checksum = cpu_to_be32(csum);
850}
851
830/* 852/*
831 * Return tid of the oldest transaction in the journal and block in the journal 853 * Return tid of the oldest transaction in the journal and block in the journal
832 * where the transaction starts. 854 * where the transaction starts.
@@ -1408,11 +1430,12 @@ out:
1408/** 1430/**
1409 * jbd2_mark_journal_empty() - Mark on disk journal as empty. 1431 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
1410 * @journal: The journal to update. 1432 * @journal: The journal to update.
1433 * @write_op: With which operation should we write the journal sb
1411 * 1434 *
1412 * Update a journal's dynamic superblock fields to show that journal is empty. 1435 * Update a journal's dynamic superblock fields to show that journal is empty.
1413 * Write updated superblock to disk waiting for IO to complete. 1436 * Write updated superblock to disk waiting for IO to complete.
1414 */ 1437 */
1415static void jbd2_mark_journal_empty(journal_t *journal) 1438static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
1416{ 1439{
1417 journal_superblock_t *sb = journal->j_superblock; 1440 journal_superblock_t *sb = journal->j_superblock;
1418 1441
@@ -1430,7 +1453,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
1430 sb->s_start = cpu_to_be32(0); 1453 sb->s_start = cpu_to_be32(0);
1431 read_unlock(&journal->j_state_lock); 1454 read_unlock(&journal->j_state_lock);
1432 1455
1433 jbd2_write_superblock(journal, WRITE_FUA); 1456 jbd2_write_superblock(journal, write_op);
1434 1457
1435 /* Log is no longer empty */ 1458 /* Log is no longer empty */
1436 write_lock(&journal->j_state_lock); 1459 write_lock(&journal->j_state_lock);
@@ -1716,7 +1739,13 @@ int jbd2_journal_destroy(journal_t *journal)
1716 if (journal->j_sb_buffer) { 1739 if (journal->j_sb_buffer) {
1717 if (!is_journal_aborted(journal)) { 1740 if (!is_journal_aborted(journal)) {
1718 mutex_lock(&journal->j_checkpoint_mutex); 1741 mutex_lock(&journal->j_checkpoint_mutex);
1719 jbd2_mark_journal_empty(journal); 1742
1743 write_lock(&journal->j_state_lock);
1744 journal->j_tail_sequence =
1745 ++journal->j_transaction_sequence;
1746 write_unlock(&journal->j_state_lock);
1747
1748 jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
1720 mutex_unlock(&journal->j_checkpoint_mutex); 1749 mutex_unlock(&journal->j_checkpoint_mutex);
1721 } else 1750 } else
1722 err = -EIO; 1751 err = -EIO;
@@ -1975,7 +2004,7 @@ int jbd2_journal_flush(journal_t *journal)
1975 * the magic code for a fully-recovered superblock. Any future 2004 * the magic code for a fully-recovered superblock. Any future
1976 * commits of data to the journal will restore the current 2005 * commits of data to the journal will restore the current
1977 * s_start value. */ 2006 * s_start value. */
1978 jbd2_mark_journal_empty(journal); 2007 jbd2_mark_journal_empty(journal, WRITE_FUA);
1979 mutex_unlock(&journal->j_checkpoint_mutex); 2008 mutex_unlock(&journal->j_checkpoint_mutex);
1980 write_lock(&journal->j_state_lock); 2009 write_lock(&journal->j_state_lock);
1981 J_ASSERT(!journal->j_running_transaction); 2010 J_ASSERT(!journal->j_running_transaction);
@@ -2021,7 +2050,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
2021 if (write) { 2050 if (write) {
2022 /* Lock to make assertions happy... */ 2051 /* Lock to make assertions happy... */
2023 mutex_lock(&journal->j_checkpoint_mutex); 2052 mutex_lock(&journal->j_checkpoint_mutex);
2024 jbd2_mark_journal_empty(journal); 2053 jbd2_mark_journal_empty(journal, WRITE_FUA);
2025 mutex_unlock(&journal->j_checkpoint_mutex); 2054 mutex_unlock(&journal->j_checkpoint_mutex);
2026 } 2055 }
2027 2056
@@ -2565,7 +2594,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
2565restart: 2594restart:
2566 spin_lock(&journal->j_list_lock); 2595 spin_lock(&journal->j_list_lock);
2567 /* Is commit writing out inode - we have to wait */ 2596 /* Is commit writing out inode - we have to wait */
2568 if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) { 2597 if (jinode->i_flags & JI_COMMIT_RUNNING) {
2569 wait_queue_head_t *wq; 2598 wait_queue_head_t *wq;
2570 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); 2599 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2571 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); 2600 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
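
The new jbd2_descriptor_block_csum_set() above and its verify counterpart in recovery.c share one convention: the checksum sits in a tail at the end of the block and is always computed with that field zeroed. A sketch of the set/verify pair, with a toy checksum standing in for jbd2_chksum():

#include <stdint.h>
#include <stdio.h>

#define BLKSZ 64

struct block_tail { uint32_t t_checksum; };

static uint32_t toy_csum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len--)
		sum = sum * 31 + *buf++;
	return sum;
}

static void csum_set(uint8_t *blk)
{
	struct block_tail *tail =
		(struct block_tail *)(blk + BLKSZ - sizeof(*tail));

	tail->t_checksum = 0;		/* checksum field excluded */
	tail->t_checksum = toy_csum(blk, BLKSZ);
}

static int csum_verify(uint8_t *blk)
{
	struct block_tail *tail =
		(struct block_tail *)(blk + BLKSZ - sizeof(*tail));
	uint32_t provided = tail->t_checksum, calculated;

	tail->t_checksum = 0;
	calculated = toy_csum(blk, BLKSZ);
	tail->t_checksum = provided;	/* restore for the caller */
	return provided == calculated;
}

int main(void)
{
	uint8_t blk[BLKSZ] = "descriptor payload";

	csum_set(blk);
	printf("intact : %d\n", csum_verify(blk));	/* 1 */
	blk[3] ^= 0xff;					/* corrupt a byte */
	printf("corrupt: %d\n", csum_verify(blk));	/* 0 */
	return 0;
}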
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 7f277e49fe88..08a456b96e4e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -174,8 +174,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
174 return 0; 174 return 0;
175} 175}
176 176
177static int jbd2_descr_block_csum_verify(journal_t *j, 177static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
178 void *buf)
179{ 178{
180 struct jbd2_journal_block_tail *tail; 179 struct jbd2_journal_block_tail *tail;
181 __be32 provided; 180 __be32 provided;
@@ -522,8 +521,8 @@ static int do_one_pass(journal_t *journal,
522 descr_csum_size = 521 descr_csum_size =
523 sizeof(struct jbd2_journal_block_tail); 522 sizeof(struct jbd2_journal_block_tail);
524 if (descr_csum_size > 0 && 523 if (descr_csum_size > 0 &&
525 !jbd2_descr_block_csum_verify(journal, 524 !jbd2_descriptor_block_csum_verify(journal,
526 bh->b_data)) { 525 bh->b_data)) {
527 printk(KERN_ERR "JBD2: Invalid checksum " 526 printk(KERN_ERR "JBD2: Invalid checksum "
528 "recovering block %lu in log\n", 527 "recovering block %lu in log\n",
529 next_log_block); 528 next_log_block);
@@ -811,26 +810,6 @@ static int do_one_pass(journal_t *journal,
811 return err; 810 return err;
812} 811}
813 812
814static int jbd2_revoke_block_csum_verify(journal_t *j,
815 void *buf)
816{
817 struct jbd2_journal_revoke_tail *tail;
818 __be32 provided;
819 __u32 calculated;
820
821 if (!jbd2_journal_has_csum_v2or3(j))
822 return 1;
823
824 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
825 sizeof(struct jbd2_journal_revoke_tail));
826 provided = tail->r_checksum;
827 tail->r_checksum = 0;
828 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
829 tail->r_checksum = provided;
830
831 return provided == cpu_to_be32(calculated);
832}
833
834/* Scan a revoke record, marking all blocks mentioned as revoked. */ 813/* Scan a revoke record, marking all blocks mentioned as revoked. */
835 814
836static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 815static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
@@ -846,11 +825,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
846 offset = sizeof(jbd2_journal_revoke_header_t); 825 offset = sizeof(jbd2_journal_revoke_header_t);
847 rcount = be32_to_cpu(header->r_count); 826 rcount = be32_to_cpu(header->r_count);
848 827
849 if (!jbd2_revoke_block_csum_verify(journal, header)) 828 if (!jbd2_descriptor_block_csum_verify(journal, header))
850 return -EFSBADCRC; 829 return -EFSBADCRC;
851 830
852 if (jbd2_journal_has_csum_v2or3(journal)) 831 if (jbd2_journal_has_csum_v2or3(journal))
853 csum_size = sizeof(struct jbd2_journal_revoke_tail); 832 csum_size = sizeof(struct jbd2_journal_block_tail);
854 if (rcount > journal->j_blocksize - csum_size) 833 if (rcount > journal->j_blocksize - csum_size)
855 return -EINVAL; 834 return -EINVAL;
856 max = rcount; 835 max = rcount;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 705ae577882b..91171dc352cb 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,11 +122,11 @@ struct jbd2_revoke_table_s
122 122
123 123
124#ifdef __KERNEL__ 124#ifdef __KERNEL__
125static void write_one_revoke_record(journal_t *, transaction_t *, 125static void write_one_revoke_record(transaction_t *,
126 struct list_head *, 126 struct list_head *,
127 struct buffer_head **, int *, 127 struct buffer_head **, int *,
128 struct jbd2_revoke_record_s *, int); 128 struct jbd2_revoke_record_s *);
129static void flush_descriptor(journal_t *, struct buffer_head *, int, int); 129static void flush_descriptor(journal_t *, struct buffer_head *, int);
130#endif 130#endif
131 131
132/* Utility functions to maintain the revoke table */ 132/* Utility functions to maintain the revoke table */
@@ -519,11 +519,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
519 * Write revoke records to the journal for all entries in the current 519 * Write revoke records to the journal for all entries in the current
520 * revoke hash, deleting the entries as we go. 520 * revoke hash, deleting the entries as we go.
521 */ 521 */
522void jbd2_journal_write_revoke_records(journal_t *journal, 522void jbd2_journal_write_revoke_records(transaction_t *transaction,
523 transaction_t *transaction, 523 struct list_head *log_bufs)
524 struct list_head *log_bufs,
525 int write_op)
526{ 524{
525 journal_t *journal = transaction->t_journal;
527 struct buffer_head *descriptor; 526 struct buffer_head *descriptor;
528 struct jbd2_revoke_record_s *record; 527 struct jbd2_revoke_record_s *record;
529 struct jbd2_revoke_table_s *revoke; 528 struct jbd2_revoke_table_s *revoke;
@@ -544,16 +543,15 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
544 while (!list_empty(hash_list)) { 543 while (!list_empty(hash_list)) {
545 record = (struct jbd2_revoke_record_s *) 544 record = (struct jbd2_revoke_record_s *)
546 hash_list->next; 545 hash_list->next;
547 write_one_revoke_record(journal, transaction, log_bufs, 546 write_one_revoke_record(transaction, log_bufs,
548 &descriptor, &offset, 547 &descriptor, &offset, record);
549 record, write_op);
550 count++; 548 count++;
551 list_del(&record->hash); 549 list_del(&record->hash);
552 kmem_cache_free(jbd2_revoke_record_cache, record); 550 kmem_cache_free(jbd2_revoke_record_cache, record);
553 } 551 }
554 } 552 }
555 if (descriptor) 553 if (descriptor)
556 flush_descriptor(journal, descriptor, offset, write_op); 554 flush_descriptor(journal, descriptor, offset);
557 jbd_debug(1, "Wrote %d revoke records\n", count); 555 jbd_debug(1, "Wrote %d revoke records\n", count);
558} 556}
559 557
@@ -562,18 +560,16 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
562 * block if the old one is full or if we have not already created one. 560 * block if the old one is full or if we have not already created one.
563 */ 561 */
564 562
565static void write_one_revoke_record(journal_t *journal, 563static void write_one_revoke_record(transaction_t *transaction,
566 transaction_t *transaction,
567 struct list_head *log_bufs, 564 struct list_head *log_bufs,
568 struct buffer_head **descriptorp, 565 struct buffer_head **descriptorp,
569 int *offsetp, 566 int *offsetp,
570 struct jbd2_revoke_record_s *record, 567 struct jbd2_revoke_record_s *record)
571 int write_op)
572{ 568{
569 journal_t *journal = transaction->t_journal;
573 int csum_size = 0; 570 int csum_size = 0;
574 struct buffer_head *descriptor; 571 struct buffer_head *descriptor;
575 int sz, offset; 572 int sz, offset;
576 journal_header_t *header;
577 573
578 /* If we are already aborting, this all becomes a noop. We 574 /* If we are already aborting, this all becomes a noop. We
579 still need to go round the loop in 575 still need to go round the loop in
@@ -587,7 +583,7 @@ static void write_one_revoke_record(journal_t *journal,
587 583
588 /* Do we need to leave space at the end for a checksum? */ 584 /* Do we need to leave space at the end for a checksum? */
589 if (jbd2_journal_has_csum_v2or3(journal)) 585 if (jbd2_journal_has_csum_v2or3(journal))
590 csum_size = sizeof(struct jbd2_journal_revoke_tail); 586 csum_size = sizeof(struct jbd2_journal_block_tail);
591 587
592 if (jbd2_has_feature_64bit(journal)) 588 if (jbd2_has_feature_64bit(journal))
593 sz = 8; 589 sz = 8;
@@ -597,19 +593,16 @@ static void write_one_revoke_record(journal_t *journal,
597 /* Make sure we have a descriptor with space left for the record */ 593 /* Make sure we have a descriptor with space left for the record */
598 if (descriptor) { 594 if (descriptor) {
599 if (offset + sz > journal->j_blocksize - csum_size) { 595 if (offset + sz > journal->j_blocksize - csum_size) {
600 flush_descriptor(journal, descriptor, offset, write_op); 596 flush_descriptor(journal, descriptor, offset);
601 descriptor = NULL; 597 descriptor = NULL;
602 } 598 }
603 } 599 }
604 600
605 if (!descriptor) { 601 if (!descriptor) {
606 descriptor = jbd2_journal_get_descriptor_buffer(journal); 602 descriptor = jbd2_journal_get_descriptor_buffer(transaction,
603 JBD2_REVOKE_BLOCK);
607 if (!descriptor) 604 if (!descriptor)
608 return; 605 return;
609 header = (journal_header_t *)descriptor->b_data;
610 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
611 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
612 header->h_sequence = cpu_to_be32(transaction->t_tid);
613 606
614 /* Record it so that we can wait for IO completion later */ 607 /* Record it so that we can wait for IO completion later */
615 BUFFER_TRACE(descriptor, "file in log_bufs"); 608 BUFFER_TRACE(descriptor, "file in log_bufs");
@@ -630,21 +623,6 @@ static void write_one_revoke_record(journal_t *journal,
630 *offsetp = offset; 623 *offsetp = offset;
631} 624}
632 625
633static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
634{
635 struct jbd2_journal_revoke_tail *tail;
636 __u32 csum;
637
638 if (!jbd2_journal_has_csum_v2or3(j))
639 return;
640
641 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
642 sizeof(struct jbd2_journal_revoke_tail));
643 tail->r_checksum = 0;
644 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
645 tail->r_checksum = cpu_to_be32(csum);
646}
647
648/* 626/*
649 * Flush a revoke descriptor out to the journal. If we are aborting, 627 * Flush a revoke descriptor out to the journal. If we are aborting,
650 * this is a noop; otherwise we are generating a buffer which needs to 628 * this is a noop; otherwise we are generating a buffer which needs to
@@ -654,7 +632,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
654 632
655static void flush_descriptor(journal_t *journal, 633static void flush_descriptor(journal_t *journal,
656 struct buffer_head *descriptor, 634 struct buffer_head *descriptor,
657 int offset, int write_op) 635 int offset)
658{ 636{
659 jbd2_journal_revoke_header_t *header; 637 jbd2_journal_revoke_header_t *header;
660 638
@@ -665,12 +643,12 @@ static void flush_descriptor(journal_t *journal,
665 643
666 header = (jbd2_journal_revoke_header_t *)descriptor->b_data; 644 header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
667 header->r_count = cpu_to_be32(offset); 645 header->r_count = cpu_to_be32(offset);
668 jbd2_revoke_csum_set(journal, descriptor); 646 jbd2_descriptor_block_csum_set(journal, descriptor);
669 647
670 set_buffer_jwrite(descriptor); 648 set_buffer_jwrite(descriptor);
671 BUFFER_TRACE(descriptor, "write"); 649 BUFFER_TRACE(descriptor, "write");
672 set_buffer_dirty(descriptor); 650 set_buffer_dirty(descriptor);
673 write_dirty_buffer(descriptor, write_op); 651 write_dirty_buffer(descriptor, WRITE_SYNC);
674} 652}
675#endif 653#endif
676 654
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 081dff087fc0..01e4652d88f6 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -966,14 +966,8 @@ repeat:
966 if (!frozen_buffer) { 966 if (!frozen_buffer) {
967 JBUFFER_TRACE(jh, "allocate memory for buffer"); 967 JBUFFER_TRACE(jh, "allocate memory for buffer");
968 jbd_unlock_bh_state(bh); 968 jbd_unlock_bh_state(bh);
969 frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); 969 frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
970 if (!frozen_buffer) { 970 GFP_NOFS | __GFP_NOFAIL);
971 printk(KERN_ERR "%s: OOM for frozen_buffer\n",
972 __func__);
973 JBUFFER_TRACE(jh, "oom!");
974 error = -ENOMEM;
975 goto out;
976 }
977 goto repeat; 971 goto repeat;
978 } 972 }
979 jh->b_frozen_data = frozen_buffer; 973 jh->b_frozen_data = frozen_buffer;
@@ -1226,15 +1220,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
1226 goto out; 1220 goto out;
1227 1221
1228repeat: 1222repeat:
1229 if (!jh->b_committed_data) { 1223 if (!jh->b_committed_data)
1230 committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); 1224 committed_data = jbd2_alloc(jh2bh(jh)->b_size,
1231 if (!committed_data) { 1225 GFP_NOFS|__GFP_NOFAIL);
1232 printk(KERN_ERR "%s: No memory for committed data\n",
1233 __func__);
1234 err = -ENOMEM;
1235 goto out;
1236 }
1237 }
1238 1226
1239 jbd_lock_bh_state(bh); 1227 jbd_lock_bh_state(bh);
1240 if (!jh->b_committed_data) { 1228 if (!jh->b_committed_data) {
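
The transaction.c hunks delete the OOM error paths outright: once jbd2 has committed to freezing or copying a buffer there is no reasonable way to back out, so the allocations become GFP_NOFS | __GFP_NOFAIL and the page allocator retries internally rather than returning NULL. A userspace analogue of that contract:

/* When a caller has no sane error path, keep retrying the allocation
 * rather than propagating NULL. In the kernel the page allocator does
 * the looping (and kicks reclaim); this wrapper just models the
 * __GFP_NOFAIL contract. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void *alloc_nofail(size_t size)
{
	void *p;

	while (!(p = malloc(size))) {
		fprintf(stderr, "allocation of %zu bytes failed, retrying\n",
			size);
		usleep(1000);
	}
	return p;
}

int main(void)
{
	char *frozen = alloc_nofail(4096);	/* never returns NULL */

	frozen[0] = 0;
	free(frozen);
	return 0;
}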
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 95d5880a63ee..7e553f286775 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -134,37 +134,59 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
134 if (mutex_lock_interruptible(&c->alloc_sem)) 134 if (mutex_lock_interruptible(&c->alloc_sem))
135 return -EINTR; 135 return -EINTR;
136 136
137
137 for (;;) { 138 for (;;) {
139 /* We can't start doing GC until we've finished checking
140 the node CRCs etc. */
141 int bucket, want_ino;
142
138 spin_lock(&c->erase_completion_lock); 143 spin_lock(&c->erase_completion_lock);
139 if (!c->unchecked_size) 144 if (!c->unchecked_size)
140 break; 145 break;
141
142 /* We can't start doing GC yet. We haven't finished checking
143 the node CRCs etc. Do it now. */
144
145 /* checked_ino is protected by the alloc_sem */
146 if (c->checked_ino > c->highest_ino && xattr) {
147 pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
148 c->unchecked_size);
149 jffs2_dbg_dump_block_lists_nolock(c);
150 spin_unlock(&c->erase_completion_lock);
151 mutex_unlock(&c->alloc_sem);
152 return -ENOSPC;
153 }
154
155 spin_unlock(&c->erase_completion_lock); 146 spin_unlock(&c->erase_completion_lock);
156 147
157 if (!xattr) 148 if (!xattr)
158 xattr = jffs2_verify_xattr(c); 149 xattr = jffs2_verify_xattr(c);
159 150
160 spin_lock(&c->inocache_lock); 151 spin_lock(&c->inocache_lock);
152 /* Instead of doing the inodes in numeric order, doing a lookup
153 * in the hash for each possible number, just walk the hash
154 * buckets of *existing* inodes. This means that we process
155 * them out-of-order, but it can be a lot faster if there's
156 * a sparse inode# space. Which there often is. */
157 want_ino = c->check_ino;
158 for (bucket = c->check_ino % c->inocache_hashsize ; bucket < c->inocache_hashsize; bucket++) {
159 for (ic = c->inocache_list[bucket]; ic; ic = ic->next) {
160 if (ic->ino < want_ino)
161 continue;
162
163 if (ic->state != INO_STATE_CHECKEDABSENT &&
164 ic->state != INO_STATE_PRESENT)
165 goto got_next; /* with inocache_lock held */
166
167 jffs2_dbg(1, "Skipping ino #%u already checked\n",
168 ic->ino);
169 }
170 want_ino = 0;
171 }
161 172
162 ic = jffs2_get_ino_cache(c, c->checked_ino++); 173 /* Point c->check_ino past the end of the last bucket. */
174 c->check_ino = ((c->highest_ino + c->inocache_hashsize + 1) &
175 ~c->inocache_hashsize) - 1;
163 176
164 if (!ic) { 177 spin_unlock(&c->inocache_lock);
165 spin_unlock(&c->inocache_lock); 178
166 continue; 179 pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
167 } 180 c->unchecked_size);
181 jffs2_dbg_dump_block_lists_nolock(c);
182 mutex_unlock(&c->alloc_sem);
183 return -ENOSPC;
184
185 got_next:
186 /* For next time round the loop, we want c->checked_ino to indicate
187 * the *next* one we want to check. And since we're walking the
188 * buckets rather than doing it sequentially, it's: */
189 c->check_ino = ic->ino + c->inocache_hashsize;
168 190
169 if (!ic->pino_nlink) { 191 if (!ic->pino_nlink) {
170 jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n", 192 jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n",
@@ -176,8 +198,6 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
176 switch(ic->state) { 198 switch(ic->state) {
177 case INO_STATE_CHECKEDABSENT: 199 case INO_STATE_CHECKEDABSENT:
178 case INO_STATE_PRESENT: 200 case INO_STATE_PRESENT:
179 jffs2_dbg(1, "Skipping ino #%u already checked\n",
180 ic->ino);
181 spin_unlock(&c->inocache_lock); 201 spin_unlock(&c->inocache_lock);
182 continue; 202 continue;
183 203
@@ -196,7 +216,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
196 ic->ino); 216 ic->ino);
197 /* We need to come back again for the _same_ inode. We've 217 /* We need to come back again for the _same_ inode. We've
198 made no progress in this case, but that should be OK */ 218 made no progress in this case, but that should be OK */
199 c->checked_ino--; 219 c->check_ino = ic->ino;
200 220
201 mutex_unlock(&c->alloc_sem); 221 mutex_unlock(&c->alloc_sem);
202 sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); 222 sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
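
The garbage-collect pass above stops probing every inode number from check_ino upward and instead walks the inocache hash buckets, visiting only inodes that exist; because each bucket is kept sorted and holds numbers congruent modulo the hash size, check_ino = ic->ino + hashsize records exactly where to resume. A small sketch of the walk (bucket layout and names are illustrative):

#include <stdio.h>

#define HASHSZ 4

struct ic { unsigned ino; int checked; struct ic *next; };

/* Buckets are kept sorted by ino, as jffs2_add_ino_cache() does. */
static struct ic n9 = { 9, 0, NULL };
static struct ic n5 = { 5, 0, &n9 };		/* bucket 1: 5 -> 9 */
static struct ic n2 = { 2, 0, NULL };		/* bucket 2: 2      */
static struct ic *buckets[HASHSZ] = { NULL, &n5, &n2, NULL };

static struct ic *next_unchecked(unsigned *check_ino)
{
	unsigned want = *check_ino;

	for (unsigned b = want % HASHSZ; b < HASHSZ; b++) {
		for (struct ic *ic = buckets[b]; ic; ic = ic->next) {
			if (ic->ino < want || ic->checked)
				continue;
			/* All inos in a bucket are congruent mod HASHSZ,
			 * so ino + HASHSZ names this slot's successor. */
			*check_ino = ic->ino + HASHSZ;
			return ic;
		}
		want = 0;	/* past the resume bucket: take anything */
	}
	return NULL;		/* every existing inode has been checked */
}

int main(void)
{
	unsigned check_ino = 0;
	struct ic *ic;

	while ((ic = next_unchecked(&check_ino))) {
		printf("checking ino #%u\n", ic->ino);	/* 5, 9, 2 */
		ic->checked = 1;
	}
	return 0;
}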
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 046fee8b6e9b..778275f48a87 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -49,7 +49,7 @@ struct jffs2_sb_info {
49 struct mtd_info *mtd; 49 struct mtd_info *mtd;
50 50
51 uint32_t highest_ino; 51 uint32_t highest_ino;
52 uint32_t checked_ino; 52 uint32_t check_ino; /* *NEXT* inode to be checked */
53 53
54 unsigned int flags; 54 unsigned int flags;
55 55
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index b6bd4affd9ad..cda0774c2c9c 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -846,8 +846,8 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
846 return 1; 846 return 1;
847 847
848 if (c->unchecked_size) { 848 if (c->unchecked_size) {
849 jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", 849 jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, check_ino #%d\n",
850 c->unchecked_size, c->checked_ino); 850 c->unchecked_size, c->check_ino);
851 return 1; 851 return 1;
852 } 852 }
853 853
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5a3da3f52908..b25d28a21212 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1183,22 +1183,20 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c)
1183 1183
1184int jffs2_nand_flash_setup(struct jffs2_sb_info *c) 1184int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
1185{ 1185{
1186 struct nand_ecclayout *oinfo = c->mtd->ecclayout;
1187
1188 if (!c->mtd->oobsize) 1186 if (!c->mtd->oobsize)
1189 return 0; 1187 return 0;
1190 1188
1191 /* Cleanmarker is out-of-band, so inline size zero */ 1189 /* Cleanmarker is out-of-band, so inline size zero */
1192 c->cleanmarker_size = 0; 1190 c->cleanmarker_size = 0;
1193 1191
1194 if (!oinfo || oinfo->oobavail == 0) { 1192 if (c->mtd->oobavail == 0) {
1195 pr_err("inconsistent device description\n"); 1193 pr_err("inconsistent device description\n");
1196 return -EINVAL; 1194 return -EINVAL;
1197 } 1195 }
1198 1196
1199 jffs2_dbg(1, "using OOB on NAND\n"); 1197 jffs2_dbg(1, "using OOB on NAND\n");
1200 1198
1201 c->oobavail = oinfo->oobavail; 1199 c->oobavail = c->mtd->oobavail;
1202 1200
1203 /* Initialise write buffer */ 1201 /* Initialise write buffer */
1204 init_rwsem(&c->wbuf_sem); 1202 init_rwsem(&c->wbuf_sem);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 996b7742c90b..03b688d19f69 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -44,28 +44,122 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
44 return strlcpy(buf, kn->parent ? kn->name : "/", buflen); 44 return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
45} 45}
46 46
47static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf, 47/* kernfs_node_depth - compute depth from @from to @to */
48 size_t buflen) 48static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
49{ 49{
50 char *p = buf + buflen; 50 size_t depth = 0;
51 int len;
52 51
53 *--p = '\0'; 52 while (to->parent && to != from) {
53 depth++;
54 to = to->parent;
55 }
56 return depth;
57}
54 58
55 do { 59static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
56 len = strlen(kn->name); 60 struct kernfs_node *b)
57 if (p - buf < len + 1) { 61{
58 buf[0] = '\0'; 62 size_t da, db;
59 p = NULL; 63 struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b);
60 break; 64
61 } 65 if (ra != rb)
62 p -= len; 66 return NULL;
63 memcpy(p, kn->name, len); 67
64 *--p = '/'; 68 da = kernfs_depth(ra->kn, a);
65 kn = kn->parent; 69 db = kernfs_depth(rb->kn, b);
66 } while (kn && kn->parent); 70
71 while (da > db) {
72 a = a->parent;
73 da--;
74 }
75 while (db > da) {
76 b = b->parent;
77 db--;
78 }
79
80 /* worst case b and a will be the same at root */
81 while (b != a) {
82 b = b->parent;
83 a = a->parent;
84 }
85
86 return a;
87}
88
89/**
90 * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to,
91 * where kn_from is treated as root of the path.
92 * @kn_from: kernfs node which should be treated as root for the path
93 * @kn_to: kernfs node to which path is needed
94 * @buf: buffer to copy the path into
95 * @buflen: size of @buf
96 *
97 * We need to handle couple of scenarios here:
98 * [1] when @kn_from is an ancestor of @kn_to at some level
99 * kn_from: /n1/n2/n3
100 * kn_to: /n1/n2/n3/n4/n5
101 * result: /n4/n5
102 *
103 * [2] when @kn_from is on a different hierarchy and we need to find common
104 * ancestor between @kn_from and @kn_to.
105 * kn_from: /n1/n2/n3/n4
106 * kn_to: /n1/n2/n5
107 * result: /../../n5
108 * OR
109 * kn_from: /n1/n2/n3/n4/n5 [depth=5]
110 * kn_to: /n1/n2/n3 [depth=3]
111 * result: /../..
112 *
113 * return value: length of the string. If greater than buflen,
114 * then contents of buf are undefined. On error, -1 is returned.
115 */
116static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
117 struct kernfs_node *kn_from,
118 char *buf, size_t buflen)
119{
120 struct kernfs_node *kn, *common;
121 const char parent_str[] = "/..";
122 size_t depth_from, depth_to, len = 0, nlen = 0;
123 char *p;
124 int i;
125
126 if (!kn_from)
127 kn_from = kernfs_root(kn_to)->kn;
128
129 if (kn_from == kn_to)
130 return strlcpy(buf, "/", buflen);
131
132 common = kernfs_common_ancestor(kn_from, kn_to);
133 if (WARN_ON(!common))
134 return -1;
135
136 depth_to = kernfs_depth(common, kn_to);
137 depth_from = kernfs_depth(common, kn_from);
138
139 if (buf)
140 buf[0] = '\0';
141
142 for (i = 0; i < depth_from; i++)
143 len += strlcpy(buf + len, parent_str,
144 len < buflen ? buflen - len : 0);
145
146 /* Calculate how many bytes we need for the rest */
147 for (kn = kn_to; kn != common; kn = kn->parent)
148 nlen += strlen(kn->name) + 1;
149
150 if (len + nlen >= buflen)
151 return len + nlen;
152
153 p = buf + len + nlen;
154 *p = '\0';
155 for (kn = kn_to; kn != common; kn = kn->parent) {
156 nlen = strlen(kn->name);
157 p -= nlen;
158 memcpy(p, kn->name, nlen);
159 *(--p) = '/';
160 }
67 161
68 return p; 162 return len + nlen;
69} 163}
70 164
71/** 165/**
@@ -115,6 +209,34 @@ size_t kernfs_path_len(struct kernfs_node *kn)
115} 209}
116 210
117/** 211/**
212 * kernfs_path_from_node - build path of node @to relative to @from.
213 * @from: parent kernfs_node relative to which we need to build the path
214 * @to: kernfs_node of interest
215 * @buf: buffer to copy @to's path into
216 * @buflen: size of @buf
217 *
218 * Builds @to's path relative to @from in @buf. @from and @to must
219 * be on the same kernfs-root. If @from is not parent of @to, then a relative
220 * path (which includes '..'s) as needed to reach from @from to @to is
221 * returned.
222 *
223 * If @buf isn't long enough, the return value will be greater than @buflen
224 * and @buf contents are undefined.
225 */
226int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
227 char *buf, size_t buflen)
228{
229 unsigned long flags;
230 int ret;
231
232 spin_lock_irqsave(&kernfs_rename_lock, flags);
233 ret = kernfs_path_from_node_locked(to, from, buf, buflen);
234 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
235 return ret;
236}
237EXPORT_SYMBOL_GPL(kernfs_path_from_node);
238
239/**
118 * kernfs_path - build full path of a given node 240 * kernfs_path - build full path of a given node
119 * @kn: kernfs_node of interest 241 * @kn: kernfs_node of interest
120 * @buf: buffer to copy @kn's name into 242 * @buf: buffer to copy @kn's name into
@@ -127,13 +249,12 @@ size_t kernfs_path_len(struct kernfs_node *kn)
127 */ 249 */
128char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) 250char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
129{ 251{
130 unsigned long flags; 252 int ret;
131 char *p;
132 253
133 spin_lock_irqsave(&kernfs_rename_lock, flags); 254 ret = kernfs_path_from_node(kn, NULL, buf, buflen);
134 p = kernfs_path_locked(kn, buf, buflen); 255 if (ret < 0 || ret >= buflen)
135 spin_unlock_irqrestore(&kernfs_rename_lock, flags); 256 return NULL;
136 return p; 257 return buf;
137} 258}
138EXPORT_SYMBOL_GPL(kernfs_path); 259EXPORT_SYMBOL_GPL(kernfs_path);
139 260
@@ -164,17 +285,25 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
164void pr_cont_kernfs_path(struct kernfs_node *kn) 285void pr_cont_kernfs_path(struct kernfs_node *kn)
165{ 286{
166 unsigned long flags; 287 unsigned long flags;
167 char *p; 288 int sz;
168 289
169 spin_lock_irqsave(&kernfs_rename_lock, flags); 290 spin_lock_irqsave(&kernfs_rename_lock, flags);
170 291
171 p = kernfs_path_locked(kn, kernfs_pr_cont_buf, 292 sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
172 sizeof(kernfs_pr_cont_buf)); 293 sizeof(kernfs_pr_cont_buf));
173 if (p) 294 if (sz < 0) {
174 pr_cont("%s", p); 295 pr_cont("(error)");
175 else 296 goto out;
176 pr_cont("<name too long>"); 297 }
298
299 if (sz >= sizeof(kernfs_pr_cont_buf)) {
300 pr_cont("(name too long)");
301 goto out;
302 }
177 303
304 pr_cont("%s", kernfs_pr_cont_buf);
305
306out:
178 spin_unlock_irqrestore(&kernfs_rename_lock, flags); 307 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
179} 308}
180 309
@@ -691,15 +820,22 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
691 const unsigned char *path, 820 const unsigned char *path,
692 const void *ns) 821 const void *ns)
693{ 822{
694 static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */ 823 size_t len;
695 size_t len = strlcpy(path_buf, path, PATH_MAX); 824 char *p, *name;
696 char *p = path_buf;
697 char *name;
698 825
699 lockdep_assert_held(&kernfs_mutex); 826 lockdep_assert_held(&kernfs_mutex);
700 827
701 if (len >= PATH_MAX) 828 /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
829 spin_lock_irq(&kernfs_rename_lock);
830
831 len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
832
833 if (len >= sizeof(kernfs_pr_cont_buf)) {
834 spin_unlock_irq(&kernfs_rename_lock);
702 return NULL; 835 return NULL;
836 }
837
838 p = kernfs_pr_cont_buf;
703 839
704 while ((name = strsep(&p, "/")) && parent) { 840 while ((name = strsep(&p, "/")) && parent) {
705 if (*name == '\0') 841 if (*name == '\0')
@@ -707,6 +843,8 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
707 parent = kernfs_find_ns(parent, name, ns); 843 parent = kernfs_find_ns(parent, name, ns);
708 } 844 }
709 845
846 spin_unlock_irq(&kernfs_rename_lock);
847
710 return parent; 848 return parent;
711} 849}
712 850
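
kernfs_path_from_node_locked() above finds the common ancestor by equalizing the two depths, emits one "/.." per level remaining on the @from side, then copies the names from @to downward. The same algorithm on a toy parent-pointer tree in userspace (the tail is built recursively here for clarity, where the kernel copies names back-to-front):

#include <stdio.h>
#include <string.h>

struct node { const char *name; struct node *parent; };

static size_t depth(struct node *n)
{
	size_t d = 0;

	for (; n->parent; n = n->parent)
		d++;
	return d;
}

static void append_names(struct node *n, struct node *stop, char *buf)
{
	if (n == stop)
		return;
	append_names(n->parent, stop, buf);
	strcat(buf, "/");
	strcat(buf, n->name);
}

static void path_from_node(struct node *to, struct node *from, char *buf)
{
	size_t dt = depth(to), df = depth(from);
	struct node *a = to, *b = from;

	buf[0] = '\0';
	while (dt > df) { a = a->parent; dt--; }
	while (df > dt) { strcat(buf, "/.."); b = b->parent; df--; }
	while (a != b) {	/* climb in lockstep to the ancestor */
		strcat(buf, "/..");
		a = a->parent;
		b = b->parent;
	}
	append_names(to, a, buf);
	if (!buf[0])
		strcpy(buf, "/");	/* from == to */
}

int main(void)
{
	struct node root = { "", NULL };
	struct node n1 = { "n1", &root }, n2 = { "n2", &n1 };
	struct node n3 = { "n3", &n2 }, n5 = { "n5", &n2 };
	char buf[128];

	path_from_node(&n5, &n3, buf);
	printf("%s\n", buf);	/* /../n5 */
	return 0;
}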
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 8eaf417187f1..b67dbccdaf88 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -14,6 +14,7 @@
14#include <linux/magic.h> 14#include <linux/magic.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/namei.h>
17 18
18#include "kernfs-internal.h" 19#include "kernfs-internal.h"
19 20
@@ -62,6 +63,74 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
62 return NULL; 63 return NULL;
63} 64}
64 65
66/*
67 * find the next ancestor in the path down to @child, where @parent was the
68 * ancestor whose descendant we want to find.
69 *
70 * Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root
71 * node. If @parent is b, then we return the node for c.
72 * Passing in d as @parent is not ok.
73 */
74static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
75 struct kernfs_node *parent)
76{
77 if (child == parent) {
78 pr_crit_once("BUG in find_next_ancestor: called with parent == child");
79 return NULL;
80 }
81
82 while (child->parent != parent) {
83 if (!child->parent)
84 return NULL;
85 child = child->parent;
86 }
87
88 return child;
89}
90
91/**
92 * kernfs_node_dentry - get a dentry for the given kernfs_node
93 * @kn: kernfs_node for which a dentry is needed
94 * @sb: the kernfs super_block
95 */
96struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
97 struct super_block *sb)
98{
99 struct dentry *dentry;
100 struct kernfs_node *knparent = NULL;
101
102 BUG_ON(sb->s_op != &kernfs_sops);
103
104 dentry = dget(sb->s_root);
105
106 /* Check if this is the root kernfs_node */
107 if (!kn->parent)
108 return dentry;
109
110 knparent = find_next_ancestor(kn, NULL);
111 if (WARN_ON(!knparent))
112 return ERR_PTR(-EINVAL);
113
114 do {
115 struct dentry *dtmp;
116 struct kernfs_node *kntmp;
117
118 if (kn == knparent)
119 return dentry;
120 kntmp = find_next_ancestor(kn, knparent);
121 if (WARN_ON(!kntmp))
122 return ERR_PTR(-EINVAL);
123 mutex_lock(&d_inode(dentry)->i_mutex);
124 dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name));
125 mutex_unlock(&d_inode(dentry)->i_mutex);
126 dput(dentry);
127 if (IS_ERR(dtmp))
128 return dtmp;
129 knparent = kntmp;
130 dentry = dtmp;
131 } while (true);
132}
133
65static int kernfs_fill_super(struct super_block *sb, unsigned long magic) 134static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
66{ 135{
67 struct kernfs_super_info *info = kernfs_info(sb); 136 struct kernfs_super_info *info = kernfs_info(sb);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 187477ded6b3..eccda3a02de6 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -1,858 +1,433 @@
1/* 1#include <linux/spinlock.h>
2 * linux/fs/mbcache.c
3 * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
4 */
5
6/*
7 * Filesystem Meta Information Block Cache (mbcache)
8 *
9 * The mbcache caches blocks of block devices that need to be located
10 * by their device/block number, as well as by other criteria (such
11 * as the block's contents).
12 *
13 * There can only be one cache entry in a cache per device and block number.
14 * Additional indexes need not be unique in this sense. The number of
15 * additional indexes (=other criteria) can be hardwired at compile time
16 * or specified at cache create time.
17 *
18 * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
19 * in the cache. A valid entry is in the main hash tables of the cache,
20 * and may also be in the lru list. An invalid entry is not in any hashes
21 * or lists.
22 *
23 * A valid cache entry is only in the lru list if no handles refer to it.
24 * Invalid cache entries will be freed when the last handle to the cache
25 * entry is released. Entries that cannot be freed immediately are put
26 * back on the lru list.
27 */
28
29/*
30 * Lock descriptions and usage:
31 *
32 * Each hash chain of both the block and index hash tables now contains
33 * a built-in lock used to serialize accesses to the hash chain.
34 *
35 * Accesses to global data structures mb_cache_list and mb_cache_lru_list
36 * are serialized via the global spinlock mb_cache_spinlock.
37 *
38 * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
39 * accesses to its local data, such as e_used and e_queued.
40 *
41 * Lock ordering:
42 *
43 * Each block hash chain's lock has the highest lock order, followed by an
44 * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
45 * lock), and mb_cach_spinlock, with the lowest order. While holding
46 * either a block or index hash chain lock, a thread can acquire an
47 * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
48 *
49 * Synchronization:
50 *
51 * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
52 * index hash chian, it needs to lock the corresponding hash chain. For each
53 * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
54 * prevent either any simultaneous release or free on the entry and also
55 * to serialize accesses to either the e_used or e_queued member of the entry.
56 *
57 * To avoid having a dangling reference to an already freed
58 * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
59 * block hash chain and also no longer being referenced, both e_used,
60 * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is
61 * first removed from a block hash chain.
62 */
63
64#include <linux/kernel.h>
65#include <linux/module.h>
66
67#include <linux/hash.h>
68#include <linux/fs.h>
69#include <linux/mm.h>
70#include <linux/slab.h> 2#include <linux/slab.h>
71#include <linux/sched.h> 3#include <linux/list.h>
72#include <linux/list_bl.h> 4#include <linux/list_bl.h>
5#include <linux/module.h>
6#include <linux/sched.h>
7#include <linux/workqueue.h>
73#include <linux/mbcache.h> 8#include <linux/mbcache.h>
74#include <linux/init.h>
75#include <linux/blockgroup_lock.h>
76#include <linux/log2.h>
77
78#ifdef MB_CACHE_DEBUG
79# define mb_debug(f...) do { \
80 printk(KERN_DEBUG f); \
81 printk("\n"); \
82 } while (0)
83#define mb_assert(c) do { if (!(c)) \
84 printk(KERN_ERR "assertion " #c " failed\n"); \
85 } while(0)
86#else
87# define mb_debug(f...) do { } while(0)
88# define mb_assert(c) do { } while(0)
89#endif
90#define mb_error(f...) do { \
91 printk(KERN_ERR f); \
92 printk("\n"); \
93 } while(0)
94
95#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
96
97#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS)
98#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
99 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
100
101static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
102static struct blockgroup_lock *mb_cache_bg_lock;
103static struct kmem_cache *mb_cache_kmem_cache;
104
105MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
106MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
107MODULE_LICENSE("GPL");
108
109EXPORT_SYMBOL(mb_cache_create);
110EXPORT_SYMBOL(mb_cache_shrink);
111EXPORT_SYMBOL(mb_cache_destroy);
112EXPORT_SYMBOL(mb_cache_entry_alloc);
113EXPORT_SYMBOL(mb_cache_entry_insert);
114EXPORT_SYMBOL(mb_cache_entry_release);
115EXPORT_SYMBOL(mb_cache_entry_free);
116EXPORT_SYMBOL(mb_cache_entry_get);
117#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
118EXPORT_SYMBOL(mb_cache_entry_find_first);
119EXPORT_SYMBOL(mb_cache_entry_find_next);
120#endif
121 9
122/* 10/*
123 * Global data: list of all mbcache's, lru list, and a spinlock for 11 * Mbcache is a simple key-value store. Keys need not be unique, however
124 * accessing cache data structures on SMP machines. The lru list is 12 * key-value pairs are expected to be unique (we use this fact in
125 * global across all mbcaches. 13 * mb_cache_entry_delete_block()).
14 *
15 * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
16 * They use hash of a block contents as a key and block number as a value.
17 * That's why keys need not be unique (different xattr blocks may end up having
18 * the same hash). However block number always uniquely identifies a cache
19 * entry.
20 *
21 * We provide functions for creation and removal of entries, search by key,
22 * and a special "delete entry with given key-value pair" operation. Fixed
23 * size hash table is used for fast key lookups.
126 */ 24 */
127 25
128static LIST_HEAD(mb_cache_list); 26struct mb_cache {
129static LIST_HEAD(mb_cache_lru_list); 27 /* Hash table of entries */
130static DEFINE_SPINLOCK(mb_cache_spinlock); 28 struct hlist_bl_head *c_hash;
131 29 /* log2 of hash table size */
132static inline void 30 int c_bucket_bits;
133__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) 31 /* Maximum entries in cache to avoid degrading hash too much */
134{ 32 int c_max_entries;
135 spin_lock(bgl_lock_ptr(mb_cache_bg_lock, 33 /* Protects c_list, c_entry_count */
136 MB_CACHE_ENTRY_LOCK_INDEX(ce))); 34 spinlock_t c_list_lock;
137} 35 struct list_head c_list;
138 36 /* Number of entries in cache */
139static inline void 37 unsigned long c_entry_count;
140__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) 38 struct shrinker c_shrink;
141{ 39 /* Work for shrinking when the cache has too many entries */
142 spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, 40 struct work_struct c_shrink_work;
143 MB_CACHE_ENTRY_LOCK_INDEX(ce))); 41};
144}
145
146static inline int
147__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
148{
149 return !hlist_bl_unhashed(&ce->e_block_list);
150}
151 42
43static struct kmem_cache *mb_entry_cache;
152 44
153static inline void 45static unsigned long mb_cache_shrink(struct mb_cache *cache,
154__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) 46 unsigned int nr_to_scan);
155{
156 if (__mb_cache_entry_is_block_hashed(ce))
157 hlist_bl_del_init(&ce->e_block_list);
158}
159 47
160static inline int 48static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
161__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) 49 u32 key)
162{ 50{
163 return !hlist_bl_unhashed(&ce->e_index.o_list); 51 return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
164} 52}
165 53
166static inline void 54/*
167__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) 55 * Number of entries to reclaim synchronously when there are too many entries
168{ 56 * in cache
169 if (__mb_cache_entry_is_index_hashed(ce)) 57 */
170 hlist_bl_del_init(&ce->e_index.o_list); 58#define SYNC_SHRINK_BATCH 64
171}
172 59
173/* 60/*
174 * __mb_cache_entry_unhash_unlock() 61 * mb_cache_entry_create - create entry in cache
175 * 62 * @cache - cache where the entry should be created
176 * This function is called to unhash both the block and index hash 63 * @mask - gfp mask with which the entry should be allocated
177 * chain. 64 * @key - key of the entry
178 * It assumes both the block and index hash chain is locked upon entry. 65 * @block - block that contains data
179 * It also unlock both hash chains both exit 66 * @reusable - is the block reusable by other inodes?
67 *
68 * Creates entry in @cache with key @key and records that data is stored in
69 * block @block. The function returns -EBUSY if entry with the same key
70 * and for the same block already exists in cache. Otherwise 0 is returned.
180 */ 71 */
181static inline void 72int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
182__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) 73 sector_t block, bool reusable)
183{ 74{
184 __mb_cache_entry_unhash_index(ce); 75 struct mb_cache_entry *entry, *dup;
185 hlist_bl_unlock(ce->e_index_hash_p); 76 struct hlist_bl_node *dup_node;
186 __mb_cache_entry_unhash_block(ce); 77 struct hlist_bl_head *head;
187 hlist_bl_unlock(ce->e_block_hash_p); 78
79 /* Schedule background reclaim if there are too many entries */
80 if (cache->c_entry_count >= cache->c_max_entries)
81 schedule_work(&cache->c_shrink_work);
82 /* Do some sync reclaim if background reclaim cannot keep up */
83 if (cache->c_entry_count >= 2*cache->c_max_entries)
84 mb_cache_shrink(cache, SYNC_SHRINK_BATCH);
85
86 entry = kmem_cache_alloc(mb_entry_cache, mask);
87 if (!entry)
88 return -ENOMEM;
89
90 INIT_LIST_HEAD(&entry->e_list);
91 /* One ref for hash, one ref returned */
92 atomic_set(&entry->e_refcnt, 1);
93 entry->e_key = key;
94 entry->e_block = block;
95 entry->e_reusable = reusable;
96 head = mb_cache_entry_head(cache, key);
97 hlist_bl_lock(head);
98 hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
99 if (dup->e_key == key && dup->e_block == block) {
100 hlist_bl_unlock(head);
101 kmem_cache_free(mb_entry_cache, entry);
102 return -EBUSY;
103 }
104 }
105 hlist_bl_add_head(&entry->e_hash_list, head);
106 hlist_bl_unlock(head);
107
108 spin_lock(&cache->c_list_lock);
109 list_add_tail(&entry->e_list, &cache->c_list);
110 /* Grab ref for LRU list */
111 atomic_inc(&entry->e_refcnt);
112 cache->c_entry_count++;
113 spin_unlock(&cache->c_list_lock);
114
115 return 0;
188} 116}
117EXPORT_SYMBOL(mb_cache_entry_create);
189 118
190static void 119void __mb_cache_entry_free(struct mb_cache_entry *entry)
191__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
192{ 120{
193 struct mb_cache *cache = ce->e_cache; 121 kmem_cache_free(mb_entry_cache, entry);
194
195 mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
196 kmem_cache_free(cache->c_entry_cache, ce);
197 atomic_dec(&cache->c_entry_count);
198} 122}
123EXPORT_SYMBOL(__mb_cache_entry_free);
199 124
200static void 125static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
201__mb_cache_entry_release(struct mb_cache_entry *ce) 126 struct mb_cache_entry *entry,
127 u32 key)
202{ 128{
203 /* First lock the entry to serialize access to its local data. */ 129 struct mb_cache_entry *old_entry = entry;
204 __spin_lock_mb_cache_entry(ce); 130 struct hlist_bl_node *node;
205 /* Wake up all processes queuing for this cache entry. */ 131 struct hlist_bl_head *head;
206 if (ce->e_queued) 132
207 wake_up_all(&mb_cache_queue); 133 head = mb_cache_entry_head(cache, key);
208 if (ce->e_used >= MB_CACHE_WRITER) 134 hlist_bl_lock(head);
209 ce->e_used -= MB_CACHE_WRITER; 135 if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
210 /* 136 node = entry->e_hash_list.next;
211 * Make sure that all cache entries on lru_list have 137 else
212 * both e_used and e_qued of 0s. 138 node = hlist_bl_first(head);
213 */ 139 while (node) {
214 ce->e_used--; 140 entry = hlist_bl_entry(node, struct mb_cache_entry,
215 if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { 141 e_hash_list);
216 if (!__mb_cache_entry_is_block_hashed(ce)) { 142 if (entry->e_key == key && entry->e_reusable) {
217 __spin_unlock_mb_cache_entry(ce); 143 atomic_inc(&entry->e_refcnt);
218 goto forget; 144 goto out;
219 } 145 }
220 /* 146 node = node->next;
221 * Need access to lru list, first drop entry lock,
222 * then reacquire the lock in the proper order.
223 */
224 spin_lock(&mb_cache_spinlock);
225 if (list_empty(&ce->e_lru_list))
226 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
227 spin_unlock(&mb_cache_spinlock);
228 } 147 }
229 __spin_unlock_mb_cache_entry(ce); 148 entry = NULL;
230 return; 149out:
231forget: 150 hlist_bl_unlock(head);
232 mb_assert(list_empty(&ce->e_lru_list)); 151 if (old_entry)
233 __mb_cache_entry_forget(ce, GFP_KERNEL); 152 mb_cache_entry_put(cache, old_entry);
153
154 return entry;
234} 155}
235 156
236/* 157/*
237 * mb_cache_shrink_scan() memory pressure callback 158 * mb_cache_entry_find_first - find the first entry in cache with given key
238 * 159 * @cache: cache where we should search
239 * This function is called by the kernel memory management when memory 160 * @key: key to look for
240 * gets low.
241 * 161 *
242 * @shrink: (ignored) 162 * Search in @cache for entry with key @key. Grabs reference to the first
243 * @sc: shrink_control passed from reclaim 163 * entry found and returns the entry.
244 *
245 * Returns the number of objects freed.
246 */ 164 */
247static unsigned long 165struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
248mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 166 u32 key)
249{ 167{
250 LIST_HEAD(free_list); 168 return __entry_find(cache, NULL, key);
251 struct mb_cache_entry *entry, *tmp;
252 int nr_to_scan = sc->nr_to_scan;
253 gfp_t gfp_mask = sc->gfp_mask;
254 unsigned long freed = 0;
255
256 mb_debug("trying to free %d entries", nr_to_scan);
257 spin_lock(&mb_cache_spinlock);
258 while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
259 struct mb_cache_entry *ce =
260 list_entry(mb_cache_lru_list.next,
261 struct mb_cache_entry, e_lru_list);
262 list_del_init(&ce->e_lru_list);
263 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
264 continue;
265 spin_unlock(&mb_cache_spinlock);
266 /* Prevent any find or get operation on the entry */
267 hlist_bl_lock(ce->e_block_hash_p);
268 hlist_bl_lock(ce->e_index_hash_p);
269 /* Ignore if it is touched by a find/get */
270 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
271 !list_empty(&ce->e_lru_list)) {
272 hlist_bl_unlock(ce->e_index_hash_p);
273 hlist_bl_unlock(ce->e_block_hash_p);
274 spin_lock(&mb_cache_spinlock);
275 continue;
276 }
277 __mb_cache_entry_unhash_unlock(ce);
278 list_add_tail(&ce->e_lru_list, &free_list);
279 spin_lock(&mb_cache_spinlock);
280 }
281 spin_unlock(&mb_cache_spinlock);
282
283 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
284 __mb_cache_entry_forget(entry, gfp_mask);
285 freed++;
286 }
287 return freed;
288} 169}
170EXPORT_SYMBOL(mb_cache_entry_find_first);
289 171
290static unsigned long 172/*
291mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) 173 * mb_cache_entry_find_next - find next entry in cache with the same
174 * @cache: cache where we should search
175 * @entry: entry to start search from
176 *
177 * Finds next entry in the hash chain which has the same key as @entry.
178 * If @entry is unhashed (which can happen when deletion of entry races
179 * with the search), finds the first entry in the hash chain. The function
180 * drops reference to @entry and returns with a reference to the found entry.
181 */
182struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
183 struct mb_cache_entry *entry)
292{ 184{
293 struct mb_cache *cache; 185 return __entry_find(cache, entry, entry->e_key);
294 unsigned long count = 0;
295
296 spin_lock(&mb_cache_spinlock);
297 list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
298 mb_debug("cache %s (%d)", cache->c_name,
299 atomic_read(&cache->c_entry_count));
300 count += atomic_read(&cache->c_entry_count);
301 }
302 spin_unlock(&mb_cache_spinlock);
303
304 return vfs_pressure_ratio(count);
305} 186}
306 187EXPORT_SYMBOL(mb_cache_entry_find_next);
307static struct shrinker mb_cache_shrinker = {
308 .count_objects = mb_cache_shrink_count,
309 .scan_objects = mb_cache_shrink_scan,
310 .seeks = DEFAULT_SEEKS,
311};
312 188
313/* 189/*
314 * mb_cache_create() create a new cache 190 * mb_cache_entry_get - get a cache entry by block number (and key)
315 * 191 * @cache - cache we work with
316 * All entries in one cache are equal size. Cache entries may be from 192 * @key - key of block number @block
317 * multiple devices. If this is the first mbcache created, registers 193 * @block - block number
318 * the cache with kernel memory management. Returns NULL if no more
319 * memory was available.
320 *
321 * @name: name of the cache (informal)
322 * @bucket_bits: log2(number of hash buckets)
323 */ 194 */
324struct mb_cache * 195struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
325mb_cache_create(const char *name, int bucket_bits) 196 sector_t block)
326{ 197{
327 int n, bucket_count = 1 << bucket_bits; 198 struct hlist_bl_node *node;
328 struct mb_cache *cache = NULL; 199 struct hlist_bl_head *head;
329 200 struct mb_cache_entry *entry;
330 if (!mb_cache_bg_lock) { 201
331 mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), 202 head = mb_cache_entry_head(cache, key);
332 GFP_KERNEL); 203 hlist_bl_lock(head);
333 if (!mb_cache_bg_lock) 204 hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
334 return NULL; 205 if (entry->e_key == key && entry->e_block == block) {
335 bgl_lock_init(mb_cache_bg_lock); 206 atomic_inc(&entry->e_refcnt);
336 } 207 goto out;
337 208 }
338 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
339 if (!cache)
340 return NULL;
341 cache->c_name = name;
342 atomic_set(&cache->c_entry_count, 0);
343 cache->c_bucket_bits = bucket_bits;
344 cache->c_block_hash = kmalloc(bucket_count *
345 sizeof(struct hlist_bl_head), GFP_KERNEL);
346 if (!cache->c_block_hash)
347 goto fail;
348 for (n=0; n<bucket_count; n++)
349 INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
350 cache->c_index_hash = kmalloc(bucket_count *
351 sizeof(struct hlist_bl_head), GFP_KERNEL);
352 if (!cache->c_index_hash)
353 goto fail;
354 for (n=0; n<bucket_count; n++)
355 INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
356 if (!mb_cache_kmem_cache) {
357 mb_cache_kmem_cache = kmem_cache_create(name,
358 sizeof(struct mb_cache_entry), 0,
359 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
360 if (!mb_cache_kmem_cache)
361 goto fail2;
362 } 209 }
363 cache->c_entry_cache = mb_cache_kmem_cache; 210 entry = NULL;
364 211out:
365 /* 212 hlist_bl_unlock(head);
366 * Set an upper limit on the number of cache entries so that the hash 213 return entry;
367 * chains won't grow too long.
368 */
369 cache->c_max_entries = bucket_count << 4;
370
371 spin_lock(&mb_cache_spinlock);
372 list_add(&cache->c_cache_list, &mb_cache_list);
373 spin_unlock(&mb_cache_spinlock);
374 return cache;
375
376fail2:
377 kfree(cache->c_index_hash);
378
379fail:
380 kfree(cache->c_block_hash);
381 kfree(cache);
382 return NULL;
383} 214}
215EXPORT_SYMBOL(mb_cache_entry_get);
384 216
385 217/* mb_cache_entry_delete_block - remove information about block from cache
386/* 218 * @cache - cache we work with
387 * mb_cache_shrink() 219 * @key - key of block @block
388 * 220 * @block - block number
389 * Removes all cache entries of a device from the cache. All cache entries
390 * currently in use cannot be freed, and thus remain in the cache. All others
391 * are freed.
392 * 221 *
393 * @bdev: which device's cache entries to shrink 222 * Remove entry from cache @cache with key @key with data stored in @block.
394 */ 223 */
395void 224void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
396mb_cache_shrink(struct block_device *bdev) 225 sector_t block)
397{ 226{
398 LIST_HEAD(free_list); 227 struct hlist_bl_node *node;
399 struct list_head *l; 228 struct hlist_bl_head *head;
400 struct mb_cache_entry *ce, *tmp; 229 struct mb_cache_entry *entry;
401 230
402 l = &mb_cache_lru_list; 231 head = mb_cache_entry_head(cache, key);
403 spin_lock(&mb_cache_spinlock); 232 hlist_bl_lock(head);
404 while (!list_is_last(l, &mb_cache_lru_list)) { 233 hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
405 l = l->next; 234 if (entry->e_key == key && entry->e_block == block) {
406 ce = list_entry(l, struct mb_cache_entry, e_lru_list); 235 /* We keep hash list reference to keep entry alive */
407 if (ce->e_bdev == bdev) { 236 hlist_bl_del_init(&entry->e_hash_list);
408 list_del_init(&ce->e_lru_list); 237 hlist_bl_unlock(head);
409 if (ce->e_used || ce->e_queued || 238 spin_lock(&cache->c_list_lock);
410 atomic_read(&ce->e_refcnt)) 239 if (!list_empty(&entry->e_list)) {
411 continue; 240 list_del_init(&entry->e_list);
412 spin_unlock(&mb_cache_spinlock); 241 cache->c_entry_count--;
413 /* 242 atomic_dec(&entry->e_refcnt);
414 * Prevent any find or get operation on the entry.
415 */
416 hlist_bl_lock(ce->e_block_hash_p);
417 hlist_bl_lock(ce->e_index_hash_p);
418 /* Ignore if it is touched by a find/get */
419 if (ce->e_used || ce->e_queued ||
420 atomic_read(&ce->e_refcnt) ||
421 !list_empty(&ce->e_lru_list)) {
422 hlist_bl_unlock(ce->e_index_hash_p);
423 hlist_bl_unlock(ce->e_block_hash_p);
424 l = &mb_cache_lru_list;
425 spin_lock(&mb_cache_spinlock);
426 continue;
427 } 243 }
428 __mb_cache_entry_unhash_unlock(ce); 244 spin_unlock(&cache->c_list_lock);
429 mb_assert(!(ce->e_used || ce->e_queued || 245 mb_cache_entry_put(cache, entry);
430 atomic_read(&ce->e_refcnt))); 246 return;
431 list_add_tail(&ce->e_lru_list, &free_list);
432 l = &mb_cache_lru_list;
433 spin_lock(&mb_cache_spinlock);
434 } 247 }
435 } 248 }
436 spin_unlock(&mb_cache_spinlock); 249 hlist_bl_unlock(head);
437
438 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
439 __mb_cache_entry_forget(ce, GFP_KERNEL);
440 }
441} 250}
251EXPORT_SYMBOL(mb_cache_entry_delete_block);
442 252
443 253/* mb_cache_entry_touch - cache entry got used
444/* 254 * @cache - cache the entry belongs to
445 * mb_cache_destroy() 255 * @entry - entry that got used
446 * 256 *
447 * Shrinks the cache to its minimum possible size (hopefully 0 entries), 257 * Marks entry as used to give hit higher chances of surviving in cache.
448 * and then destroys it. If this was the last mbcache, un-registers the
449 * mbcache from kernel memory management.
450 */ 258 */
451void 259void mb_cache_entry_touch(struct mb_cache *cache,
452mb_cache_destroy(struct mb_cache *cache) 260 struct mb_cache_entry *entry)
453{ 261{
454 LIST_HEAD(free_list); 262 entry->e_referenced = 1;
455 struct mb_cache_entry *ce, *tmp;
456
457 spin_lock(&mb_cache_spinlock);
458 list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
459 if (ce->e_cache == cache)
460 list_move_tail(&ce->e_lru_list, &free_list);
461 }
462 list_del(&cache->c_cache_list);
463 spin_unlock(&mb_cache_spinlock);
464
465 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
466 list_del_init(&ce->e_lru_list);
467 /*
468 * Prevent any find or get operation on the entry.
469 */
470 hlist_bl_lock(ce->e_block_hash_p);
471 hlist_bl_lock(ce->e_index_hash_p);
472 mb_assert(!(ce->e_used || ce->e_queued ||
473 atomic_read(&ce->e_refcnt)));
474 __mb_cache_entry_unhash_unlock(ce);
475 __mb_cache_entry_forget(ce, GFP_KERNEL);
476 }
477
478 if (atomic_read(&cache->c_entry_count) > 0) {
479 mb_error("cache %s: %d orphaned entries",
480 cache->c_name,
481 atomic_read(&cache->c_entry_count));
482 }
483
484 if (list_empty(&mb_cache_list)) {
485 kmem_cache_destroy(mb_cache_kmem_cache);
486 mb_cache_kmem_cache = NULL;
487 }
488 kfree(cache->c_index_hash);
489 kfree(cache->c_block_hash);
490 kfree(cache);
491} 263}
264EXPORT_SYMBOL(mb_cache_entry_touch);
492 265
493/* 266static unsigned long mb_cache_count(struct shrinker *shrink,
494 * mb_cache_entry_alloc() 267 struct shrink_control *sc)
495 *
496 * Allocates a new cache entry. The new entry will not be valid initially,
497 * and thus cannot be looked up yet. It should be filled with data, and
498 * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
499 * if no more memory was available.
500 */
501struct mb_cache_entry *
502mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
503{ 268{
504 struct mb_cache_entry *ce; 269 struct mb_cache *cache = container_of(shrink, struct mb_cache,
505 270 c_shrink);
506 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
507 struct list_head *l;
508
509 l = &mb_cache_lru_list;
510 spin_lock(&mb_cache_spinlock);
511 while (!list_is_last(l, &mb_cache_lru_list)) {
512 l = l->next;
513 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
514 if (ce->e_cache == cache) {
515 list_del_init(&ce->e_lru_list);
516 if (ce->e_used || ce->e_queued ||
517 atomic_read(&ce->e_refcnt))
518 continue;
519 spin_unlock(&mb_cache_spinlock);
520 /*
521 * Prevent any find or get operation on the
522 * entry.
523 */
524 hlist_bl_lock(ce->e_block_hash_p);
525 hlist_bl_lock(ce->e_index_hash_p);
526 /* Ignore if it is touched by a find/get */
527 if (ce->e_used || ce->e_queued ||
528 atomic_read(&ce->e_refcnt) ||
529 !list_empty(&ce->e_lru_list)) {
530 hlist_bl_unlock(ce->e_index_hash_p);
531 hlist_bl_unlock(ce->e_block_hash_p);
532 l = &mb_cache_lru_list;
533 spin_lock(&mb_cache_spinlock);
534 continue;
535 }
536 mb_assert(list_empty(&ce->e_lru_list));
537 mb_assert(!(ce->e_used || ce->e_queued ||
538 atomic_read(&ce->e_refcnt)));
539 __mb_cache_entry_unhash_unlock(ce);
540 goto found;
541 }
542 }
543 spin_unlock(&mb_cache_spinlock);
544 }
545 271
546 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 272 return cache->c_entry_count;
547 if (!ce)
548 return NULL;
549 atomic_inc(&cache->c_entry_count);
550 INIT_LIST_HEAD(&ce->e_lru_list);
551 INIT_HLIST_BL_NODE(&ce->e_block_list);
552 INIT_HLIST_BL_NODE(&ce->e_index.o_list);
553 ce->e_cache = cache;
554 ce->e_queued = 0;
555 atomic_set(&ce->e_refcnt, 0);
556found:
557 ce->e_block_hash_p = &cache->c_block_hash[0];
558 ce->e_index_hash_p = &cache->c_index_hash[0];
559 ce->e_used = 1 + MB_CACHE_WRITER;
560 return ce;
561} 273}
562 274
563 275/* Shrink number of entries in cache */
564/* 276static unsigned long mb_cache_shrink(struct mb_cache *cache,
565 * mb_cache_entry_insert() 277 unsigned int nr_to_scan)
566 *
567 * Inserts an entry that was allocated using mb_cache_entry_alloc() into
568 * the cache. After this, the cache entry can be looked up, but is not yet
569 * in the lru list as the caller still holds a handle to it. Returns 0 on
570 * success, or -EBUSY if a cache entry for that device + inode exists
571 * already (this may happen after a failed lookup, but when another process
572 * has inserted the same cache entry in the meantime).
573 *
574 * @bdev: device the cache entry belongs to
575 * @block: block number
576 * @key: lookup key
577 */
578int
579mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
580 sector_t block, unsigned int key)
581{ 278{
582 struct mb_cache *cache = ce->e_cache; 279 struct mb_cache_entry *entry;
583 unsigned int bucket; 280 struct hlist_bl_head *head;
584 struct hlist_bl_node *l; 281 unsigned int shrunk = 0;
585 struct hlist_bl_head *block_hash_p; 282
586 struct hlist_bl_head *index_hash_p; 283 spin_lock(&cache->c_list_lock);
587 struct mb_cache_entry *lce; 284 while (nr_to_scan-- && !list_empty(&cache->c_list)) {
588 285 entry = list_first_entry(&cache->c_list,
589 mb_assert(ce); 286 struct mb_cache_entry, e_list);
590 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 287 if (entry->e_referenced) {
591 cache->c_bucket_bits); 288 entry->e_referenced = 0;
592 block_hash_p = &cache->c_block_hash[bucket]; 289 list_move_tail(&cache->c_list, &entry->e_list);
593 hlist_bl_lock(block_hash_p); 290 continue;
594 hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
595 if (lce->e_bdev == bdev && lce->e_block == block) {
596 hlist_bl_unlock(block_hash_p);
597 return -EBUSY;
598 } 291 }
292 list_del_init(&entry->e_list);
293 cache->c_entry_count--;
294 /*
295 * We keep LRU list reference so that entry doesn't go away
296 * from under us.
297 */
298 spin_unlock(&cache->c_list_lock);
299 head = mb_cache_entry_head(cache, entry->e_key);
300 hlist_bl_lock(head);
301 if (!hlist_bl_unhashed(&entry->e_hash_list)) {
302 hlist_bl_del_init(&entry->e_hash_list);
303 atomic_dec(&entry->e_refcnt);
304 }
305 hlist_bl_unlock(head);
306 if (mb_cache_entry_put(cache, entry))
307 shrunk++;
308 cond_resched();
309 spin_lock(&cache->c_list_lock);
599 } 310 }
600 mb_assert(!__mb_cache_entry_is_block_hashed(ce)); 311 spin_unlock(&cache->c_list_lock);
601 __mb_cache_entry_unhash_block(ce);
602 __mb_cache_entry_unhash_index(ce);
603 ce->e_bdev = bdev;
604 ce->e_block = block;
605 ce->e_block_hash_p = block_hash_p;
606 ce->e_index.o_key = key;
607 hlist_bl_add_head(&ce->e_block_list, block_hash_p);
608 hlist_bl_unlock(block_hash_p);
609 bucket = hash_long(key, cache->c_bucket_bits);
610 index_hash_p = &cache->c_index_hash[bucket];
611 hlist_bl_lock(index_hash_p);
612 ce->e_index_hash_p = index_hash_p;
613 hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
614 hlist_bl_unlock(index_hash_p);
615 return 0;
616}
617 312
313 return shrunk;
314}
618 315
619/* 316static unsigned long mb_cache_scan(struct shrinker *shrink,
620 * mb_cache_entry_release() 317 struct shrink_control *sc)
621 *
622 * Release a handle to a cache entry. When the last handle to a cache entry
623 * is released it is either freed (if it is invalid) or otherwise inserted
624 * in to the lru list.
625 */
626void
627mb_cache_entry_release(struct mb_cache_entry *ce)
628{ 318{
629 __mb_cache_entry_release(ce); 319 int nr_to_scan = sc->nr_to_scan;
320 struct mb_cache *cache = container_of(shrink, struct mb_cache,
321 c_shrink);
322 return mb_cache_shrink(cache, nr_to_scan);
630} 323}
631 324
325/* We shrink 1/X of the cache when we have too many entries in it */
326#define SHRINK_DIVISOR 16
632 327
633/* 328static void mb_cache_shrink_worker(struct work_struct *work)
634 * mb_cache_entry_free()
635 *
636 */
637void
638mb_cache_entry_free(struct mb_cache_entry *ce)
639{ 329{
640 mb_assert(ce); 330 struct mb_cache *cache = container_of(work, struct mb_cache,
641 mb_assert(list_empty(&ce->e_lru_list)); 331 c_shrink_work);
642 hlist_bl_lock(ce->e_index_hash_p); 332 mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR);
643 __mb_cache_entry_unhash_index(ce);
644 hlist_bl_unlock(ce->e_index_hash_p);
645 hlist_bl_lock(ce->e_block_hash_p);
646 __mb_cache_entry_unhash_block(ce);
647 hlist_bl_unlock(ce->e_block_hash_p);
648 __mb_cache_entry_release(ce);
649} 333}
650 334
651
652/* 335/*
653 * mb_cache_entry_get() 336 * mb_cache_create - create cache
337 * @bucket_bits: log2 of the hash table size
654 * 338 *
655 * Get a cache entry by device / block number. (There can only be one entry 339 * Create cache for keys with 2^bucket_bits hash entries.
656 * in the cache per device and block.) Returns NULL if no such cache entry
657 * exists. The returned cache entry is locked for exclusive access ("single
658 * writer").
659 */ 340 */
660struct mb_cache_entry * 341struct mb_cache *mb_cache_create(int bucket_bits)
661mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
662 sector_t block)
663{ 342{
664 unsigned int bucket; 343 struct mb_cache *cache;
665 struct hlist_bl_node *l; 344 int bucket_count = 1 << bucket_bits;
666 struct mb_cache_entry *ce; 345 int i;
667 struct hlist_bl_head *block_hash_p;
668
669 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
670 cache->c_bucket_bits);
671 block_hash_p = &cache->c_block_hash[bucket];
672 /* First serialize access to the block corresponding hash chain. */
673 hlist_bl_lock(block_hash_p);
674 hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
675 mb_assert(ce->e_block_hash_p == block_hash_p);
676 if (ce->e_bdev == bdev && ce->e_block == block) {
677 /*
678 * Prevent a free from removing the entry.
679 */
680 atomic_inc(&ce->e_refcnt);
681 hlist_bl_unlock(block_hash_p);
682 __spin_lock_mb_cache_entry(ce);
683 atomic_dec(&ce->e_refcnt);
684 if (ce->e_used > 0) {
685 DEFINE_WAIT(wait);
686 while (ce->e_used > 0) {
687 ce->e_queued++;
688 prepare_to_wait(&mb_cache_queue, &wait,
689 TASK_UNINTERRUPTIBLE);
690 __spin_unlock_mb_cache_entry(ce);
691 schedule();
692 __spin_lock_mb_cache_entry(ce);
693 ce->e_queued--;
694 }
695 finish_wait(&mb_cache_queue, &wait);
696 }
697 ce->e_used += 1 + MB_CACHE_WRITER;
698 __spin_unlock_mb_cache_entry(ce);
699 346
700 if (!list_empty(&ce->e_lru_list)) { 347 if (!try_module_get(THIS_MODULE))
701 spin_lock(&mb_cache_spinlock); 348 return NULL;
702 list_del_init(&ce->e_lru_list); 349
703 spin_unlock(&mb_cache_spinlock); 350 cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
704 } 351 if (!cache)
705 if (!__mb_cache_entry_is_block_hashed(ce)) { 352 goto err_out;
706 __mb_cache_entry_release(ce); 353 cache->c_bucket_bits = bucket_bits;
707 return NULL; 354 cache->c_max_entries = bucket_count << 4;
708 } 355 INIT_LIST_HEAD(&cache->c_list);
709 return ce; 356 spin_lock_init(&cache->c_list_lock);
710 } 357 cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head),
358 GFP_KERNEL);
359 if (!cache->c_hash) {
360 kfree(cache);
361 goto err_out;
711 } 362 }
712 hlist_bl_unlock(block_hash_p); 363 for (i = 0; i < bucket_count; i++)
713 return NULL; 364 INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
714}
715 365
716#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) 366 cache->c_shrink.count_objects = mb_cache_count;
367 cache->c_shrink.scan_objects = mb_cache_scan;
368 cache->c_shrink.seeks = DEFAULT_SEEKS;
369 register_shrinker(&cache->c_shrink);
717 370
718static struct mb_cache_entry * 371 INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
719__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
720 struct block_device *bdev, unsigned int key)
721{
722 372
723 /* The index hash chain is alredy acquire by caller. */ 373 return cache;
724 while (l != NULL) { 374
725 struct mb_cache_entry *ce = 375err_out:
726 hlist_bl_entry(l, struct mb_cache_entry, 376 module_put(THIS_MODULE);
727 e_index.o_list);
728 mb_assert(ce->e_index_hash_p == head);
729 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
730 /*
731 * Prevent a free from removing the entry.
732 */
733 atomic_inc(&ce->e_refcnt);
734 hlist_bl_unlock(head);
735 __spin_lock_mb_cache_entry(ce);
736 atomic_dec(&ce->e_refcnt);
737 ce->e_used++;
738 /* Incrementing before holding the lock gives readers
739 priority over writers. */
740 if (ce->e_used >= MB_CACHE_WRITER) {
741 DEFINE_WAIT(wait);
742
743 while (ce->e_used >= MB_CACHE_WRITER) {
744 ce->e_queued++;
745 prepare_to_wait(&mb_cache_queue, &wait,
746 TASK_UNINTERRUPTIBLE);
747 __spin_unlock_mb_cache_entry(ce);
748 schedule();
749 __spin_lock_mb_cache_entry(ce);
750 ce->e_queued--;
751 }
752 finish_wait(&mb_cache_queue, &wait);
753 }
754 __spin_unlock_mb_cache_entry(ce);
755 if (!list_empty(&ce->e_lru_list)) {
756 spin_lock(&mb_cache_spinlock);
757 list_del_init(&ce->e_lru_list);
758 spin_unlock(&mb_cache_spinlock);
759 }
760 if (!__mb_cache_entry_is_block_hashed(ce)) {
761 __mb_cache_entry_release(ce);
762 return ERR_PTR(-EAGAIN);
763 }
764 return ce;
765 }
766 l = l->next;
767 }
768 hlist_bl_unlock(head);
769 return NULL; 377 return NULL;
770} 378}
771 379EXPORT_SYMBOL(mb_cache_create);
772 380
773/* 381/*
774 * mb_cache_entry_find_first() 382 * mb_cache_destroy - destroy cache
775 * 383 * @cache: the cache to destroy
776 * Find the first cache entry on a given device with a certain key in
777 * an additional index. Additional matches can be found with
778 * mb_cache_entry_find_next(). Returns NULL if no match was found. The
779 * returned cache entry is locked for shared access ("multiple readers").
780 * 384 *
781 * @cache: the cache to search 385 * Free all entries in cache and cache itself. Caller must make sure nobody
782 * @bdev: the device the cache entry should belong to 386 * (except shrinker) can reach @cache when calling this.
783 * @key: the key in the index
784 */ 387 */
785struct mb_cache_entry * 388void mb_cache_destroy(struct mb_cache *cache)
786mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
787 unsigned int key)
788{ 389{
789 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 390 struct mb_cache_entry *entry, *next;
790 struct hlist_bl_node *l;
791 struct mb_cache_entry *ce = NULL;
792 struct hlist_bl_head *index_hash_p;
793
794 index_hash_p = &cache->c_index_hash[bucket];
795 hlist_bl_lock(index_hash_p);
796 if (!hlist_bl_empty(index_hash_p)) {
797 l = hlist_bl_first(index_hash_p);
798 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
799 } else
800 hlist_bl_unlock(index_hash_p);
801 return ce;
802}
803 391
392 unregister_shrinker(&cache->c_shrink);
804 393
805/* 394 /*
806 * mb_cache_entry_find_next() 395 * We don't bother with any locking. Cache must not be used at this
807 * 396 * point.
808 * Find the next cache entry on a given device with a certain key in an 397 */
809 * additional index. Returns NULL if no match could be found. The previous 398 list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
810 * entry is atomatically released, so that mb_cache_entry_find_next() can 399 if (!hlist_bl_unhashed(&entry->e_hash_list)) {
811 * be called like this: 400 hlist_bl_del_init(&entry->e_hash_list);
812 * 401 atomic_dec(&entry->e_refcnt);
813 * entry = mb_cache_entry_find_first(); 402 } else
814 * while (entry) { 403 WARN_ON(1);
815 * ... 404 list_del(&entry->e_list);
816 * entry = mb_cache_entry_find_next(entry, ...); 405 WARN_ON(atomic_read(&entry->e_refcnt) != 1);
817 * } 406 mb_cache_entry_put(cache, entry);
818 * 407 }
819 * @prev: The previous match 408 kfree(cache->c_hash);
820 * @bdev: the device the cache entry should belong to 409 kfree(cache);
821 * @key: the key in the index 410 module_put(THIS_MODULE);
822 */
823struct mb_cache_entry *
824mb_cache_entry_find_next(struct mb_cache_entry *prev,
825 struct block_device *bdev, unsigned int key)
826{
827 struct mb_cache *cache = prev->e_cache;
828 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
829 struct hlist_bl_node *l;
830 struct mb_cache_entry *ce;
831 struct hlist_bl_head *index_hash_p;
832
833 index_hash_p = &cache->c_index_hash[bucket];
834 mb_assert(prev->e_index_hash_p == index_hash_p);
835 hlist_bl_lock(index_hash_p);
836 mb_assert(!hlist_bl_empty(index_hash_p));
837 l = prev->e_index.o_list.next;
838 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
839 __mb_cache_entry_release(prev);
840 return ce;
841} 411}
412EXPORT_SYMBOL(mb_cache_destroy);
842 413
843#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ 414static int __init mbcache_init(void)
844
845static int __init init_mbcache(void)
846{ 415{
847 register_shrinker(&mb_cache_shrinker); 416 mb_entry_cache = kmem_cache_create("mbcache",
417 sizeof(struct mb_cache_entry), 0,
418 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
419 BUG_ON(!mb_entry_cache);
848 return 0; 420 return 0;
849} 421}
850 422
851static void __exit exit_mbcache(void) 423static void __exit mbcache_exit(void)
852{ 424{
853 unregister_shrinker(&mb_cache_shrinker); 425 kmem_cache_destroy(mb_entry_cache);
854} 426}
855 427
856module_init(init_mbcache) 428module_init(mbcache_init)
857module_exit(exit_mbcache) 429module_exit(mbcache_exit)
858 430
431MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
432MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
433MODULE_LICENSE("GPL");
diff --git a/fs/mpage.c b/fs/mpage.c
index 1480d3a18037..6bd9fd90964e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/prefetch.h> 25#include <linux/prefetch.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/mm_inline.h>
27#include <linux/writeback.h> 28#include <linux/writeback.h>
28#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
29#include <linux/pagevec.h> 30#include <linux/pagevec.h>
@@ -366,7 +367,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
366 map_bh.b_state = 0; 367 map_bh.b_state = 0;
367 map_bh.b_size = 0; 368 map_bh.b_size = 0;
368 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 369 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
369 struct page *page = list_entry(pages->prev, struct page, lru); 370 struct page *page = lru_to_page(pages);
370 371
371 prefetchw(&page->flags); 372 prefetchw(&page->flags);
372 list_del(&page->lru); 373 list_del(&page->lru);
diff --git a/fs/namei.c b/fs/namei.c
index 9c590e0f66e9..1d9ca2d5dff6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1220,8 +1220,8 @@ static int follow_managed(struct path *path, struct nameidata *nd)
1220 1220
1221 if (need_mntput && path->mnt == mnt) 1221 if (need_mntput && path->mnt == mnt)
1222 mntput(path->mnt); 1222 mntput(path->mnt);
1223 if (ret == -EISDIR) 1223 if (ret == -EISDIR || !ret)
1224 ret = 0; 1224 ret = 1;
1225 if (need_mntput) 1225 if (need_mntput)
1226 nd->flags |= LOOKUP_JUMPED; 1226 nd->flags |= LOOKUP_JUMPED;
1227 if (unlikely(ret < 0)) 1227 if (unlikely(ret < 0))
@@ -1444,40 +1444,26 @@ static int follow_dotdot(struct nameidata *nd)
1444 * This looks up the name in dcache, possibly revalidates the old dentry and 1444 * This looks up the name in dcache, possibly revalidates the old dentry and
1445 * allocates a new one if not found or not valid. In the need_lookup argument 1445 * allocates a new one if not found or not valid. In the need_lookup argument
1446 * returns whether i_op->lookup is necessary. 1446 * returns whether i_op->lookup is necessary.
1447 *
1448 * dir->d_inode->i_mutex must be held
1449 */ 1447 */
1450static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, 1448static struct dentry *lookup_dcache(const struct qstr *name,
1451 unsigned int flags, bool *need_lookup) 1449 struct dentry *dir,
1450 unsigned int flags)
1452{ 1451{
1453 struct dentry *dentry; 1452 struct dentry *dentry;
1454 int error; 1453 int error;
1455 1454
1456 *need_lookup = false;
1457 dentry = d_lookup(dir, name); 1455 dentry = d_lookup(dir, name);
1458 if (dentry) { 1456 if (dentry) {
1459 if (dentry->d_flags & DCACHE_OP_REVALIDATE) { 1457 if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1460 error = d_revalidate(dentry, flags); 1458 error = d_revalidate(dentry, flags);
1461 if (unlikely(error <= 0)) { 1459 if (unlikely(error <= 0)) {
1462 if (error < 0) { 1460 if (!error)
1463 dput(dentry);
1464 return ERR_PTR(error);
1465 } else {
1466 d_invalidate(dentry); 1461 d_invalidate(dentry);
1467 dput(dentry); 1462 dput(dentry);
1468 dentry = NULL; 1463 return ERR_PTR(error);
1469 }
1470 } 1464 }
1471 } 1465 }
1472 } 1466 }
1473
1474 if (!dentry) {
1475 dentry = d_alloc(dir, name);
1476 if (unlikely(!dentry))
1477 return ERR_PTR(-ENOMEM);
1478
1479 *need_lookup = true;
1480 }
1481 return dentry; 1467 return dentry;
1482} 1468}
1483 1469
@@ -1506,45 +1492,44 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1506 return dentry; 1492 return dentry;
1507} 1493}
1508 1494
1509static struct dentry *__lookup_hash(struct qstr *name, 1495static struct dentry *__lookup_hash(const struct qstr *name,
1510 struct dentry *base, unsigned int flags) 1496 struct dentry *base, unsigned int flags)
1511{ 1497{
1512 bool need_lookup; 1498 struct dentry *dentry = lookup_dcache(name, base, flags);
1513 struct dentry *dentry;
1514 1499
1515 dentry = lookup_dcache(name, base, flags, &need_lookup); 1500 if (dentry)
1516 if (!need_lookup)
1517 return dentry; 1501 return dentry;
1518 1502
1503 dentry = d_alloc(base, name);
1504 if (unlikely(!dentry))
1505 return ERR_PTR(-ENOMEM);
1506
1519 return lookup_real(base->d_inode, dentry, flags); 1507 return lookup_real(base->d_inode, dentry, flags);
1520} 1508}
1521 1509
1522/*
1523 * It's more convoluted than I'd like it to be, but... it's still fairly
1524 * small and for now I'd prefer to have fast path as straight as possible.
1525 * It _is_ time-critical.
1526 */
1527static int lookup_fast(struct nameidata *nd, 1510static int lookup_fast(struct nameidata *nd,
1528 struct path *path, struct inode **inode, 1511 struct path *path, struct inode **inode,
1529 unsigned *seqp) 1512 unsigned *seqp)
1530{ 1513{
1531 struct vfsmount *mnt = nd->path.mnt; 1514 struct vfsmount *mnt = nd->path.mnt;
1532 struct dentry *dentry, *parent = nd->path.dentry; 1515 struct dentry *dentry, *parent = nd->path.dentry;
1533 int need_reval = 1;
1534 int status = 1; 1516 int status = 1;
1535 int err; 1517 int err;
1536 1518
1537 /* 1519 /*
1538 * Rename seqlock is not required here because in the off chance 1520 * Rename seqlock is not required here because in the off chance
1539 * of a false negative due to a concurrent rename, we're going to 1521 * of a false negative due to a concurrent rename, the caller is
1540 * do the non-racy lookup, below. 1522 * going to fall back to non-racy lookup.
1541 */ 1523 */
1542 if (nd->flags & LOOKUP_RCU) { 1524 if (nd->flags & LOOKUP_RCU) {
1543 unsigned seq; 1525 unsigned seq;
1544 bool negative; 1526 bool negative;
1545 dentry = __d_lookup_rcu(parent, &nd->last, &seq); 1527 dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1546 if (!dentry) 1528 if (unlikely(!dentry)) {
1547 goto unlazy; 1529 if (unlazy_walk(nd, NULL, 0))
1530 return -ECHILD;
1531 return 0;
1532 }
1548 1533
1549 /* 1534 /*
1550 * This sequence count validates that the inode matches 1535 * This sequence count validates that the inode matches
@@ -1552,7 +1537,7 @@ static int lookup_fast(struct nameidata *nd,
1552 */ 1537 */
1553 *inode = d_backing_inode(dentry); 1538 *inode = d_backing_inode(dentry);
1554 negative = d_is_negative(dentry); 1539 negative = d_is_negative(dentry);
1555 if (read_seqcount_retry(&dentry->d_seq, seq)) 1540 if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1556 return -ECHILD; 1541 return -ECHILD;
1557 1542
1558 /* 1543 /*
@@ -1562,81 +1547,89 @@ static int lookup_fast(struct nameidata *nd,
1562 * The memory barrier in read_seqcount_begin of child is 1547 * The memory barrier in read_seqcount_begin of child is
1563 * enough, we can use __read_seqcount_retry here. 1548 * enough, we can use __read_seqcount_retry here.
1564 */ 1549 */
1565 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1550 if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
1566 return -ECHILD; 1551 return -ECHILD;
1567 1552
1568 *seqp = seq; 1553 *seqp = seq;
1569 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1554 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
1570 status = d_revalidate(dentry, nd->flags); 1555 status = d_revalidate(dentry, nd->flags);
1571 if (unlikely(status <= 0)) { 1556 if (unlikely(status <= 0)) {
1572 if (status != -ECHILD) 1557 if (unlazy_walk(nd, dentry, seq))
1573 need_reval = 0; 1558 return -ECHILD;
1574 goto unlazy; 1559 if (status == -ECHILD)
1575 } 1560 status = d_revalidate(dentry, nd->flags);
1561 } else {
1562 /*
1563 * Note: do negative dentry check after revalidation in
1564 * case that drops it.
1565 */
1566 if (unlikely(negative))
1567 return -ENOENT;
1568 path->mnt = mnt;
1569 path->dentry = dentry;
1570 if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1571 return 1;
1572 if (unlazy_walk(nd, dentry, seq))
1573 return -ECHILD;
1576 } 1574 }
1577 /*
1578 * Note: do negative dentry check after revalidation in
1579 * case that drops it.
1580 */
1581 if (negative)
1582 return -ENOENT;
1583 path->mnt = mnt;
1584 path->dentry = dentry;
1585 if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1586 return 0;
1587unlazy:
1588 if (unlazy_walk(nd, dentry, seq))
1589 return -ECHILD;
1590 } else { 1575 } else {
1591 dentry = __d_lookup(parent, &nd->last); 1576 dentry = __d_lookup(parent, &nd->last);
1577 if (unlikely(!dentry))
1578 return 0;
1579 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
1580 status = d_revalidate(dentry, nd->flags);
1592 } 1581 }
1593
1594 if (unlikely(!dentry))
1595 goto need_lookup;
1596
1597 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1598 status = d_revalidate(dentry, nd->flags);
1599 if (unlikely(status <= 0)) { 1582 if (unlikely(status <= 0)) {
1600 if (status < 0) { 1583 if (!status)
1601 dput(dentry); 1584 d_invalidate(dentry);
1602 return status;
1603 }
1604 d_invalidate(dentry);
1605 dput(dentry); 1585 dput(dentry);
1606 goto need_lookup; 1586 return status;
1607 } 1587 }
1608
1609 if (unlikely(d_is_negative(dentry))) { 1588 if (unlikely(d_is_negative(dentry))) {
1610 dput(dentry); 1589 dput(dentry);
1611 return -ENOENT; 1590 return -ENOENT;
1612 } 1591 }
1592
1613 path->mnt = mnt; 1593 path->mnt = mnt;
1614 path->dentry = dentry; 1594 path->dentry = dentry;
1615 err = follow_managed(path, nd); 1595 err = follow_managed(path, nd);
1616 if (likely(!err)) 1596 if (likely(err > 0))
1617 *inode = d_backing_inode(path->dentry); 1597 *inode = d_backing_inode(path->dentry);
1618 return err; 1598 return err;
1619
1620need_lookup:
1621 return 1;
1622} 1599}
1623 1600
1624/* Fast lookup failed, do it the slow way */ 1601/* Fast lookup failed, do it the slow way */
1625static int lookup_slow(struct nameidata *nd, struct path *path) 1602static struct dentry *lookup_slow(const struct qstr *name,
1603 struct dentry *dir,
1604 unsigned int flags)
1626{ 1605{
1627 struct dentry *dentry, *parent; 1606 struct dentry *dentry;
1628 1607 inode_lock(dir->d_inode);
1629 parent = nd->path.dentry; 1608 dentry = d_lookup(dir, name);
1630 BUG_ON(nd->inode != parent->d_inode); 1609 if (unlikely(dentry)) {
1631 1610 if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
1632 inode_lock(parent->d_inode); 1611 !(flags & LOOKUP_NO_REVAL)) {
1633 dentry = __lookup_hash(&nd->last, parent, nd->flags); 1612 int error = d_revalidate(dentry, flags);
1634 inode_unlock(parent->d_inode); 1613 if (unlikely(error <= 0)) {
1635 if (IS_ERR(dentry)) 1614 if (!error)
1636 return PTR_ERR(dentry); 1615 d_invalidate(dentry);
1637 path->mnt = nd->path.mnt; 1616 dput(dentry);
1638 path->dentry = dentry; 1617 dentry = ERR_PTR(error);
1639 return follow_managed(path, nd); 1618 }
1619 }
1620 if (dentry) {
1621 inode_unlock(dir->d_inode);
1622 return dentry;
1623 }
1624 }
1625 dentry = d_alloc(dir, name);
1626 if (unlikely(!dentry)) {
1627 inode_unlock(dir->d_inode);
1628 return ERR_PTR(-ENOMEM);
1629 }
1630 dentry = lookup_real(dir->d_inode, dentry, flags);
1631 inode_unlock(dir->d_inode);
1632 return dentry;
1640} 1633}
1641 1634
1642static inline int may_lookup(struct nameidata *nd) 1635static inline int may_lookup(struct nameidata *nd)
@@ -1740,18 +1733,25 @@ static int walk_component(struct nameidata *nd, int flags)
1740 return err; 1733 return err;
1741 } 1734 }
1742 err = lookup_fast(nd, &path, &inode, &seq); 1735 err = lookup_fast(nd, &path, &inode, &seq);
1743 if (unlikely(err)) { 1736 if (unlikely(err <= 0)) {
1744 if (err < 0) 1737 if (err < 0)
1745 return err; 1738 return err;
1746 1739 path.dentry = lookup_slow(&nd->last, nd->path.dentry,
1747 err = lookup_slow(nd, &path); 1740 nd->flags);
1748 if (err < 0) 1741 if (IS_ERR(path.dentry))
1742 return PTR_ERR(path.dentry);
1743
1744 path.mnt = nd->path.mnt;
1745 err = follow_managed(&path, nd);
1746 if (unlikely(err < 0))
1749 return err; 1747 return err;
1750 1748
1749 if (unlikely(d_is_negative(path.dentry))) {
1750 path_to_nameidata(&path, nd);
1751 return -ENOENT;
1752 }
1753
1751 seq = 0; /* we are already out of RCU mode */ 1754 seq = 0; /* we are already out of RCU mode */
1752 err = -ENOENT;
1753 if (d_is_negative(path.dentry))
1754 goto out_path_put;
1755 inode = d_backing_inode(path.dentry); 1755 inode = d_backing_inode(path.dentry);
1756 } 1756 }
1757 1757
@@ -1764,10 +1764,6 @@ static int walk_component(struct nameidata *nd, int flags)
1764 nd->inode = inode; 1764 nd->inode = inode;
1765 nd->seq = seq; 1765 nd->seq = seq;
1766 return 0; 1766 return 0;
1767
1768out_path_put:
1769 path_to_nameidata(&path, nd);
1770 return err;
1771} 1767}
1772 1768
1773/* 1769/*
@@ -2373,21 +2369,9 @@ struct dentry *lookup_one_len_unlocked(const char *name,
2373 if (err) 2369 if (err)
2374 return ERR_PTR(err); 2370 return ERR_PTR(err);
2375 2371
2376 /* 2372 ret = lookup_dcache(&this, base, 0);
2377 * __d_lookup() is used to try to get a quick answer and avoid the 2373 if (!ret)
2378 * mutex. A false-negative does no harm. 2374 ret = lookup_slow(&this, base, 0);
2379 */
2380 ret = __d_lookup(base, &this);
2381 if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) {
2382 dput(ret);
2383 ret = NULL;
2384 }
2385 if (ret)
2386 return ret;
2387
2388 inode_lock(base->d_inode);
2389 ret = __lookup_hash(&this, base, 0);
2390 inode_unlock(base->d_inode);
2391 return ret; 2375 return ret;
2392} 2376}
2393EXPORT_SYMBOL(lookup_one_len_unlocked); 2377EXPORT_SYMBOL(lookup_one_len_unlocked);
@@ -2465,31 +2449,21 @@ mountpoint_last(struct nameidata *nd, struct path *path)
2465 if (error) 2449 if (error)
2466 return error; 2450 return error;
2467 dentry = dget(nd->path.dentry); 2451 dentry = dget(nd->path.dentry);
2468 goto done; 2452 } else {
2469 } 2453 dentry = d_lookup(dir, &nd->last);
2470
2471 inode_lock(dir->d_inode);
2472 dentry = d_lookup(dir, &nd->last);
2473 if (!dentry) {
2474 /*
2475 * No cached dentry. Mounted dentries are pinned in the cache,
2476 * so that means that this dentry is probably a symlink or the
2477 * path doesn't actually point to a mounted dentry.
2478 */
2479 dentry = d_alloc(dir, &nd->last);
2480 if (!dentry) { 2454 if (!dentry) {
2481 inode_unlock(dir->d_inode); 2455 /*
2482 return -ENOMEM; 2456 * No cached dentry. Mounted dentries are pinned in the
2483 } 2457 * cache, so that means that this dentry is probably
2484 dentry = lookup_real(dir->d_inode, dentry, nd->flags); 2458 * a symlink or the path doesn't actually point
2485 if (IS_ERR(dentry)) { 2459 * to a mounted dentry.
2486 inode_unlock(dir->d_inode); 2460 */
2487 return PTR_ERR(dentry); 2461 dentry = lookup_slow(&nd->last, dir,
2462 nd->flags | LOOKUP_NO_REVAL);
2463 if (IS_ERR(dentry))
2464 return PTR_ERR(dentry);
2488 } 2465 }
2489 } 2466 }
2490 inode_unlock(dir->d_inode);
2491
2492done:
2493 if (d_is_negative(dentry)) { 2467 if (d_is_negative(dentry)) {
2494 dput(dentry); 2468 dput(dentry);
2495 return -ENOENT; 2469 return -ENOENT;
@@ -3018,16 +2992,22 @@ static int lookup_open(struct nameidata *nd, struct path *path,
3018 struct inode *dir_inode = dir->d_inode; 2992 struct inode *dir_inode = dir->d_inode;
3019 struct dentry *dentry; 2993 struct dentry *dentry;
3020 int error; 2994 int error;
3021 bool need_lookup; 2995 bool need_lookup = false;
3022 2996
3023 *opened &= ~FILE_CREATED; 2997 *opened &= ~FILE_CREATED;
3024 dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup); 2998 dentry = lookup_dcache(&nd->last, dir, nd->flags);
3025 if (IS_ERR(dentry)) 2999 if (IS_ERR(dentry))
3026 return PTR_ERR(dentry); 3000 return PTR_ERR(dentry);
3027 3001
3028 /* Cached positive dentry: will open in f_op->open */ 3002 if (!dentry) {
3029 if (!need_lookup && dentry->d_inode) 3003 dentry = d_alloc(dir, &nd->last);
3004 if (unlikely(!dentry))
3005 return -ENOMEM;
3006 need_lookup = true;
3007 } else if (dentry->d_inode) {
3008 /* Cached positive dentry: will open in f_op->open */
3030 goto out_no_open; 3009 goto out_no_open;
3010 }
3031 3011
3032 if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { 3012 if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
3033 return atomic_open(nd, dentry, path, file, op, got_write, 3013 return atomic_open(nd, dentry, path, file, op, got_write,
@@ -3111,13 +3091,14 @@ static int do_last(struct nameidata *nd,
3111 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 3091 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
3112 /* we _can_ be in RCU mode here */ 3092 /* we _can_ be in RCU mode here */
3113 error = lookup_fast(nd, &path, &inode, &seq); 3093 error = lookup_fast(nd, &path, &inode, &seq);
3114 if (likely(!error)) 3094 if (likely(error > 0))
3115 goto finish_lookup; 3095 goto finish_lookup;
3116 3096
3117 if (error < 0) 3097 if (error < 0)
3118 return error; 3098 return error;
3119 3099
3120 BUG_ON(nd->inode != dir->d_inode); 3100 BUG_ON(nd->inode != dir->d_inode);
3101 BUG_ON(nd->flags & LOOKUP_RCU);
3121 } else { 3102 } else {
3122 /* create side of things */ 3103 /* create side of things */
3123 /* 3104 /*
@@ -3172,12 +3153,6 @@ retry_lookup:
3172 } 3153 }
3173 3154
3174 /* 3155 /*
3175 * create/update audit record if it already exists.
3176 */
3177 if (d_is_positive(path.dentry))
3178 audit_inode(nd->name, path.dentry, 0);
3179
3180 /*
3181 * If atomic_open() acquired write access it is dropped now due to 3156 * If atomic_open() acquired write access it is dropped now due to
3182 * possible mount and symlink following (this might be optimized away if 3157 * possible mount and symlink following (this might be optimized away if
3183 * necessary...) 3158 * necessary...)
@@ -3187,6 +3162,16 @@ retry_lookup:
3187 got_write = false; 3162 got_write = false;
3188 } 3163 }
3189 3164
3165 if (unlikely(d_is_negative(path.dentry))) {
3166 path_to_nameidata(&path, nd);
3167 return -ENOENT;
3168 }
3169
3170 /*
3171 * create/update audit record if it already exists.
3172 */
3173 audit_inode(nd->name, path.dentry, 0);
3174
3190 if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { 3175 if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
3191 path_to_nameidata(&path, nd); 3176 path_to_nameidata(&path, nd);
3192 return -EEXIST; 3177 return -EEXIST;
@@ -3196,12 +3181,7 @@ retry_lookup:
3196 if (unlikely(error < 0)) 3181 if (unlikely(error < 0))
3197 return error; 3182 return error;
3198 3183
3199 BUG_ON(nd->flags & LOOKUP_RCU);
3200 seq = 0; /* out of RCU mode, so the value doesn't matter */ 3184 seq = 0; /* out of RCU mode, so the value doesn't matter */
3201 if (unlikely(d_is_negative(path.dentry))) {
3202 path_to_nameidata(&path, nd);
3203 return -ENOENT;
3204 }
3205 inode = d_backing_inode(path.dentry); 3185 inode = d_backing_inode(path.dentry);
3206finish_lookup: 3186finish_lookup:
3207 if (nd->depth) 3187 if (nd->depth)
@@ -3707,31 +3687,6 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
3707 return sys_mkdirat(AT_FDCWD, pathname, mode); 3687 return sys_mkdirat(AT_FDCWD, pathname, mode);
3708} 3688}
3709 3689
3710/*
3711 * The dentry_unhash() helper will try to drop the dentry early: we
3712 * should have a usage count of 1 if we're the only user of this
3713 * dentry, and if that is true (possibly after pruning the dcache),
3714 * then we drop the dentry now.
3715 *
3716 * A low-level filesystem can, if it chooses, legally
3717 * do a
3718 *
3719 * if (!d_unhashed(dentry))
3720 * return -EBUSY;
3721 *
3722 * if it cannot handle the case of removing a directory
3723 * that is still in use by something else..
3724 */
3725void dentry_unhash(struct dentry *dentry)
3726{
3727 shrink_dcache_parent(dentry);
3728 spin_lock(&dentry->d_lock);
3729 if (dentry->d_lockref.count == 1)
3730 __d_drop(dentry);
3731 spin_unlock(&dentry->d_lock);
3732}
3733EXPORT_SYMBOL(dentry_unhash);
3734
3735int vfs_rmdir(struct inode *dir, struct dentry *dentry) 3690int vfs_rmdir(struct inode *dir, struct dentry *dentry)
3736{ 3691{
3737 int error = may_delete(dir, dentry, 1); 3692 int error = may_delete(dir, dentry, 1);
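Taken together, the namei.c hunks above replace the open-coded __d_lookup()/__lookup_hash() sequence with a uniform two-step: consult the dcache, and only fall back to the slow on-disk lookup on a miss. A standalone C sketch of that control flow — the struct definitions and the two lookup stubs are illustrative placeholders, not the kernel API:

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for kernel types; purely illustrative. */
struct dentry { int positive; };
struct qstr { const char *name; };

/* Hypothetical stubs modeling the two lookup stages. */
static struct dentry *lookup_dcache_stub(struct qstr *q, struct dentry *base)
{
	(void)q; (void)base;
	return NULL;		/* simulate a dcache miss */
}

static struct dentry *lookup_slow_stub(struct qstr *q, struct dentry *base)
{
	static struct dentry found = { .positive = 1 };
	(void)q; (void)base;
	return &found;		/* simulate an on-disk hit */
}

/* Mirrors the new lookup_one_len_unlocked() tail: try the cache,
 * fall back to the slow path only on a miss. */
static struct dentry *lookup_one(struct qstr *q, struct dentry *base)
{
	struct dentry *ret = lookup_dcache_stub(q, base);
	if (!ret)
		ret = lookup_slow_stub(q, base);
	return ret;
}

int main(void)
{
	struct qstr name = { "example" };
	struct dentry base = { 1 };
	printf("positive=%d\n", lookup_one(&name, &base)->positive);
	return 0;
}

The same cache-then-slow shape now backs lookup_one_len_unlocked(), mountpoint_last(), and lookup_open() in the hunks above.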
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index ddd0138f410c..02e4d87d2ed3 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
446 kfree(bl); 446 kfree(bl);
447} 447}
448 448
449static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, 449static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
450 gfp_t gfp_flags) 450 gfp_t gfp_flags, bool is_scsi_layout)
451{ 451{
452 struct pnfs_block_layout *bl; 452 struct pnfs_block_layout *bl;
453 453
@@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
460 bl->bl_ext_ro = RB_ROOT; 460 bl->bl_ext_ro = RB_ROOT;
461 spin_lock_init(&bl->bl_ext_lock); 461 spin_lock_init(&bl->bl_ext_lock);
462 462
463 bl->bl_scsi_layout = is_scsi_layout;
463 return &bl->bl_layout; 464 return &bl->bl_layout;
464} 465}
465 466
467static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
468 gfp_t gfp_flags)
469{
470 return __bl_alloc_layout_hdr(inode, gfp_flags, false);
471}
472
473static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
474 gfp_t gfp_flags)
475{
476 return __bl_alloc_layout_hdr(inode, gfp_flags, true);
477}
478
466static void bl_free_lseg(struct pnfs_layout_segment *lseg) 479static void bl_free_lseg(struct pnfs_layout_segment *lseg)
467{ 480{
468 dprintk("%s enter\n", __func__); 481 dprintk("%s enter\n", __func__);
@@ -743,7 +756,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
743 756
744static bool 757static bool
745is_aligned_req(struct nfs_pageio_descriptor *pgio, 758is_aligned_req(struct nfs_pageio_descriptor *pgio,
746 struct nfs_page *req, unsigned int alignment) 759 struct nfs_page *req, unsigned int alignment, bool is_write)
747{ 760{
748 /* 761 /*
749 * Always accept buffered writes, higher layers take care of the 762 * Always accept buffered writes, higher layers take care of the
@@ -758,7 +771,8 @@ is_aligned_req(struct nfs_pageio_descriptor *pgio,
758 if (IS_ALIGNED(req->wb_bytes, alignment)) 771 if (IS_ALIGNED(req->wb_bytes, alignment))
759 return true; 772 return true;
760 773
761 if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) { 774 if (is_write &&
775 (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode))) {
762 /* 776 /*
763 * If the write goes up to the inode size, just write 777 * If the write goes up to the inode size, just write
764 * the full page. Data past the inode size is 778 * the full page. Data past the inode size is
@@ -775,7 +789,7 @@ is_aligned_req(struct nfs_pageio_descriptor *pgio,
775static void 789static void
776bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 790bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
777{ 791{
778 if (!is_aligned_req(pgio, req, SECTOR_SIZE)) { 792 if (!is_aligned_req(pgio, req, SECTOR_SIZE, false)) {
779 nfs_pageio_reset_read_mds(pgio); 793 nfs_pageio_reset_read_mds(pgio);
780 return; 794 return;
781 } 795 }
@@ -791,7 +805,7 @@ static size_t
791bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 805bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
792 struct nfs_page *req) 806 struct nfs_page *req)
793{ 807{
794 if (!is_aligned_req(pgio, req, SECTOR_SIZE)) 808 if (!is_aligned_req(pgio, req, SECTOR_SIZE, false))
795 return 0; 809 return 0;
796 return pnfs_generic_pg_test(pgio, prev, req); 810 return pnfs_generic_pg_test(pgio, prev, req);
797} 811}
@@ -824,7 +838,7 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
824{ 838{
825 u64 wb_size; 839 u64 wb_size;
826 840
827 if (!is_aligned_req(pgio, req, PAGE_SIZE)) { 841 if (!is_aligned_req(pgio, req, PAGE_SIZE, true)) {
828 nfs_pageio_reset_write_mds(pgio); 842 nfs_pageio_reset_write_mds(pgio);
829 return; 843 return;
830 } 844 }
@@ -846,7 +860,7 @@ static size_t
846bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 860bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
847 struct nfs_page *req) 861 struct nfs_page *req)
848{ 862{
849 if (!is_aligned_req(pgio, req, PAGE_SIZE)) 863 if (!is_aligned_req(pgio, req, PAGE_SIZE, true))
850 return 0; 864 return 0;
851 return pnfs_generic_pg_test(pgio, prev, req); 865 return pnfs_generic_pg_test(pgio, prev, req);
852} 866}
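The new is_write flag threaded through is_aligned_req() above restricts the end-of-file exception to writes: a short write that lands exactly on i_size may be rounded up to a full page, while reads never get that pass. A self-contained model of the predicate (plain byte offsets here, not the kernel's nfs_page/sector types):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct req { uint64_t off, len; };

static bool is_aligned(const struct req *r, uint64_t align,
		       uint64_t i_size, bool is_write)
{
	if (r->off % align)
		return false;
	if (r->len % align == 0)
		return true;
	/* Only writes that end exactly at EOF get the full-page pass;
	 * the data past i_size is zeroed anyway. */
	return is_write && r->off + r->len == i_size;
}

int main(void)
{
	struct req r = { .off = 4096, .len = 100 };

	printf("write@eof=%d read@eof=%d\n",
	       is_aligned(&r, 4096, 4196, true),
	       is_aligned(&r, 4096, 4196, false));
	return 0;
}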
@@ -888,22 +902,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
888 .sync = pnfs_generic_sync, 902 .sync = pnfs_generic_sync,
889}; 903};
890 904
905static struct pnfs_layoutdriver_type scsilayout_type = {
906 .id = LAYOUT_SCSI,
907 .name = "LAYOUT_SCSI",
908 .owner = THIS_MODULE,
909 .flags = PNFS_LAYOUTRET_ON_SETATTR |
910 PNFS_READ_WHOLE_PAGE,
911 .read_pagelist = bl_read_pagelist,
912 .write_pagelist = bl_write_pagelist,
913 .alloc_layout_hdr = sl_alloc_layout_hdr,
914 .free_layout_hdr = bl_free_layout_hdr,
915 .alloc_lseg = bl_alloc_lseg,
916 .free_lseg = bl_free_lseg,
917 .return_range = bl_return_range,
918 .prepare_layoutcommit = bl_prepare_layoutcommit,
919 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
920 .set_layoutdriver = bl_set_layoutdriver,
921 .alloc_deviceid_node = bl_alloc_deviceid_node,
922 .free_deviceid_node = bl_free_deviceid_node,
923 .pg_read_ops = &bl_pg_read_ops,
924 .pg_write_ops = &bl_pg_write_ops,
925 .sync = pnfs_generic_sync,
926};
927
928
891static int __init nfs4blocklayout_init(void) 929static int __init nfs4blocklayout_init(void)
892{ 930{
893 int ret; 931 int ret;
894 932
895 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); 933 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
896 934
897 ret = pnfs_register_layoutdriver(&blocklayout_type); 935 ret = bl_init_pipefs();
898 if (ret) 936 if (ret)
899 goto out; 937 goto out;
900 ret = bl_init_pipefs(); 938
939 ret = pnfs_register_layoutdriver(&blocklayout_type);
901 if (ret) 940 if (ret)
902 goto out_unregister; 941 goto out_cleanup_pipe;
942
943 ret = pnfs_register_layoutdriver(&scsilayout_type);
944 if (ret)
945 goto out_unregister_block;
903 return 0; 946 return 0;
904 947
905out_unregister: 948out_unregister_block:
906 pnfs_unregister_layoutdriver(&blocklayout_type); 949 pnfs_unregister_layoutdriver(&blocklayout_type);
950out_cleanup_pipe:
951 bl_cleanup_pipefs();
907out: 952out:
908 return ret; 953 return ret;
909} 954}
@@ -913,8 +958,9 @@ static void __exit nfs4blocklayout_exit(void)
913 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", 958 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
914 __func__); 959 __func__);
915 960
916 bl_cleanup_pipefs(); 961 pnfs_unregister_layoutdriver(&scsilayout_type);
917 pnfs_unregister_layoutdriver(&blocklayout_type); 962 pnfs_unregister_layoutdriver(&blocklayout_type);
963 bl_cleanup_pipefs();
918} 964}
919 965
920MODULE_ALIAS("nfs-layouttype4-3"); 966MODULE_ALIAS("nfs-layouttype4-3");
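The reordered nfs4blocklayout_init() above keeps the usual inverse-order unwind: pipefs first, then the block driver, then the SCSI driver, with each failure label undoing exactly what succeeded before it. A compilable skeleton of that shape, with three stubs standing in for bl_init_pipefs() and the two pnfs_register_layoutdriver() calls:

#include <stdio.h>

static int reg_pipefs(void)	{ return 0; }
static int reg_block(void)	{ return 0; }
static int reg_scsi(void)	{ return 0; }
static void unreg_block(void)	{ puts("unregister block"); }
static void clean_pipefs(void)	{ puts("cleanup pipefs"); }

static int init_sketch(void)
{
	int ret = reg_pipefs();
	if (ret)
		goto out;
	ret = reg_block();
	if (ret)
		goto out_cleanup_pipe;
	ret = reg_scsi();
	if (ret)
		goto out_unregister_block;
	return 0;

out_unregister_block:
	unreg_block();
out_cleanup_pipe:
	clean_pipefs();
out:
	return ret;
}

int main(void) { return init_sketch(); }

The exit path above mirrors this in reverse: SCSI driver, block driver, then pipefs.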
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index c556640dcf3b..bc21205309e0 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -55,7 +55,6 @@ struct pnfs_block_dev;
55 */ 55 */
56#define PNFS_BLOCK_UUID_LEN 128 56#define PNFS_BLOCK_UUID_LEN 128
57 57
58
59struct pnfs_block_volume { 58struct pnfs_block_volume {
60 enum pnfs_block_volume_type type; 59 enum pnfs_block_volume_type type;
61 union { 60 union {
@@ -82,6 +81,13 @@ struct pnfs_block_volume {
82 u32 volumes_count; 81 u32 volumes_count;
83 u32 volumes[PNFS_BLOCK_MAX_DEVICES]; 82 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
84 } stripe; 83 } stripe;
84 struct {
85 enum scsi_code_set code_set;
86 enum scsi_designator_type designator_type;
87 int designator_len;
88 u8 designator[256];
89 u64 pr_key;
90 } scsi;
85 }; 91 };
86}; 92};
87 93
@@ -106,6 +112,9 @@ struct pnfs_block_dev {
106 struct block_device *bdev; 112 struct block_device *bdev;
107 u64 disk_offset; 113 u64 disk_offset;
108 114
115 u64 pr_key;
116 bool pr_registered;
117
109 bool (*map)(struct pnfs_block_dev *dev, u64 offset, 118 bool (*map)(struct pnfs_block_dev *dev, u64 offset,
110 struct pnfs_block_dev_map *map); 119 struct pnfs_block_dev_map *map);
111}; 120};
@@ -131,6 +140,7 @@ struct pnfs_block_layout {
131 struct rb_root bl_ext_rw; 140 struct rb_root bl_ext_rw;
132 struct rb_root bl_ext_ro; 141 struct rb_root bl_ext_ro;
133 spinlock_t bl_ext_lock; /* Protects list manipulation */ 142 spinlock_t bl_ext_lock; /* Protects list manipulation */
143 bool bl_scsi_layout;
134}; 144};
135 145
136static inline struct pnfs_block_layout * 146static inline struct pnfs_block_layout *
@@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
182dev_t bl_resolve_deviceid(struct nfs_server *server, 192dev_t bl_resolve_deviceid(struct nfs_server *server,
183 struct pnfs_block_volume *b, gfp_t gfp_mask); 193 struct pnfs_block_volume *b, gfp_t gfp_mask);
184int __init bl_init_pipefs(void); 194int __init bl_init_pipefs(void);
185void __exit bl_cleanup_pipefs(void); 195void bl_cleanup_pipefs(void);
186 196
187#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ 197#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index a861bbdfe577..e5b89675263e 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -1,11 +1,12 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014-2016 Christoph Hellwig.
3 */ 3 */
4#include <linux/sunrpc/svc.h> 4#include <linux/sunrpc/svc.h>
5#include <linux/blkdev.h> 5#include <linux/blkdev.h>
6#include <linux/nfs4.h> 6#include <linux/nfs4.h>
7#include <linux/nfs_fs.h> 7#include <linux/nfs_fs.h>
8#include <linux/nfs_xdr.h> 8#include <linux/nfs_xdr.h>
9#include <linux/pr.h>
9 10
10#include "blocklayout.h" 11#include "blocklayout.h"
11 12
@@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
21 bl_free_device(&dev->children[i]); 22 bl_free_device(&dev->children[i]);
22 kfree(dev->children); 23 kfree(dev->children);
23 } else { 24 } else {
25 if (dev->pr_registered) {
26 const struct pr_ops *ops =
27 dev->bdev->bd_disk->fops->pr_ops;
28 int error;
29
30 error = ops->pr_register(dev->bdev, dev->pr_key, 0,
31 false);
32 if (error)
33 pr_err("failed to unregister PR key.\n");
34 }
35
24 if (dev->bdev) 36 if (dev->bdev)
25 blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); 37 blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
26 } 38 }
@@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
113 for (i = 0; i < b->stripe.volumes_count; i++) 125 for (i = 0; i < b->stripe.volumes_count; i++)
114 b->stripe.volumes[i] = be32_to_cpup(p++); 126 b->stripe.volumes[i] = be32_to_cpup(p++);
115 break; 127 break;
128 case PNFS_BLOCK_VOLUME_SCSI:
129 p = xdr_inline_decode(xdr, 4 + 4 + 4);
130 if (!p)
131 return -EIO;
132 b->scsi.code_set = be32_to_cpup(p++);
133 b->scsi.designator_type = be32_to_cpup(p++);
134 b->scsi.designator_len = be32_to_cpup(p++);
135 p = xdr_inline_decode(xdr, b->scsi.designator_len);
136 if (!p)
137 return -EIO;
138 if (b->scsi.designator_len > 256)
139 return -EIO;
140 memcpy(&b->scsi.designator, p, b->scsi.designator_len);
141 p = xdr_inline_decode(xdr, 8);
142 if (!p)
143 return -EIO;
144 p = xdr_decode_hyper(p, &b->scsi.pr_key);
145 break;
116 default: 146 default:
117 dprintk("unknown volume type!\n"); 147 dprintk("unknown volume type!\n");
118 return -EIO; 148 return -EIO;
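The PNFS_BLOCK_VOLUME_SCSI arm above pulls, in wire order: code_set (4 bytes), designator_type (4), designator_len (4), the designator bytes, then an 8-byte pr_key. A standalone big-endian decoder modeling that layout — here the length is validated before copying, and XDR padding is ignored for brevity:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct scsi_vol {
	uint32_t code_set, designator_type, designator_len;
	uint8_t  designator[256];
	uint64_t pr_key;
};

static uint32_t be32(const uint8_t *p)
{
	return (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
}

static uint64_t be64(const uint8_t *p)
{
	return (uint64_t)be32(p) << 32 | be32(p + 4);
}

/* Returns 0 on success, -1 on a malformed buffer. */
static int decode_scsi_volume(const uint8_t *p, size_t len, struct scsi_vol *v)
{
	if (len < 12)
		return -1;
	v->code_set        = be32(p);
	v->designator_type = be32(p + 4);
	v->designator_len  = be32(p + 8);
	if (v->designator_len > sizeof(v->designator) ||
	    len < 12 + v->designator_len + 8)
		return -1;
	memcpy(v->designator, p + 12, v->designator_len);
	v->pr_key = be64(p + 12 + v->designator_len);
	return 0;
}

int main(void)
{
	uint8_t buf[12 + 8 + 8] = {0};
	struct scsi_vol v;

	buf[11] = 8;	/* designator_len = 8 */
	printf("decode=%d len=%u\n",
	       decode_scsi_volume(buf, sizeof(buf), &v), v.designator_len);
	return 0;
}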
@@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
216 return 0; 246 return 0;
217} 247}
218 248
249static bool
250bl_validate_designator(struct pnfs_block_volume *v)
251{
252 switch (v->scsi.designator_type) {
253 case PS_DESIGNATOR_EUI64:
254 if (v->scsi.code_set != PS_CODE_SET_BINARY)
255 return false;
256
257 if (v->scsi.designator_len != 8 &&
258 v->scsi.designator_len != 10 &&
259 v->scsi.designator_len != 16)
260 return false;
261
262 return true;
263 case PS_DESIGNATOR_NAA:
264 if (v->scsi.code_set != PS_CODE_SET_BINARY)
265 return false;
266
267 if (v->scsi.designator_len != 8 &&
268 v->scsi.designator_len != 16)
269 return false;
270
271 return true;
272 case PS_DESIGNATOR_T10:
273 case PS_DESIGNATOR_NAME:
274 pr_err("pNFS: unsupported designator "
275 "(code set %d, type %d, len %d.\n",
276 v->scsi.code_set,
277 v->scsi.designator_type,
278 v->scsi.designator_len);
279 return false;
280 default:
281 pr_err("pNFS: invalid designator "
282 "(code set %d, type %d, len %d.\n",
283 v->scsi.code_set,
284 v->scsi.designator_type,
285 v->scsi.designator_len);
286 return false;
287 }
288}
289
290static int
291bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
292 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
293{
294 struct pnfs_block_volume *v = &volumes[idx];
295 const struct pr_ops *ops;
296 const char *devname;
297 int error;
298
299 if (!bl_validate_designator(v))
300 return -EINVAL;
301
302 switch (v->scsi.designator_len) {
303 case 8:
304 devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
305 v->scsi.designator);
306 break;
307 case 12:
308 devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
309 v->scsi.designator);
310 break;
311 case 16:
312 devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
313 v->scsi.designator);
314 break;
315 default:
316 return -EINVAL;
317 }
318
319 d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
320 if (IS_ERR(d->bdev)) {
321 pr_warn("pNFS: failed to open device %s (%ld)\n",
322 devname, PTR_ERR(d->bdev));
323 kfree(devname);
324 return PTR_ERR(d->bdev);
325 }
326
327 kfree(devname);
328
329 d->len = i_size_read(d->bdev->bd_inode);
330 d->map = bl_map_simple;
331 d->pr_key = v->scsi.pr_key;
332
333 pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
334 d->bdev->bd_disk->disk_name, d->pr_key);
335
336 ops = d->bdev->bd_disk->fops->pr_ops;
337 if (!ops) {
338 pr_err("pNFS: block device %s does not support reservations.",
339 d->bdev->bd_disk->disk_name);
340 error = -EINVAL;
341 goto out_blkdev_put;
342 }
343
344 error = ops->pr_register(d->bdev, 0, d->pr_key, true);
345 if (error) {
346 pr_err("pNFS: failed to register key for block device %s.",
347 d->bdev->bd_disk->disk_name);
348 goto out_blkdev_put;
349 }
350
351 d->pr_registered = true;
352 return 0;
353
354out_blkdev_put:
355 blkdev_put(d->bdev, FMODE_READ);
356 return error;
357}
358
219static int 359static int
220bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d, 360bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
221 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) 361 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
@@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
303 return bl_parse_concat(server, d, volumes, idx, gfp_mask); 443 return bl_parse_concat(server, d, volumes, idx, gfp_mask);
304 case PNFS_BLOCK_VOLUME_STRIPE: 444 case PNFS_BLOCK_VOLUME_STRIPE:
305 return bl_parse_stripe(server, d, volumes, idx, gfp_mask); 445 return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
446 case PNFS_BLOCK_VOLUME_SCSI:
447 return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
306 default: 448 default:
307 dprintk("unsupported volume type: %d\n", volumes[idx].type); 449 dprintk("unsupported volume type: %d\n", volumes[idx].type);
308 return -EIO; 450 return -EIO;
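bl_validate_designator() above accepts only binary EUI-64 designators of 8, 10, or 16 bytes and binary NAA designators of 8 or 16 bytes; T10 and NAME designators are rejected. The same rules as a standalone predicate (the enum values are made-up stand-ins for the PS_* constants, which are defined outside this section):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the PS_* constants used above. */
enum code_set { CODE_SET_BINARY = 1 };
enum desig_type { DESIG_T10 = 1, DESIG_EUI64 = 2, DESIG_NAA = 3, DESIG_NAME = 8 };

static bool validate_designator(enum code_set cs, enum desig_type t, int len)
{
	switch (t) {
	case DESIG_EUI64:
		return cs == CODE_SET_BINARY &&
		       (len == 8 || len == 10 || len == 16);
	case DESIG_NAA:
		return cs == CODE_SET_BINARY && (len == 8 || len == 16);
	default:	/* T10 and NAME designators are unsupported */
		return false;
	}
}

int main(void)
{
	printf("eui64/10=%d naa/10=%d\n",
	       validate_designator(CODE_SET_BINARY, DESIG_EUI64, 10),
	       validate_designator(CODE_SET_BINARY, DESIG_NAA, 10));
	return 0;
}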
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 35ab51c04814..720b3ff55fa9 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014-2016 Christoph Hellwig.
3 */ 3 */
4 4
5#include <linux/vmalloc.h> 5#include <linux/vmalloc.h>
@@ -462,10 +462,12 @@ out:
462 return err; 462 return err;
463} 463}
464 464
465static size_t ext_tree_layoutupdate_size(size_t count) 465static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
466{ 466{
467 return sizeof(__be32) /* number of entries */ + 467 if (bl->bl_scsi_layout)
468 PNFS_BLOCK_EXTENT_SIZE * count; 468 return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
469 else
470 return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
469} 471}
470 472
471static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, 473static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
@@ -483,6 +485,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
483 } 485 }
484} 486}
485 487
488static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
489{
490 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
491 NFS4_DEVICEID4_SIZE);
492 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
493 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
494 p = xdr_encode_hyper(p, 0LL);
495 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
496 return p;
497}
498
499static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
500{
501 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
502 return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
503}
504
486static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, 505static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
487 size_t buffer_size, size_t *count) 506 size_t buffer_size, size_t *count)
488{ 507{
@@ -496,19 +515,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
496 continue; 515 continue;
497 516
498 (*count)++; 517 (*count)++;
499 if (ext_tree_layoutupdate_size(*count) > buffer_size) { 518 if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
500 /* keep counting.. */ 519 /* keep counting.. */
501 ret = -ENOSPC; 520 ret = -ENOSPC;
502 continue; 521 continue;
503 } 522 }
504 523
505 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, 524 if (bl->bl_scsi_layout)
506 NFS4_DEVICEID4_SIZE); 525 p = encode_scsi_range(be, p);
507 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); 526 else
508 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); 527 p = encode_block_extent(be, p);
509 p = xdr_encode_hyper(p, 0LL);
510 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
511
512 be->be_tag = EXTENT_COMMITTING; 528 be->be_tag = EXTENT_COMMITTING;
513 } 529 }
514 spin_unlock(&bl->bl_ext_lock); 530 spin_unlock(&bl->bl_ext_lock);
@@ -537,7 +553,7 @@ retry:
537 if (unlikely(ret)) { 553 if (unlikely(ret)) {
538 ext_tree_free_commitdata(arg, buffer_size); 554 ext_tree_free_commitdata(arg, buffer_size);
539 555
540 buffer_size = ext_tree_layoutupdate_size(count); 556 buffer_size = ext_tree_layoutupdate_size(bl, count);
541 count = 0; 557 count = 0;
542 558
543 arg->layoutupdate_pages = 559 arg->layoutupdate_pages =
@@ -556,7 +572,7 @@ retry:
556 } 572 }
557 573
558 *start_p = cpu_to_be32(count); 574 *start_p = cpu_to_be32(count);
559 arg->layoutupdate_len = ext_tree_layoutupdate_size(count); 575 arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
560 576
561 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { 577 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
562 void *p = start_p, *end = p + arg->layoutupdate_len; 578 void *p = start_p, *end = p + arg->layoutupdate_len;
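ext_tree_layoutupdate_size() above prices the LAYOUTCOMMIT payload as a 4-byte entry count plus one fixed-size record per committed extent, and SCSI records carry only offset and length. A sketch of the arithmetic; the two record sizes are inferred from the encoders above (16 bytes for a SCSI range, deviceid plus three hypers plus a state word for a block extent) rather than taken from the headers:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SCSI_RANGE_SIZE   (8 + 8)               /* offset + length */
#define BLOCK_EXTENT_SIZE (16 + 8 + 8 + 8 + 4)  /* deviceid + 3 hypers + state */

static size_t layoutupdate_size(int scsi_layout, size_t count)
{
	size_t per_entry = scsi_layout ? SCSI_RANGE_SIZE : BLOCK_EXTENT_SIZE;
	return sizeof(uint32_t) /* entry count */ + per_entry * count;
}

int main(void)
{
	printf("block x4: %zu bytes, scsi x4: %zu bytes\n",
	       layoutupdate_size(0, 4), layoutupdate_size(1, 4));
	return 0;
}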
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index dbe5839cdeba..9fb067a6f7e0 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -281,7 +281,7 @@ out:
281 return ret; 281 return ret;
282} 282}
283 283
284void __exit bl_cleanup_pipefs(void) 284void bl_cleanup_pipefs(void)
285{ 285{
286 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); 286 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
287 unregister_pernet_subsys(&nfs4blocklayout_net_ops); 287 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index ff8195bd75ea..5fe1cecbf9f0 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -37,10 +37,11 @@ enum nfs4_callback_opnum {
37 OP_CB_ILLEGAL = 10044, 37 OP_CB_ILLEGAL = 10044,
38}; 38};
39 39
40struct nfs4_slot;
40struct cb_process_state { 41struct cb_process_state {
41 __be32 drc_status; 42 __be32 drc_status;
42 struct nfs_client *clp; 43 struct nfs_client *clp;
43 u32 slotid; 44 struct nfs4_slot *slot;
44 u32 minorversion; 45 u32 minorversion;
45 struct net *net; 46 struct net *net;
46}; 47};
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f0939d097406..618ced381a14 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -354,47 +354,38 @@ out:
354 * a single outstanding callback request at a time. 354 * a single outstanding callback request at a time.
355 */ 355 */
356static __be32 356static __be32
357validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) 357validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot,
358 const struct cb_sequenceargs * args)
358{ 359{
359 struct nfs4_slot *slot; 360 dprintk("%s enter. slotid %u seqid %u, slot table seqid: %u\n",
360 361 __func__, args->csa_slotid, args->csa_sequenceid, slot->seq_nr);
361 dprintk("%s enter. slotid %u seqid %u\n",
362 __func__, args->csa_slotid, args->csa_sequenceid);
363 362
364 if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) 363 if (args->csa_slotid > tbl->server_highest_slotid)
365 return htonl(NFS4ERR_BADSLOT); 364 return htonl(NFS4ERR_BADSLOT);
366 365
367 slot = tbl->slots + args->csa_slotid;
368 dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
369
370 /* Normal */
371 if (likely(args->csa_sequenceid == slot->seq_nr + 1))
372 goto out_ok;
373
374 /* Replay */ 366 /* Replay */
375 if (args->csa_sequenceid == slot->seq_nr) { 367 if (args->csa_sequenceid == slot->seq_nr) {
376 dprintk("%s seqid %u is a replay\n", 368 dprintk("%s seqid %u is a replay\n",
377 __func__, args->csa_sequenceid); 369 __func__, args->csa_sequenceid);
370 if (nfs4_test_locked_slot(tbl, slot->slot_nr))
371 return htonl(NFS4ERR_DELAY);
378 /* Signal process_op to set this error on next op */ 372 /* Signal process_op to set this error on next op */
379 if (args->csa_cachethis == 0) 373 if (args->csa_cachethis == 0)
380 return htonl(NFS4ERR_RETRY_UNCACHED_REP); 374 return htonl(NFS4ERR_RETRY_UNCACHED_REP);
381 375
382 /* The ca_maxresponsesize_cached is 0 with no DRC */ 376 /* Liar! We never allowed you to set csa_cachethis != 0 */
383 else if (args->csa_cachethis == 1) 377 return htonl(NFS4ERR_SEQ_FALSE_RETRY);
384 return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
385 } 378 }
386 379
387 /* Wraparound */ 380 /* Wraparound */
388 if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { 381 if (unlikely(slot->seq_nr == 0xFFFFFFFFU)) {
389 slot->seq_nr = 1; 382 if (args->csa_sequenceid == 1)
390 goto out_ok; 383 return htonl(NFS4_OK);
391 } 384 } else if (likely(args->csa_sequenceid == slot->seq_nr + 1))
385 return htonl(NFS4_OK);
392 386
393 /* Misordered request */ 387 /* Misordered request */
394 return htonl(NFS4ERR_SEQ_MISORDERED); 388 return htonl(NFS4ERR_SEQ_MISORDERED);
395out_ok:
396 tbl->highest_used_slotid = args->csa_slotid;
397 return htonl(NFS4_OK);
398} 389}
399 390
400/* 391/*
@@ -473,6 +464,12 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
473 tbl = &clp->cl_session->bc_slot_table; 464 tbl = &clp->cl_session->bc_slot_table;
474 slot = tbl->slots + args->csa_slotid; 465 slot = tbl->slots + args->csa_slotid;
475 466
467 /* Set up res before grabbing the spinlock */
468 memcpy(&res->csr_sessionid, &args->csa_sessionid,
469 sizeof(res->csr_sessionid));
470 res->csr_sequenceid = args->csa_sequenceid;
471 res->csr_slotid = args->csa_slotid;
472
476 spin_lock(&tbl->slot_tbl_lock); 473 spin_lock(&tbl->slot_tbl_lock);
477 /* state manager is resetting the session */ 474 /* state manager is resetting the session */
478 if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { 475 if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
@@ -485,18 +482,26 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
485 goto out_unlock; 482 goto out_unlock;
486 } 483 }
487 484
488 memcpy(&res->csr_sessionid, &args->csa_sessionid, 485 status = htonl(NFS4ERR_BADSLOT);
489 sizeof(res->csr_sessionid)); 486 slot = nfs4_lookup_slot(tbl, args->csa_slotid);
490 res->csr_sequenceid = args->csa_sequenceid; 487 if (IS_ERR(slot))
491 res->csr_slotid = args->csa_slotid; 488 goto out_unlock;
492 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 489
493 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 490 res->csr_highestslotid = tbl->server_highest_slotid;
491 res->csr_target_highestslotid = tbl->target_highest_slotid;
494 492
495 status = validate_seqid(tbl, args); 493 status = validate_seqid(tbl, slot, args);
496 if (status) 494 if (status)
497 goto out_unlock; 495 goto out_unlock;
496 if (!nfs4_try_to_lock_slot(tbl, slot)) {
497 status = htonl(NFS4ERR_DELAY);
498 goto out_unlock;
499 }
500 cps->slot = slot;
498 501
499 cps->slotid = args->csa_slotid; 502 /* The ca_maxresponsesize_cached is 0 with no DRC */
503 if (args->csa_cachethis != 0)
504 return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
500 505
501 /* 506 /*
502 * Check for pending referring calls. If a match is found, a 507 * Check for pending referring calls. If a match is found, a
@@ -513,7 +518,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
513 * If CB_SEQUENCE returns an error, then the state of the slot 518 * If CB_SEQUENCE returns an error, then the state of the slot
514 * (sequence ID, cached reply) MUST NOT change. 519 * (sequence ID, cached reply) MUST NOT change.
515 */ 520 */
516 slot->seq_nr++; 521 slot->seq_nr = args->csa_sequenceid;
517out_unlock: 522out_unlock:
518 spin_unlock(&tbl->slot_tbl_lock); 523 spin_unlock(&tbl->slot_tbl_lock);
519 524
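validate_seqid() above now reduces to three outcomes: the same seqid is a replay, seqid + 1 (or 1 after the counter wraps at 0xFFFFFFFF) is normal progress, and anything else is misordered. A standalone model of that classification, with small ints in place of the NFS4ERR_* codes:

#include <stdint.h>
#include <stdio.h>

enum { OK = 0, REPLAY = 1, MISORDERED = 2 };

static int classify_seqid(uint32_t slot_seq, uint32_t csa_seq)
{
	if (csa_seq == slot_seq)
		return REPLAY;		/* retransmission of the last request */
	if (slot_seq == 0xFFFFFFFFu) {
		if (csa_seq == 1)	/* wraparound restarts at 1, not 0 */
			return OK;
	} else if (csa_seq == slot_seq + 1) {
		return OK;
	}
	return MISORDERED;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       classify_seqid(5, 6),		/* OK */
	       classify_seqid(5, 5),		/* REPLAY */
	       classify_seqid(0xFFFFFFFFu, 1),	/* OK (wrap) */
	       classify_seqid(5, 7));		/* MISORDERED */
	return 0;
}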
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 646cdac73488..976c90608e56 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -752,7 +752,8 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
752 return htonl(NFS_OK); 752 return htonl(NFS_OK);
753} 753}
754 754
755static void nfs4_callback_free_slot(struct nfs4_session *session) 755static void nfs4_callback_free_slot(struct nfs4_session *session,
756 struct nfs4_slot *slot)
756{ 757{
757 struct nfs4_slot_table *tbl = &session->bc_slot_table; 758 struct nfs4_slot_table *tbl = &session->bc_slot_table;
758 759
@@ -761,15 +762,17 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
761 * Let the state manager know callback processing done. 762 * Let the state manager know callback processing done.
762 * A single slot, so highest used slotid is either 0 or -1 763 * A single slot, so highest used slotid is either 0 or -1
763 */ 764 */
764 tbl->highest_used_slotid = NFS4_NO_SLOT; 765 nfs4_free_slot(tbl, slot);
765 nfs4_slot_tbl_drain_complete(tbl); 766 nfs4_slot_tbl_drain_complete(tbl);
766 spin_unlock(&tbl->slot_tbl_lock); 767 spin_unlock(&tbl->slot_tbl_lock);
767} 768}
768 769
769static void nfs4_cb_free_slot(struct cb_process_state *cps) 770static void nfs4_cb_free_slot(struct cb_process_state *cps)
770{ 771{
771 if (cps->slotid != NFS4_NO_SLOT) 772 if (cps->slot) {
772 nfs4_callback_free_slot(cps->clp->cl_session); 773 nfs4_callback_free_slot(cps->clp->cl_session, cps->slot);
774 cps->slot = NULL;
775 }
773} 776}
774 777
775#else /* CONFIG_NFS_V4_1 */ 778#else /* CONFIG_NFS_V4_1 */
@@ -893,7 +896,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
893 struct cb_process_state cps = { 896 struct cb_process_state cps = {
894 .drc_status = 0, 897 .drc_status = 0,
895 .clp = NULL, 898 .clp = NULL,
896 .slotid = NFS4_NO_SLOT,
897 .net = SVC_NET(rqstp), 899 .net = SVC_NET(rqstp),
898 }; 900 };
899 unsigned int nops = 0; 901 unsigned int nops = 0;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9cce67043f92..4bfa7d8bcade 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1360,19 +1360,15 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1360 dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); 1360 dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
1361 nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); 1361 nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
1362 1362
1363 res = ERR_PTR(-ENAMETOOLONG); 1363 if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen))
1364 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1364 return ERR_PTR(-ENAMETOOLONG);
1365 goto out;
1366 1365
1367 /* 1366 /*
1368 * If we're doing an exclusive create, optimize away the lookup 1367 * If we're doing an exclusive create, optimize away the lookup
1369 * but don't hash the dentry. 1368 * but don't hash the dentry.
1370 */ 1369 */
1371 if (nfs_is_exclusive_create(dir, flags)) { 1370 if (nfs_is_exclusive_create(dir, flags))
1372 d_instantiate(dentry, NULL); 1371 return NULL;
1373 res = NULL;
1374 goto out;
1375 }
1376 1372
1377 res = ERR_PTR(-ENOMEM); 1373 res = ERR_PTR(-ENOMEM);
1378 fhandle = nfs_alloc_fhandle(); 1374 fhandle = nfs_alloc_fhandle();
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 748bb813b8ec..89bf093d342a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
233 * nfs_file_write() that a write error occurred, and hence cause it to 233 * nfs_file_write() that a write error occurred, and hence cause it to
234 * fall back to doing a synchronous write. 234 * fall back to doing a synchronous write.
235 */ 235 */
236int 236static int
237nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) 237nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
238{ 238{
239 struct nfs_open_context *ctx = nfs_file_open_context(file); 239 struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -263,9 +263,8 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
263out: 263out:
264 return ret; 264 return ret;
265} 265}
266EXPORT_SYMBOL_GPL(nfs_file_fsync_commit);
267 266
268static int 267int
269nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) 268nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
270{ 269{
271 int ret; 270 int ret;
@@ -273,13 +272,15 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
273 272
274 trace_nfs_fsync_enter(inode); 273 trace_nfs_fsync_enter(inode);
275 274
276 nfs_inode_dio_wait(inode); 275 inode_dio_wait(inode);
277 do { 276 do {
278 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 277 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
279 if (ret != 0) 278 if (ret != 0)
280 break; 279 break;
281 inode_lock(inode); 280 inode_lock(inode);
282 ret = nfs_file_fsync_commit(file, start, end, datasync); 281 ret = nfs_file_fsync_commit(file, start, end, datasync);
282 if (!ret)
283 ret = pnfs_sync_inode(inode, !!datasync);
283 inode_unlock(inode); 284 inode_unlock(inode);
284 /* 285 /*
285 * If nfs_file_fsync_commit detected a server reboot, then 286 * If nfs_file_fsync_commit detected a server reboot, then
@@ -293,6 +294,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
293 trace_nfs_fsync_exit(inode, ret); 294 trace_nfs_fsync_exit(inode, ret);
294 return ret; 295 return ret;
295} 296}
297EXPORT_SYMBOL_GPL(nfs_file_fsync);
296 298
297/* 299/*
298 * Decide whether a read/modify/write cycle may be more efficient 300 * Decide whether a read/modify/write cycle may be more efficient
@@ -368,7 +370,7 @@ start:
368 /* 370 /*
369 * Wait for O_DIRECT to complete 371 * Wait for O_DIRECT to complete
370 */ 372 */
371 nfs_inode_dio_wait(mapping->host); 373 inode_dio_wait(mapping->host);
372 374
373 page = grab_cache_page_write_begin(mapping, index, flags); 375 page = grab_cache_page_write_begin(mapping, index, flags);
374 if (!page) 376 if (!page)
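With nfs4_file_fsync() folded in above, a single nfs_file_fsync() owns the reboot retry: when the commit path returns -EAGAIN, the range is widened to the whole file before retrying. A standalone model of that loop, with a stub commit that fails once to simulate a server reboot:

#include <limits.h>
#include <stdio.h>

#define SIM_EAGAIN (-11)

static int commit_range(long long start, long long end)
{
	static int calls;
	(void)start; (void)end;
	return calls++ == 0 ? SIM_EAGAIN : 0;	/* first pass hits a "reboot" */
}

static int fsync_model(long long start, long long end)
{
	int ret;

	do {
		ret = commit_range(start, end);
		/* A server reboot may drop earlier writes: widen to the
		 * whole file before retrying, as in start = 0 / end = LLONG_MAX. */
		start = 0;
		end = LLONG_MAX;
	} while (ret == SIM_EAGAIN);
	return ret;
}

int main(void)
{
	printf("fsync: %d\n", fsync_model(0, 4096));
	return 0;
}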
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index eb370460ce20..add0e5a70bd6 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -418,6 +418,8 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
418 pnfs_error_mark_layout_for_return(ino, lseg); 418 pnfs_error_mark_layout_for_return(ino, lseg);
419 } else 419 } else
420 pnfs_error_mark_layout_for_return(ino, lseg); 420 pnfs_error_mark_layout_for_return(ino, lseg);
421 ds = NULL;
422 goto out;
421 } 423 }
422out_update_creds: 424out_update_creds:
423 if (ff_layout_update_mirror_cred(mirror, ds)) 425 if (ff_layout_update_mirror_cred(mirror, ds))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 86faecf8f328..33d18c411905 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -141,7 +141,7 @@ void nfs_evict_inode(struct inode *inode)
141 141
142int nfs_sync_inode(struct inode *inode) 142int nfs_sync_inode(struct inode *inode)
143{ 143{
144 nfs_inode_dio_wait(inode); 144 inode_dio_wait(inode);
145 return nfs_wb_all(inode); 145 return nfs_wb_all(inode);
146} 146}
147EXPORT_SYMBOL_GPL(nfs_sync_inode); 147EXPORT_SYMBOL_GPL(nfs_sync_inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9a547aa3ec8e..565f8135ae1f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -358,7 +358,7 @@ int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
358int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); 358int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
359 359
360/* file.c */ 360/* file.c */
361int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); 361int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
362loff_t nfs_file_llseek(struct file *, loff_t, int); 362loff_t nfs_file_llseek(struct file *, loff_t, int);
363ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); 363ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
364ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, 364ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
@@ -515,10 +515,6 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
515/* direct.c */ 515/* direct.c */
516void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, 516void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
517 struct nfs_direct_req *dreq); 517 struct nfs_direct_req *dreq);
518static inline void nfs_inode_dio_wait(struct inode *inode)
519{
520 inode_dio_wait(inode);
521}
522extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); 518extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
523 519
524/* nfs4proc.c */ 520/* nfs4proc.c */
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 57ca1c8039c1..22c35abbee9d 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -128,37 +128,6 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
128 return vfs_fsync(file, 0); 128 return vfs_fsync(file, 0);
129} 129}
130 130
131static int
132nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
133{
134 int ret;
135 struct inode *inode = file_inode(file);
136
137 trace_nfs_fsync_enter(inode);
138
139 nfs_inode_dio_wait(inode);
140 do {
141 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
142 if (ret != 0)
143 break;
144 inode_lock(inode);
145 ret = nfs_file_fsync_commit(file, start, end, datasync);
146 if (!ret)
147 ret = pnfs_sync_inode(inode, !!datasync);
148 inode_unlock(inode);
149 /*
150 * If nfs_file_fsync_commit detected a server reboot, then
151 * resend all dirty pages that might have been covered by
152 * the NFS_CONTEXT_RESEND_WRITES flag
153 */
154 start = 0;
155 end = LLONG_MAX;
156 } while (ret == -EAGAIN);
157
158 trace_nfs_fsync_exit(inode, ret);
159 return ret;
160}
161
162#ifdef CONFIG_NFS_V4_2 131#ifdef CONFIG_NFS_V4_2
163static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) 132static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
164{ 133{
@@ -266,7 +235,7 @@ const struct file_operations nfs4_file_operations = {
266 .open = nfs4_file_open, 235 .open = nfs4_file_open,
267 .flush = nfs4_file_flush, 236 .flush = nfs4_file_flush,
268 .release = nfs_file_release, 237 .release = nfs_file_release,
269 .fsync = nfs4_file_fsync, 238 .fsync = nfs_file_fsync,
270 .lock = nfs_lock, 239 .lock = nfs_lock,
271 .flock = nfs_flock, 240 .flock = nfs_flock,
272 .splice_read = nfs_file_splice_read, 241 .splice_read = nfs_file_splice_read,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 14881594dd07..327b8c34d360 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2461,14 +2461,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
2461 2461
2462 dentry = opendata->dentry; 2462 dentry = opendata->dentry;
2463 if (d_really_is_negative(dentry)) { 2463 if (d_really_is_negative(dentry)) {
2464 /* FIXME: Is this d_drop() ever needed? */ 2464 struct dentry *alias;
2465 d_drop(dentry); 2465 d_drop(dentry);
2466 dentry = d_add_unique(dentry, igrab(state->inode)); 2466 alias = d_exact_alias(dentry, state->inode);
2467 if (dentry == NULL) { 2467 if (!alias)
2468 dentry = opendata->dentry; 2468 alias = d_splice_alias(igrab(state->inode), dentry);
2469 } else { 2469 /* d_splice_alias() can't fail here - it's a non-directory */
2470 if (alias) {
2470 dput(ctx->dentry); 2471 dput(ctx->dentry);
2471 ctx->dentry = dentry; 2472 ctx->dentry = dentry = alias;
2472 } 2473 }
2473 nfs_set_verifier(dentry, 2474 nfs_set_verifier(dentry,
2474 nfs_save_change_attribute(d_inode(opendata->dir))); 2475 nfs_save_change_attribute(d_inode(opendata->dir)));
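The _nfs4_open_and_get_state() hunk above swaps d_add_unique() for a two-step attach: reuse an exact existing alias for the inode if there is one, otherwise splice a fresh one in (which cannot fail for a non-directory). A stub model of that preference order; both helpers here return canned results rather than touching a real dcache:

#include <stddef.h>
#include <stdio.h>

struct dentry { const char *name; };

static struct dentry *exact_alias(struct dentry *d, int have_alias)
{
	static struct dentry reused = { "reused-alias" };
	(void)d;
	return have_alias ? &reused : NULL;
}

static struct dentry *splice_alias(struct dentry *d)
{
	static struct dentry spliced = { "spliced-alias" };
	(void)d;
	return &spliced;	/* cannot fail for a non-directory */
}

static struct dentry *attach(struct dentry *d, int have_alias)
{
	struct dentry *alias = exact_alias(d, have_alias);
	if (!alias)
		alias = splice_alias(d);
	return alias;
}

int main(void)
{
	struct dentry d = { "dentry" };
	printf("%s / %s\n", attach(&d, 1)->name, attach(&d, 0)->name);
	return 0;
}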
@@ -6782,13 +6783,26 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
6782 return false; 6783 return false;
6783} 6784}
6784 6785
6786static void
6787nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
6788{
6789}
6790
6791static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = {
6792 .rpc_call_done = &nfs4_bind_one_conn_to_session_done,
6793};
6794
6785/* 6795/*
6786 * nfs4_proc_bind_conn_to_session() 6796 * nfs4_proc_bind_one_conn_to_session()
6787 * 6797 *
6788 * The 4.1 client currently uses the same TCP connection for the 6798 * The 4.1 client currently uses the same TCP connection for the
6789 * fore and backchannel. 6799 * fore and backchannel.
6790 */ 6800 */
6791int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred) 6801static
6802int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
6803 struct rpc_xprt *xprt,
6804 struct nfs_client *clp,
6805 struct rpc_cred *cred)
6792{ 6806{
6793 int status; 6807 int status;
6794 struct nfs41_bind_conn_to_session_args args = { 6808 struct nfs41_bind_conn_to_session_args args = {
@@ -6803,6 +6817,14 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
6803 .rpc_resp = &res, 6817 .rpc_resp = &res,
6804 .rpc_cred = cred, 6818 .rpc_cred = cred,
6805 }; 6819 };
6820 struct rpc_task_setup task_setup_data = {
6821 .rpc_client = clnt,
6822 .rpc_xprt = xprt,
6823 .callback_ops = &nfs4_bind_one_conn_to_session_ops,
6824 .rpc_message = &msg,
6825 .flags = RPC_TASK_TIMEOUT,
6826 };
6827 struct rpc_task *task;
6806 6828
6807 dprintk("--> %s\n", __func__); 6829 dprintk("--> %s\n", __func__);
6808 6830
@@ -6810,7 +6832,16 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
6810 if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) 6832 if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
6811 args.dir = NFS4_CDFC4_FORE; 6833 args.dir = NFS4_CDFC4_FORE;
6812 6834
6813 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 6835 /* Do not set the backchannel flag unless this is clnt->cl_xprt */
6836 if (xprt != rcu_access_pointer(clnt->cl_xprt))
6837 args.dir = NFS4_CDFC4_FORE;
6838
6839 task = rpc_run_task(&task_setup_data);
6840 if (!IS_ERR(task)) {
6841 status = task->tk_status;
6842 rpc_put_task(task);
6843 } else
6844 status = PTR_ERR(task);
6814 trace_nfs4_bind_conn_to_session(clp, status); 6845 trace_nfs4_bind_conn_to_session(clp, status);
6815 if (status == 0) { 6846 if (status == 0) {
6816 if (memcmp(res.sessionid.data, 6847 if (memcmp(res.sessionid.data,
@@ -6837,6 +6868,31 @@ out:
6837 return status; 6868 return status;
6838} 6869}
6839 6870
6871struct rpc_bind_conn_calldata {
6872 struct nfs_client *clp;
6873 struct rpc_cred *cred;
6874};
6875
6876static int
6877nfs4_proc_bind_conn_to_session_callback(struct rpc_clnt *clnt,
6878 struct rpc_xprt *xprt,
6879 void *calldata)
6880{
6881 struct rpc_bind_conn_calldata *p = calldata;
6882
6883 return nfs4_proc_bind_one_conn_to_session(clnt, xprt, p->clp, p->cred);
6884}
6885
6886int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
6887{
6888 struct rpc_bind_conn_calldata data = {
6889 .clp = clp,
6890 .cred = cred,
6891 };
6892 return rpc_clnt_iterate_for_each_xprt(clp->cl_rpcclient,
6893 nfs4_proc_bind_conn_to_session_callback, &data);
6894}
6895
6840/* 6896/*
6841 * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map 6897 * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map
6842 * and operations we'd like to see to enable certain features in the allow map 6898 * and operations we'd like to see to enable certain features in the allow map
@@ -7319,7 +7375,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
7319 args->bc_attrs.max_resp_sz = PAGE_SIZE; 7375 args->bc_attrs.max_resp_sz = PAGE_SIZE;
7320 args->bc_attrs.max_resp_sz_cached = 0; 7376 args->bc_attrs.max_resp_sz_cached = 0;
7321 args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; 7377 args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
7322 args->bc_attrs.max_reqs = 1; 7378 args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
7323 7379
7324 dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " 7380 dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
7325 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", 7381 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index e23366effcfb..332d06e64fa9 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -135,6 +135,43 @@ static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl,
135 return ERR_PTR(-ENOMEM); 135 return ERR_PTR(-ENOMEM);
136} 136}
137 137
138static void nfs4_lock_slot(struct nfs4_slot_table *tbl,
139 struct nfs4_slot *slot)
140{
141 u32 slotid = slot->slot_nr;
142
143 __set_bit(slotid, tbl->used_slots);
144 if (slotid > tbl->highest_used_slotid ||
145 tbl->highest_used_slotid == NFS4_NO_SLOT)
146 tbl->highest_used_slotid = slotid;
147 slot->generation = tbl->generation;
148}
149
150/*
151 * nfs4_try_to_lock_slot - Given a slot try to allocate it
152 *
153 * Note: must be called with the slot_tbl_lock held.
154 */
155bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
156{
157 if (nfs4_test_locked_slot(tbl, slot->slot_nr))
158 return false;
159 nfs4_lock_slot(tbl, slot);
160 return true;
161}
162
163/*
164 * nfs4_lookup_slot - Find a slot but don't allocate it
165 *
166 * Note: must be called with the slot_tbl_lock held.
167 */
168struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
169{
170 if (slotid <= tbl->max_slotid)
171 return nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
172 return ERR_PTR(-E2BIG);
173}
174
138/* 175/*
139 * nfs4_alloc_slot - efficiently look for a free slot 176 * nfs4_alloc_slot - efficiently look for a free slot
140 * 177 *
@@ -153,18 +190,11 @@ struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
153 __func__, tbl->used_slots[0], tbl->highest_used_slotid, 190 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
154 tbl->max_slotid + 1); 191 tbl->max_slotid + 1);
155 slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); 192 slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1);
156 if (slotid > tbl->max_slotid) 193 if (slotid <= tbl->max_slotid) {
157 goto out; 194 ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
158 ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); 195 if (!IS_ERR(ret))
159 if (IS_ERR(ret)) 196 nfs4_lock_slot(tbl, ret);
160 goto out; 197 }
161 __set_bit(slotid, tbl->used_slots);
162 if (slotid > tbl->highest_used_slotid ||
163 tbl->highest_used_slotid == NFS4_NO_SLOT)
164 tbl->highest_used_slotid = slotid;
165 ret->generation = tbl->generation;
166
167out:
168 dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n", 198 dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n",
169 __func__, tbl->used_slots[0], tbl->highest_used_slotid, 199 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
170 !IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT); 200 !IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT);
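nfs4_try_to_lock_slot() above is a test-then-set under slot_tbl_lock: bail out if the slot's bit is already set, otherwise mark it used and raise the high-water mark. A single-threaded toy of just that bookkeeping, with a 64-bit word standing in for the used_slots bitmap (the real code relies on the spinlock for atomicity):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NO_SLOT 0xFFFFFFFFu

struct slot_table {
	uint64_t used;		/* one bit per slot, up to 64 here */
	uint32_t highest_used;
};

static bool try_to_lock_slot(struct slot_table *tbl, uint32_t slotid)
{
	if (tbl->used & (1ULL << slotid))
		return false;			/* already in use */
	tbl->used |= 1ULL << slotid;
	if (tbl->highest_used == NO_SLOT || slotid > tbl->highest_used)
		tbl->highest_used = slotid;
	return true;
}

int main(void)
{
	struct slot_table tbl = { 0, NO_SLOT };

	printf("%d %d highest=%u\n",
	       try_to_lock_slot(&tbl, 3),
	       try_to_lock_slot(&tbl, 3),	/* second take must fail */
	       tbl.highest_used);
	return 0;
}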
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index e3ea2c5324d6..5b51298d1d03 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -77,6 +77,8 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
77 unsigned int max_reqs, const char *queue); 77 unsigned int max_reqs, const char *queue);
78extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); 78extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
79extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); 79extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
80extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
81extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
80extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); 82extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
81extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); 83extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
82bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, 84bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
@@ -88,6 +90,12 @@ static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
88 return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); 90 return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
89} 91}
90 92
93static inline bool nfs4_test_locked_slot(const struct nfs4_slot_table *tbl,
94 u32 slotid)
95{
96 return !!test_bit(slotid, tbl->used_slots);
97}
98
91#if defined(CONFIG_NFS_V4_1) 99#if defined(CONFIG_NFS_V4_1)
92extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, 100extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
93 u32 target_highest_slotid); 101 u32 target_highest_slotid);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 81ac6480f9e7..4aaed890048f 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -606,12 +606,22 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
606 dprintk("%s: DS %s: trying address %s\n", 606 dprintk("%s: DS %s: trying address %s\n",
607 __func__, ds->ds_remotestr, da->da_remotestr); 607 __func__, ds->ds_remotestr, da->da_remotestr);
608 608
609 clp = get_v3_ds_connect(mds_srv->nfs_client, 609 if (!IS_ERR(clp)) {
610 struct xprt_create xprt_args = {
611 .ident = XPRT_TRANSPORT_TCP,
612 .net = clp->cl_net,
613 .dstaddr = (struct sockaddr *)&da->da_addr,
614 .addrlen = da->da_addrlen,
615 .servername = clp->cl_hostname,
616 };
617 /* Add this address as an alias */
618 rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
619 rpc_clnt_test_and_add_xprt, NULL);
620 } else
621 clp = get_v3_ds_connect(mds_srv->nfs_client,
610 (struct sockaddr *)&da->da_addr, 622 (struct sockaddr *)&da->da_addr,
611 da->da_addrlen, IPPROTO_TCP, 623 da->da_addrlen, IPPROTO_TCP,
612 timeo, retrans, au_flavor); 624 timeo, retrans, au_flavor);
613 if (!IS_ERR(clp))
614 break;
615 } 625 }
616 626
617 if (IS_ERR(clp)) { 627 if (IS_ERR(clp)) {
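The rewritten DS-connect loop above treats the first successful connection as creating the client and every later address as an additional transport on that same client via rpc_clnt_add_xprt(). A toy model of the accumulation; the connect and add-transport stubs are placeholders, not the sunrpc API:

#include <stdio.h>

struct client { int nxprts; };

static struct client *connect_first(const char *addr, struct client *storage)
{
	printf("primary transport: %s\n", addr);
	storage->nxprts = 1;
	return storage;
}

static void add_alias_xprt(struct client *clp, const char *addr)
{
	printf("alias transport:   %s\n", addr);
	clp->nxprts++;
}

int main(void)
{
	const char *addrs[] = { "10.0.0.1", "10.0.0.2", "10.0.0.3" };
	struct client storage, *clp = NULL;

	for (int i = 0; i < 3; i++) {
		if (!clp)
			clp = connect_first(addrs[i], &storage);
		else
			add_alias_xprt(clp, addrs[i]);	/* rpc_clnt_add_xprt analogue */
	}
	printf("transports: %d\n", clp->nxprts);
	return 0;
}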
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index a0b77fc1bd39..c9f583d7bac8 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -84,12 +84,30 @@ config NFSD_V4
84 If unsure, say N. 84 If unsure, say N.
85 85
86config NFSD_PNFS 86config NFSD_PNFS
87 bool "NFSv4.1 server support for Parallel NFS (pNFS)" 87 bool
88 depends on NFSD_V4 88
89config NFSD_BLOCKLAYOUT
90 bool "NFSv4.1 server support for pNFS block layouts"
91 depends on NFSD_V4 && BLOCK
92 select NFSD_PNFS
93 help
94 This option enables support for the exporting pNFS block layouts
95 in the kernel's NFS server. The pNFS block layout enables NFS
96 clients to directly perform I/O to block devices accesible to both
97 the server and the clients. See RFC 5663 for more details.
98
99 If unsure, say N.
100
101config NFSD_SCSILAYOUT
102 bool "NFSv4.1 server support for pNFS SCSI layouts"
103 depends on NFSD_V4 && BLOCK
104 select NFSD_PNFS
89 help 105 help
90 This option enables support for the parallel NFS features of the 106 This option enables support for exporting pNFS SCSI layouts
91 minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS 107 in the kernel's NFS server. The pNFS SCSI layout enables NFS
92 server. 108 clients to directly perform I/O to SCSI devices accessible to both
109 the server and the clients. See draft-ietf-nfsv4-scsi-layout for
110 more details.
93 111
94 If unsure, say N. 112 If unsure, say N.
95 113
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9a6028e120c6..3ae5f3c77e28 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -17,4 +17,6 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
17nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 17nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
18nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ 18nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
19 nfs4acl.o nfs4callback.o nfs4recover.o 19 nfs4acl.o nfs4callback.o nfs4recover.o
20nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o 20nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
21nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
22nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c29d9421bd5e..e55b5242614d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -1,11 +1,14 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014-2016 Christoph Hellwig.
3 */ 3 */
4#include <linux/exportfs.h> 4#include <linux/exportfs.h>
5#include <linux/genhd.h> 5#include <linux/genhd.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/pr.h>
7 8
8#include <linux/nfsd/debug.h> 9#include <linux/nfsd/debug.h>
10#include <scsi/scsi_proto.h>
11#include <scsi/scsi_common.h>
9 12
10#include "blocklayoutxdr.h" 13#include "blocklayoutxdr.h"
11#include "pnfs.h" 14#include "pnfs.h"
@@ -13,37 +16,6 @@
13#define NFSDDBG_FACILITY NFSDDBG_PNFS 16#define NFSDDBG_FACILITY NFSDDBG_PNFS
14 17
15 18
16static int
17nfsd4_block_get_device_info_simple(struct super_block *sb,
18 struct nfsd4_getdeviceinfo *gdp)
19{
20 struct pnfs_block_deviceaddr *dev;
21 struct pnfs_block_volume *b;
22
23 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
24 sizeof(struct pnfs_block_volume), GFP_KERNEL);
25 if (!dev)
26 return -ENOMEM;
27 gdp->gd_device = dev;
28
29 dev->nr_volumes = 1;
30 b = &dev->volumes[0];
31
32 b->type = PNFS_BLOCK_VOLUME_SIMPLE;
33 b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
34 return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
35 &b->simple.offset);
36}
37
38static __be32
39nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
40 struct nfsd4_getdeviceinfo *gdp)
41{
42 if (sb->s_bdev != sb->s_bdev->bd_contains)
43 return nfserr_inval;
44 return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
45}
46
47static __be32 19static __be32
48nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, 20nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
49 struct nfsd4_layoutget *args) 21 struct nfsd4_layoutget *args)
@@ -141,20 +113,13 @@ out_layoutunavailable:
141} 113}
142 114
143static __be32 115static __be32
144nfsd4_block_proc_layoutcommit(struct inode *inode, 116nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
145 struct nfsd4_layoutcommit *lcp) 117 struct iomap *iomaps, int nr_iomaps)
146{ 118{
147 loff_t new_size = lcp->lc_last_wr + 1; 119 loff_t new_size = lcp->lc_last_wr + 1;
148 struct iattr iattr = { .ia_valid = 0 }; 120 struct iattr iattr = { .ia_valid = 0 };
149 struct iomap *iomaps;
150 int nr_iomaps;
151 int error; 121 int error;
152 122
153 nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
154 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
155 if (nr_iomaps < 0)
156 return nfserrno(nr_iomaps);
157
158 if (lcp->lc_mtime.tv_nsec == UTIME_NOW || 123 if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
159 timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) 124 timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
160 lcp->lc_mtime = current_fs_time(inode->i_sb); 125 lcp->lc_mtime = current_fs_time(inode->i_sb);
@@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
172 return nfserrno(error); 137 return nfserrno(error);
173} 138}
174 139
140#ifdef CONFIG_NFSD_BLOCKLAYOUT
141static int
142nfsd4_block_get_device_info_simple(struct super_block *sb,
143 struct nfsd4_getdeviceinfo *gdp)
144{
145 struct pnfs_block_deviceaddr *dev;
146 struct pnfs_block_volume *b;
147
148 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
149 sizeof(struct pnfs_block_volume), GFP_KERNEL);
150 if (!dev)
151 return -ENOMEM;
152 gdp->gd_device = dev;
153
154 dev->nr_volumes = 1;
155 b = &dev->volumes[0];
156
157 b->type = PNFS_BLOCK_VOLUME_SIMPLE;
158 b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
159 return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
160 &b->simple.offset);
161}
162
163static __be32
164nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
165 struct nfs4_client *clp,
166 struct nfsd4_getdeviceinfo *gdp)
167{
168 if (sb->s_bdev != sb->s_bdev->bd_contains)
169 return nfserr_inval;
170 return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
171}
172
173static __be32
174nfsd4_block_proc_layoutcommit(struct inode *inode,
175 struct nfsd4_layoutcommit *lcp)
176{
177 struct iomap *iomaps;
178 int nr_iomaps;
179
180 nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
181 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
182 if (nr_iomaps < 0)
183 return nfserrno(nr_iomaps);
184
185 return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
186}
187
175const struct nfsd4_layout_ops bl_layout_ops = { 188const struct nfsd4_layout_ops bl_layout_ops = {
176 /* 189 /*
177 * Pretend that we send notification to the client. This is a blatant 190 * Pretend that we send notification to the client. This is a blatant
@@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = {
190 .encode_layoutget = nfsd4_block_encode_layoutget, 203 .encode_layoutget = nfsd4_block_encode_layoutget,
191 .proc_layoutcommit = nfsd4_block_proc_layoutcommit, 204 .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
192}; 205};
206#endif /* CONFIG_NFSD_BLOCKLAYOUT */
207
208#ifdef CONFIG_NFSD_SCSILAYOUT
209static int nfsd4_scsi_identify_device(struct block_device *bdev,
210 struct pnfs_block_volume *b)
211{
212 struct request_queue *q = bdev->bd_disk->queue;
213 struct request *rq;
214 size_t bufflen = 252, len, id_len;
215 u8 *buf, *d, type, assoc;
216 int error;
217
218 buf = kzalloc(bufflen, GFP_KERNEL);
219 if (!buf)
220 return -ENOMEM;
221
222 rq = blk_get_request(q, READ, GFP_KERNEL);
223 if (IS_ERR(rq)) {
224 error = -ENOMEM;
225 goto out_free_buf;
226 }
227 blk_rq_set_block_pc(rq);
228
229 error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
230 if (error)
231 goto out_put_request;
232
233 rq->cmd[0] = INQUIRY;
234 rq->cmd[1] = 1;
235 rq->cmd[2] = 0x83;
236 rq->cmd[3] = bufflen >> 8;
237 rq->cmd[4] = bufflen & 0xff;
238 rq->cmd_len = COMMAND_SIZE(INQUIRY);
239
240 error = blk_execute_rq(rq->q, NULL, rq, 1);
241 if (error) {
242 pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
243 rq->errors);
244 goto out_put_request;
245 }
246
247 len = (buf[2] << 8) + buf[3] + 4;
248 if (len > bufflen) {
249 pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
250 len);
251 goto out_put_request;
252 }
253
254 d = buf + 4;
255 for (d = buf + 4; d < buf + len; d += id_len + 4) {
256 id_len = d[3];
257 type = d[1] & 0xf;
258 assoc = (d[1] >> 4) & 0x3;
259
260 /*
261 * We only care about EUI-64 and NAA designator types
262 * with LU association.
263 */
264 if (assoc != 0x00)
265 continue;
266 if (type != 0x02 && type != 0x03)
267 continue;
268 if (id_len != 8 && id_len != 12 && id_len != 16)
269 continue;
270
271 b->scsi.code_set = PS_CODE_SET_BINARY;
272 b->scsi.designator_type = type == 0x02 ?
273 PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
274 b->scsi.designator_len = id_len;
275 memcpy(b->scsi.designator, d + 4, id_len);
276
277 /*
278 * If we found an 8 or 12 byte descriptor, continue on to
279 * see if a 16 byte one is available. If we find a
280 * 16 byte descriptor we're done.
281 */
282 if (id_len == 16)
283 break;
284 }
285
286out_put_request:
287 blk_put_request(rq);
288out_free_buf:
289 kfree(buf);
290 return error;
291}
292
293#define NFSD_MDS_PR_KEY 0x0100000000000000
294
295/*
296 * We use the client ID as a unique key for the reservations.
297 * This allows us to easily fence a client when recalls fail.
298 */
299static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
300{
301 return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
302}
303
304static int
305nfsd4_block_get_device_info_scsi(struct super_block *sb,
306 struct nfs4_client *clp,
307 struct nfsd4_getdeviceinfo *gdp)
308{
309 struct pnfs_block_deviceaddr *dev;
310 struct pnfs_block_volume *b;
311 const struct pr_ops *ops;
312 int error;
313
314 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
315 sizeof(struct pnfs_block_volume), GFP_KERNEL);
316 if (!dev)
317 return -ENOMEM;
318 gdp->gd_device = dev;
319
320 dev->nr_volumes = 1;
321 b = &dev->volumes[0];
322
323 b->type = PNFS_BLOCK_VOLUME_SCSI;
324 b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
325
326 error = nfsd4_scsi_identify_device(sb->s_bdev, b);
327 if (error)
328 return error;
329
330 ops = sb->s_bdev->bd_disk->fops->pr_ops;
331 if (!ops) {
332 pr_err("pNFS: device %s does not support PRs.\n",
333 sb->s_id);
334 return -EINVAL;
335 }
336
337 error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
338 if (error) {
339 pr_err("pNFS: failed to register key for device %s.\n",
340 sb->s_id);
341 return -EINVAL;
342 }
343
344 error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
345 PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
346 if (error) {
347 pr_err("pNFS: failed to reserve device %s.\n",
348 sb->s_id);
349 return -EINVAL;
350 }
351
352 return 0;
353}
354
355static __be32
356nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
357 struct nfs4_client *clp,
358 struct nfsd4_getdeviceinfo *gdp)
359{
360 if (sb->s_bdev != sb->s_bdev->bd_contains)
361 return nfserr_inval;
362 return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
363}
364static __be32
365nfsd4_scsi_proc_layoutcommit(struct inode *inode,
366 struct nfsd4_layoutcommit *lcp)
367{
368 struct iomap *iomaps;
369 int nr_iomaps;
370
371 nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
372 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
373 if (nr_iomaps < 0)
374 return nfserrno(nr_iomaps);
375
376 return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
377}
378
379static void
380nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
381{
382 struct nfs4_client *clp = ls->ls_stid.sc_client;
383 struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
384
385 bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
386 nfsd4_scsi_pr_key(clp), 0, true);
387}
388
389const struct nfsd4_layout_ops scsi_layout_ops = {
390 /*
391 * Pretend that we send notification to the client. This is a blatant
392 * lie to force recent Linux clients to cache our device IDs.
393 * We rarely change the device ID, so the harm of leaking deviceids
394 * for a while isn't too bad. Unfortunately RFC5661 is a complete mess
395 * in this regard, but I filed errata 4119 for this a while ago, and
396 * hopefully the Linux client will eventually start caching deviceids
397 * without this again.
398 */
399 .notify_types =
400 NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
401 .proc_getdeviceinfo = nfsd4_scsi_proc_getdeviceinfo,
402 .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
403 .proc_layoutget = nfsd4_block_proc_layoutget,
404 .encode_layoutget = nfsd4_block_encode_layoutget,
405 .proc_layoutcommit = nfsd4_scsi_proc_layoutcommit,
406 .fence_client = nfsd4_scsi_fence_client,
407};
408#endif /* CONFIG_NFSD_SCSILAYOUT */
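
The descriptor walk in nfsd4_scsi_identify_device() above is easier to follow outside the request-queue plumbing. Below is a minimal, standalone C sketch of the same VPD page 0x83 parse, plus the PR-key packing from nfsd4_scsi_pr_key(). parse_vpd83(), pr_key() and the sample buffer are illustrative inventions for this sketch, not kernel interfaces.

/*
 * Hedged sketch: walk an SCSI INQUIRY VPD page 0x83 (Device
 * Identification) buffer the way nfsd4_scsi_identify_device() does,
 * keeping only LU-associated EUI-64/NAA designators and preferring a
 * 16-byte one. The sample buffer is made up, not from a real device.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct designator {
	uint8_t type;		/* 0x02 = EUI-64, 0x03 = NAA */
	uint8_t len;		/* 8, 12 or 16 */
	uint8_t id[16];
};

static int parse_vpd83(const uint8_t *buf, size_t bufflen, struct designator *out)
{
	/* page length field (bytes 2..3, big endian) plus the 4-byte header */
	size_t len = ((size_t)buf[2] << 8) + buf[3] + 4;
	const uint8_t *d;
	int found = 0;

	if (len > bufflen)
		return -1;		/* malformed response */

	for (d = buf + 4; d + 4 <= buf + len; d += d[3] + 4) {
		uint8_t id_len = d[3];
		uint8_t type = d[1] & 0xf;
		uint8_t assoc = (d[1] >> 4) & 0x3;

		if (assoc != 0x00)	/* want LU association only */
			continue;
		if (type != 0x02 && type != 0x03)
			continue;
		if (id_len != 8 && id_len != 12 && id_len != 16)
			continue;

		out->type = type;
		out->len = id_len;
		memcpy(out->id, d + 4, id_len);
		found = 1;
		if (id_len == 16)	/* a 16-byte descriptor wins outright */
			break;
	}
	return found ? 0 : -1;
}

/* PR key packing used by the patch: client boot time in the high word. */
static uint64_t pr_key(uint32_t cl_boot, uint32_t cl_id)
{
	return ((uint64_t)cl_boot << 32) | cl_id;
}

int main(void)
{
	/* one NAA (type 3), 8-byte, LU-associated descriptor */
	uint8_t page[] = { 0x00, 0x83, 0x00, 0x0c,
			   0x01, 0x03, 0x00, 0x08,
			   0x60, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd };
	struct designator id;

	if (parse_vpd83(page, sizeof(page), &id) == 0)
		printf("type %#x, %u bytes, pr_key 0x%llx\n",
		       (unsigned)id.type, (unsigned)id.len,
		       (unsigned long long)pr_key(1, 2));
	return 0;
}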
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6d834dc9bbc8..6c3b316f932e 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014-2016 Christoph Hellwig.
3 */ 3 */
4#include <linux/sunrpc/svc.h> 4#include <linux/sunrpc/svc.h>
5#include <linux/exportfs.h> 5#include <linux/exportfs.h>
@@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
53 p = xdr_encode_hyper(p, b->simple.offset); 53 p = xdr_encode_hyper(p, b->simple.offset);
54 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); 54 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
55 break; 55 break;
56 case PNFS_BLOCK_VOLUME_SCSI:
57 len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
58 p = xdr_reserve_space(xdr, len);
59 if (!p)
60 return -ETOOSMALL;
61
62 *p++ = cpu_to_be32(b->type);
63 *p++ = cpu_to_be32(b->scsi.code_set);
64 *p++ = cpu_to_be32(b->scsi.designator_type);
65 p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len);
66 p = xdr_encode_hyper(p, b->scsi.pr_key);
67 break;
56 default: 68 default:
57 return -ENOTSUPP; 69 return -ENOTSUPP;
58 } 70 }
@@ -93,18 +105,22 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
93 u32 block_size) 105 u32 block_size)
94{ 106{
95 struct iomap *iomaps; 107 struct iomap *iomaps;
96 u32 nr_iomaps, expected, i; 108 u32 nr_iomaps, i;
97 109
98 if (len < sizeof(u32)) { 110 if (len < sizeof(u32)) {
99 dprintk("%s: extent array too small: %u\n", __func__, len); 111 dprintk("%s: extent array too small: %u\n", __func__, len);
100 return -EINVAL; 112 return -EINVAL;
101 } 113 }
114 len -= sizeof(u32);
115 if (len % PNFS_BLOCK_EXTENT_SIZE) {
116 dprintk("%s: extent array invalid: %u\n", __func__, len);
117 return -EINVAL;
118 }
102 119
103 nr_iomaps = be32_to_cpup(p++); 120 nr_iomaps = be32_to_cpup(p++);
104 expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; 121 if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
105 if (len != expected) {
106 dprintk("%s: extent array size mismatch: %u/%u\n", 122 dprintk("%s: extent array size mismatch: %u/%u\n",
107 __func__, len, expected); 123 __func__, len, nr_iomaps);
108 return -EINVAL; 124 return -EINVAL;
109 } 125 }
110 126
@@ -155,3 +171,54 @@ fail:
155 kfree(iomaps); 171 kfree(iomaps);
156 return -EINVAL; 172 return -EINVAL;
157} 173}
174
175int
176nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
177 u32 block_size)
178{
179 struct iomap *iomaps;
180 u32 nr_iomaps, expected, i;
181
182 if (len < sizeof(u32)) {
183 dprintk("%s: extent array too small: %u\n", __func__, len);
184 return -EINVAL;
185 }
186
187 nr_iomaps = be32_to_cpup(p++);
188 expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
189 if (len != expected) {
190 dprintk("%s: extent array size mismatch: %u/%u\n",
191 __func__, len, expected);
192 return -EINVAL;
193 }
194
195 iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
196 if (!iomaps) {
197 dprintk("%s: failed to allocate extent array\n", __func__);
198 return -ENOMEM;
199 }
200
201 for (i = 0; i < nr_iomaps; i++) {
202 u64 val;
203
204 p = xdr_decode_hyper(p, &val);
205 if (val & (block_size - 1)) {
206 dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
207 goto fail;
208 }
209 iomaps[i].offset = val;
210
211 p = xdr_decode_hyper(p, &val);
212 if (val & (block_size - 1)) {
213 dprintk("%s: unaligned length 0x%llx\n", __func__, val);
214 goto fail;
215 }
216 iomaps[i].length = val;
217 }
218
219 *iomapp = iomaps;
220 return nr_iomaps;
221fail:
222 kfree(iomaps);
223 return -EINVAL;
224}
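
The reworked check in nfsd4_block_decode_layoutupdate() above derives the expected extent count from the remaining buffer length rather than multiplying the on-wire count by the extent size, so a hostile count can no longer overflow the comparison. A minimal sketch of that validation order follows; the 44-byte extent size and all names are assumptions for illustration.

#include <stdint.h>
#include <stdio.h>

#define EXTENT_SIZE 44	/* illustrative stand-in for PNFS_BLOCK_EXTENT_SIZE */

static int validate_extent_array(uint32_t len, uint32_t nr_claimed)
{
	if (len < sizeof(uint32_t))
		return -1;			/* no room for the count itself */
	len -= sizeof(uint32_t);
	if (len % EXTENT_SIZE)
		return -1;			/* trailing garbage */
	if (nr_claimed != len / EXTENT_SIZE)
		return -1;			/* count disagrees with payload */
	return 0;
}

static int check_aligned(uint64_t val, uint32_t block_size)
{
	/* block_size is a power of two, so the low bits must be clear */
	return (val & (block_size - 1)) == 0;
}

int main(void)
{
	printf("%d %d\n", validate_extent_array(4 + 2 * EXTENT_SIZE, 2),
	       check_aligned(8192, 4096));
	return 0;
}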
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 6de925fe8499..397bc7563a49 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -15,6 +15,11 @@ struct pnfs_block_extent {
15 enum pnfs_block_extent_state es; 15 enum pnfs_block_extent_state es;
16}; 16};
17 17
18struct pnfs_block_range {
19 u64 foff;
20 u64 len;
21};
22
18/* 23/*
19 * Random upper cap for the uuid length to avoid unbounded allocation. 24 * Random upper cap for the uuid length to avoid unbounded allocation.
20 * Not actually limited by the protocol. 25 * Not actually limited by the protocol.
@@ -29,6 +34,13 @@ struct pnfs_block_volume {
29 u32 sig_len; 34 u32 sig_len;
30 u8 sig[PNFS_BLOCK_UUID_LEN]; 35 u8 sig[PNFS_BLOCK_UUID_LEN];
31 } simple; 36 } simple;
37 struct {
38 enum scsi_code_set code_set;
39 enum scsi_designator_type designator_type;
40 int designator_len;
41 u8 designator[256];
42 u64 pr_key;
43 } scsi;
32 }; 44 };
33}; 45};
34 46
@@ -43,5 +55,7 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
43 struct nfsd4_layoutget *lgp); 55 struct nfsd4_layoutget *lgp);
44int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, 56int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
45 u32 block_size); 57 u32 block_size);
58int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
59 u32 block_size);
46 60
47#endif /* _NFSD_BLOCKLAYOUTXDR_H */ 61#endif /* _NFSD_BLOCKLAYOUTXDR_H */
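
For reference, the reservation size used when encoding a PNFS_BLOCK_VOLUME_SCSI volume (4 + 4 + 4 + 4 + designator_len + 8 in the xdr hunk above) works without a padding term because XDR opaques are a length word plus data rounded up to 4 bytes, and the valid designator lengths (8, 12, 16) are already 4-byte multiples. A small sketch with invented helper names:

#include <stdio.h>

static unsigned xdr_opaque_size(unsigned data_len)
{
	return 4 + ((data_len + 3) & ~3u);	/* length word + padded payload */
}

static unsigned scsi_volume_size(unsigned designator_len)
{
	/* type + code_set + designator_type + opaque(designator) + pr_key */
	return 4 + 4 + 4 + xdr_opaque_size(designator_len) + 8;
}

int main(void)
{
	/* matches the hunk's 4+4+4+4+len+8 for len in {8, 12, 16} */
	printf("%u %u %u\n", scsi_volume_size(8), scsi_volume_size(12),
	       scsi_volume_size(16));
	return 0;
}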
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7b755b7f785c..51c3b06e8036 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -147,6 +147,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
147{ 147{
148 __be32 nfserr; 148 __be32 nfserr;
149 u32 max_blocksize = svc_max_payload(rqstp); 149 u32 max_blocksize = svc_max_payload(rqstp);
150 unsigned long cnt = min(argp->count, max_blocksize);
150 151
151 dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", 152 dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
152 SVCFH_fmt(&argp->fh), 153 SVCFH_fmt(&argp->fh),
@@ -157,7 +158,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
157 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) 158 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
158 * + 1 (xdr opaque byte count) = 26 159 * + 1 (xdr opaque byte count) = 26
159 */ 160 */
160 resp->count = min(argp->count, max_blocksize); 161 resp->count = cnt;
161 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); 162 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
162 163
163 fh_copy(&resp->fh, &argp->fh); 164 fh_copy(&resp->fh, &argp->fh);
@@ -167,8 +168,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
167 &resp->count); 168 &resp->count);
168 if (nfserr == 0) { 169 if (nfserr == 0) {
169 struct inode *inode = d_inode(resp->fh.fh_dentry); 170 struct inode *inode = d_inode(resp->fh.fh_dentry);
170 171 resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset,
171 resp->eof = (argp->offset + resp->count) >= inode->i_size; 172 inode->i_size);
172 } 173 }
173 174
174 RETURN_STATUS(nfserr); 175 RETURN_STATUS(nfserr);
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ce2d010d3b17..825c7bc8d789 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014 Christoph Hellwig.
3 */ 3 */
4#include <linux/blkdev.h>
4#include <linux/kmod.h> 5#include <linux/kmod.h>
5#include <linux/file.h> 6#include <linux/file.h>
6#include <linux/jhash.h> 7#include <linux/jhash.h>
@@ -26,7 +27,12 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
26static const struct lock_manager_operations nfsd4_layouts_lm_ops; 27static const struct lock_manager_operations nfsd4_layouts_lm_ops;
27 28
28const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { 29const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
30#ifdef CONFIG_NFSD_BLOCKLAYOUT
29 [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, 31 [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
32#endif
33#ifdef CONFIG_NFSD_SCSILAYOUT
34 [LAYOUT_SCSI] = &scsi_layout_ops,
35#endif
30}; 36};
31 37
32/* pNFS device ID to export fsid mapping */ 38/* pNFS device ID to export fsid mapping */
@@ -121,10 +127,24 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
121 if (!(exp->ex_flags & NFSEXP_PNFS)) 127 if (!(exp->ex_flags & NFSEXP_PNFS))
122 return; 128 return;
123 129
130 /*
131 * Check if the file system supports exporting a block-like layout.
132 * If the block device supports reservations, prefer the SCSI layout,
133 * otherwise advertise the block layout.
134 */
135#ifdef CONFIG_NFSD_BLOCKLAYOUT
124 if (sb->s_export_op->get_uuid && 136 if (sb->s_export_op->get_uuid &&
125 sb->s_export_op->map_blocks && 137 sb->s_export_op->map_blocks &&
126 sb->s_export_op->commit_blocks) 138 sb->s_export_op->commit_blocks)
127 exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; 139 exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
140#endif
141#ifdef CONFIG_NFSD_SCSILAYOUT
142 /* overwrite block layout selection if needed */
143 if (sb->s_export_op->map_blocks &&
144 sb->s_export_op->commit_blocks &&
145 sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
146 exp->ex_layout_type = LAYOUT_SCSI;
147#endif
128} 148}
129 149
130static void 150static void
@@ -590,8 +610,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
590 610
591 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); 611 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
592 612
593 trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
594
595 printk(KERN_WARNING 613 printk(KERN_WARNING
596 "nfsd: client %s failed to respond to layout recall. " 614 "nfsd: client %s failed to respond to layout recall. "
597 " Fencing..\n", addr_str); 615 " Fencing..\n", addr_str);
@@ -626,6 +644,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
626 container_of(cb, struct nfs4_layout_stateid, ls_recall); 644 container_of(cb, struct nfs4_layout_stateid, ls_recall);
627 struct nfsd_net *nn; 645 struct nfsd_net *nn;
628 ktime_t now, cutoff; 646 ktime_t now, cutoff;
647 const struct nfsd4_layout_ops *ops;
629 LIST_HEAD(reaplist); 648 LIST_HEAD(reaplist);
630 649
631 650
@@ -661,7 +680,13 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
661 /* 680 /*
662 * Unknown error or non-responding client, we'll need to fence. 681 * Unknown error or non-responding client, we'll need to fence.
663 */ 682 */
664 nfsd4_cb_layout_fail(ls); 683 trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
684
685 ops = nfsd4_layout_ops[ls->ls_layout_type];
686 if (ops->fence_client)
687 ops->fence_client(ls);
688 else
689 nfsd4_cb_layout_fail(ls);
665 return -1; 690 return -1;
666 } 691 }
667} 692}
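
The SCSI layout is only advertised above when the export's block device has pr_ops, because fencing relies on SCSI persistent reservations. The same primitives are reachable from userspace through the block-layer PR ioctls, which can be handy for checking a device before exporting it. A hedged sketch follows; the device path is a placeholder, so run it only against a scratch device you own.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/pr.h>

int main(void)
{
	/* mirror pr_register(bdev, 0, key, true): new key, ignore existing */
	struct pr_registration reg = { .old_key = 0,
				       .new_key = 0x0100000000000000ULL,
				       .flags = PR_FL_IGNORE_KEY };
	struct pr_reservation rsv = { .key = 0x0100000000000000ULL,
				      .type = PR_EXCLUSIVE_ACCESS_REG_ONLY };
	int fd = open("/dev/sdX", O_RDWR);	/* placeholder device */

	if (fd < 0)
		return 1;
	if (ioctl(fd, IOC_PR_REGISTER, &reg))	/* like ops->pr_register() */
		perror("IOC_PR_REGISTER");
	else if (ioctl(fd, IOC_PR_RESERVE, &rsv))	/* like ops->pr_reserve() */
		perror("IOC_PR_RESERVE");
	close(fd);
	return 0;
}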
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4cba7865f496..de1ff1d98bb1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -864,12 +864,10 @@ static __be32
864nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 864nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
865 struct nfsd4_secinfo *secinfo) 865 struct nfsd4_secinfo *secinfo)
866{ 866{
867 struct svc_fh resfh;
868 struct svc_export *exp; 867 struct svc_export *exp;
869 struct dentry *dentry; 868 struct dentry *dentry;
870 __be32 err; 869 __be32 err;
871 870
872 fh_init(&resfh, NFS4_FHSIZE);
873 err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC); 871 err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
874 if (err) 872 if (err)
875 return err; 873 return err;
@@ -878,6 +876,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
878 &exp, &dentry); 876 &exp, &dentry);
879 if (err) 877 if (err)
880 return err; 878 return err;
879 fh_unlock(&cstate->current_fh);
881 if (d_really_is_negative(dentry)) { 880 if (d_really_is_negative(dentry)) {
882 exp_put(exp); 881 exp_put(exp);
883 err = nfserr_noent; 882 err = nfserr_noent;
@@ -1269,8 +1268,10 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
1269 goto out; 1268 goto out;
1270 1269
1271 nfserr = nfs_ok; 1270 nfserr = nfs_ok;
1272 if (gdp->gd_maxcount != 0) 1271 if (gdp->gd_maxcount != 0) {
1273 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); 1272 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
1273 cstate->session->se_client, gdp);
1274 }
1274 1275
1275 gdp->gd_notify_types &= ops->notify_types; 1276 gdp->gd_notify_types &= ops->notify_types;
1276out: 1277out:
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index dc8ebecf5618..66eaeb1e8c2c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,10 +32,10 @@
32* 32*
33*/ 33*/
34 34
35#include <crypto/hash.h>
35#include <linux/file.h> 36#include <linux/file.h>
36#include <linux/slab.h> 37#include <linux/slab.h>
37#include <linux/namei.h> 38#include <linux/namei.h>
38#include <linux/crypto.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/fs.h> 40#include <linux/fs.h>
41#include <linux/module.h> 41#include <linux/module.h>
@@ -104,29 +104,35 @@ static int
104nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) 104nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
105{ 105{
106 struct xdr_netobj cksum; 106 struct xdr_netobj cksum;
107 struct hash_desc desc; 107 struct crypto_shash *tfm;
108 struct scatterlist sg;
109 int status; 108 int status;
110 109
111 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", 110 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
112 clname->len, clname->data); 111 clname->len, clname->data);
113 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 112 tfm = crypto_alloc_shash("md5", 0, 0);
114 desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); 113 if (IS_ERR(tfm)) {
115 if (IS_ERR(desc.tfm)) { 114 status = PTR_ERR(tfm);
116 status = PTR_ERR(desc.tfm);
117 goto out_no_tfm; 115 goto out_no_tfm;
118 } 116 }
119 117
120 cksum.len = crypto_hash_digestsize(desc.tfm); 118 cksum.len = crypto_shash_digestsize(tfm);
121 cksum.data = kmalloc(cksum.len, GFP_KERNEL); 119 cksum.data = kmalloc(cksum.len, GFP_KERNEL);
122 if (cksum.data == NULL) { 120 if (cksum.data == NULL) {
123 status = -ENOMEM; 121 status = -ENOMEM;
124 goto out; 122 goto out;
125 } 123 }
126 124
127 sg_init_one(&sg, clname->data, clname->len); 125 {
126 SHASH_DESC_ON_STACK(desc, tfm);
127
128 desc->tfm = tfm;
129 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
130
131 status = crypto_shash_digest(desc, clname->data, clname->len,
132 cksum.data);
133 shash_desc_zero(desc);
134 }
128 135
129 status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data);
130 if (status) 136 if (status)
131 goto out; 137 goto out;
132 138
@@ -135,7 +141,7 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
135 status = 0; 141 status = 0;
136out: 142out:
137 kfree(cksum.data); 143 kfree(cksum.data);
138 crypto_free_hash(desc.tfm); 144 crypto_free_shash(tfm);
139out_no_tfm: 145out_no_tfm:
140 return status; 146 return status;
141} 147}
@@ -1260,6 +1266,7 @@ nfsd4_umh_cltrack_init(struct net *net)
1260 /* XXX: The usermode helper is not working in container yet. */ 1266 /* XXX: The usermode helper is not working in container yet. */
1261 if (net != &init_net) { 1267 if (net != &init_net) {
1262 pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n"); 1268 pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n");
1269 kfree(grace_start);
1263 return -EINVAL; 1270 return -EINVAL;
1264 } 1271 }
1265 1272
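
The nfs4recover.c hunk above ports the MD5 checksum of the client name from the legacy hash_desc API to crypto_shash with an on-stack descriptor. A minimal kernel-module sketch of that one-shot digest pattern, written for a tree of this vintage (desc->flags still exists here); it is kernel-side code, not standalone userspace.

#include <crypto/hash.h>
#include <linux/module.h>

static int __init md5_demo_init(void)
{
	struct crypto_shash *tfm;
	u8 out[16];		/* MD5 digest is 16 bytes */
	int err;

	tfm = crypto_alloc_shash("md5", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
		err = crypto_shash_digest(desc, "client-1", 8, out);
		shash_desc_zero(desc);	/* scrub the stack descriptor */
	}

	crypto_free_shash(tfm);
	pr_info("md5 demo: %d %*phN\n", err, 16, out);
	return err;
}

static void __exit md5_demo_exit(void) { }

module_init(md5_demo_init);
module_exit(md5_demo_exit);
MODULE_LICENSE("GPL");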
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c484a2b6cd10..0462eeddfff9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2408,7 +2408,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
2408 default: /* checked by xdr code */ 2408 default: /* checked by xdr code */
2409 WARN_ON_ONCE(1); 2409 WARN_ON_ONCE(1);
2410 case SP4_SSV: 2410 case SP4_SSV:
2411 return nfserr_encr_alg_unsupp; 2411 status = nfserr_encr_alg_unsupp;
2412 goto out_nolock;
2412 } 2413 }
2413 2414
2414 /* Cases below refer to rfc 5661 section 18.35.4: */ 2415 /* Cases below refer to rfc 5661 section 18.35.4: */
@@ -2586,21 +2587,26 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
2586 return nfs_ok; 2587 return nfs_ok;
2587} 2588}
2588 2589
2590/*
2591 * Server's NFSv4.1 backchannel support is AUTH_SYS-only for now.
2592 * These are based on similar macros in linux/sunrpc/msg_prot.h .
2593 */
2594#define RPC_MAX_HEADER_WITH_AUTH_SYS \
2595 (RPC_CALLHDRSIZE + 2 * (2 + UNX_CALLSLACK))
2596
2597#define RPC_MAX_REPHEADER_WITH_AUTH_SYS \
2598 (RPC_REPHDRSIZE + (2 + NUL_REPLYSLACK))
2599
2589#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ 2600#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \
2590 RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32)) 2601 RPC_MAX_HEADER_WITH_AUTH_SYS) * sizeof(__be32))
2591#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ 2602#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \
2592 RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32)) 2603 RPC_MAX_REPHEADER_WITH_AUTH_SYS) * \
2604 sizeof(__be32))
2593 2605
2594static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) 2606static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
2595{ 2607{
2596 ca->headerpadsz = 0; 2608 ca->headerpadsz = 0;
2597 2609
2598 /*
2599 * These RPC_MAX_HEADER macros are overkill, especially since we
2600 * don't even do gss on the backchannel yet. But this is still
2601 * less than 1k. Tighten up this estimate in the unlikely event
2602 * it turns out to be a problem for some client:
2603 */
2604 if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) 2610 if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ)
2605 return nfserr_toosmall; 2611 return nfserr_toosmall;
2606 if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) 2612 if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ)
@@ -2710,10 +2716,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2710 goto out_free_conn; 2716 goto out_free_conn;
2711 } 2717 }
2712 status = nfs_ok; 2718 status = nfs_ok;
2713 /* 2719 /* Persistent sessions are not supported */
2714 * We do not support RDMA or persistent sessions
2715 */
2716 cr_ses->flags &= ~SESSION4_PERSIST; 2720 cr_ses->flags &= ~SESSION4_PERSIST;
2721 /* Upshifting from TCP to RDMA is not supported */
2717 cr_ses->flags &= ~SESSION4_RDMA; 2722 cr_ses->flags &= ~SESSION4_RDMA;
2718 2723
2719 init_session(rqstp, new, conf, cr_ses); 2724 init_session(rqstp, new, conf, cr_ses);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d6ef0955a979..9df898ba648f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1072,8 +1072,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
1072 1072
1073 READ_BUF(4); 1073 READ_BUF(4);
1074 rename->rn_snamelen = be32_to_cpup(p++); 1074 rename->rn_snamelen = be32_to_cpup(p++);
1075 READ_BUF(rename->rn_snamelen + 4); 1075 READ_BUF(rename->rn_snamelen);
1076 SAVEMEM(rename->rn_sname, rename->rn_snamelen); 1076 SAVEMEM(rename->rn_sname, rename->rn_snamelen);
1077 READ_BUF(4);
1077 rename->rn_tnamelen = be32_to_cpup(p++); 1078 rename->rn_tnamelen = be32_to_cpup(p++);
1078 READ_BUF(rename->rn_tnamelen); 1079 READ_BUF(rename->rn_tnamelen);
1079 SAVEMEM(rename->rn_tname, rename->rn_tnamelen); 1080 SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
@@ -1155,13 +1156,14 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
1155 READ_BUF(8); 1156 READ_BUF(8);
1156 setclientid->se_callback_prog = be32_to_cpup(p++); 1157 setclientid->se_callback_prog = be32_to_cpup(p++);
1157 setclientid->se_callback_netid_len = be32_to_cpup(p++); 1158 setclientid->se_callback_netid_len = be32_to_cpup(p++);
1158 1159 READ_BUF(setclientid->se_callback_netid_len);
1159 READ_BUF(setclientid->se_callback_netid_len + 4);
1160 SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); 1160 SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
1161 READ_BUF(4);
1161 setclientid->se_callback_addr_len = be32_to_cpup(p++); 1162 setclientid->se_callback_addr_len = be32_to_cpup(p++);
1162 1163
1163 READ_BUF(setclientid->se_callback_addr_len + 4); 1164 READ_BUF(setclientid->se_callback_addr_len);
1164 SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); 1165 SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
1166 READ_BUF(4);
1165 setclientid->se_callback_ident = be32_to_cpup(p++); 1167 setclientid->se_callback_ident = be32_to_cpup(p++);
1166 1168
1167 DECODE_TAIL; 1169 DECODE_TAIL;
@@ -1835,8 +1837,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1835 1837
1836 READ_BUF(4); 1838 READ_BUF(4);
1837 argp->taglen = be32_to_cpup(p++); 1839 argp->taglen = be32_to_cpup(p++);
1838 READ_BUF(argp->taglen + 8); 1840 READ_BUF(argp->taglen);
1839 SAVEMEM(argp->tag, argp->taglen); 1841 SAVEMEM(argp->tag, argp->taglen);
1842 READ_BUF(8);
1840 argp->minorversion = be32_to_cpup(p++); 1843 argp->minorversion = be32_to_cpup(p++);
1841 argp->opcnt = be32_to_cpup(p++); 1844 argp->opcnt = be32_to_cpup(p++);
1842 max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); 1845 max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
@@ -3060,7 +3063,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
3060 p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, 3063 p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
3061 NFS4_MAX_SESSIONID_LEN); 3064 NFS4_MAX_SESSIONID_LEN);
3062 *p++ = cpu_to_be32(bcts->dir); 3065 *p++ = cpu_to_be32(bcts->dir);
3063 /* Sorry, we do not yet support RDMA over 4.1: */ 3066 /* Upshifting from TCP to RDMA is not supported */
3064 *p++ = cpu_to_be32(0); 3067 *p++ = cpu_to_be32(0);
3065 } 3068 }
3066 return nfserr; 3069 return nfserr;
@@ -3362,6 +3365,7 @@ static __be32 nfsd4_encode_splice_read(
3362 struct xdr_stream *xdr = &resp->xdr; 3365 struct xdr_stream *xdr = &resp->xdr;
3363 struct xdr_buf *buf = xdr->buf; 3366 struct xdr_buf *buf = xdr->buf;
3364 u32 eof; 3367 u32 eof;
3368 long len;
3365 int space_left; 3369 int space_left;
3366 __be32 nfserr; 3370 __be32 nfserr;
3367 __be32 *p = xdr->p - 2; 3371 __be32 *p = xdr->p - 2;
@@ -3370,6 +3374,7 @@ static __be32 nfsd4_encode_splice_read(
3370 if (xdr->end - xdr->p < 1) 3374 if (xdr->end - xdr->p < 1)
3371 return nfserr_resource; 3375 return nfserr_resource;
3372 3376
3377 len = maxcount;
3373 nfserr = nfsd_splice_read(read->rd_rqstp, file, 3378 nfserr = nfsd_splice_read(read->rd_rqstp, file,
3374 read->rd_offset, &maxcount); 3379 read->rd_offset, &maxcount);
3375 if (nfserr) { 3380 if (nfserr) {
@@ -3382,8 +3387,8 @@ static __be32 nfsd4_encode_splice_read(
3382 return nfserr; 3387 return nfserr;
3383 } 3388 }
3384 3389
3385 eof = (read->rd_offset + maxcount >= 3390 eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
3386 d_inode(read->rd_fhp->fh_dentry)->i_size); 3391 d_inode(read->rd_fhp->fh_dentry)->i_size);
3387 3392
3388 *(p++) = htonl(eof); 3393 *(p++) = htonl(eof);
3389 *(p++) = htonl(maxcount); 3394 *(p++) = htonl(maxcount);
@@ -3453,14 +3458,15 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
3453 } 3458 }
3454 read->rd_vlen = v; 3459 read->rd_vlen = v;
3455 3460
3461 len = maxcount;
3456 nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec, 3462 nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
3457 read->rd_vlen, &maxcount); 3463 read->rd_vlen, &maxcount);
3458 if (nfserr) 3464 if (nfserr)
3459 return nfserr; 3465 return nfserr;
3460 xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); 3466 xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
3461 3467
3462 eof = (read->rd_offset + maxcount >= 3468 eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
3463 d_inode(read->rd_fhp->fh_dentry)->i_size); 3469 d_inode(read->rd_fhp->fh_dentry)->i_size);
3464 3470
3465 tmp = htonl(eof); 3471 tmp = htonl(eof);
3466 write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); 3472 write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
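
The READ_BUF() changes above split "length + 4" reads into two separately bounded steps, so a huge client-supplied length can no longer wrap the addition before the bounds check. A standalone sketch of that decode discipline, with invented names:

#include <stdint.h>
#include <stdio.h>

struct xdr_cursor {
	const uint8_t *p, *end;
};

static int read_buf(struct xdr_cursor *c, const uint8_t **out, uint32_t n)
{
	if ((size_t)(c->end - c->p) < n)	/* no n+4 arithmetic anywhere */
		return -1;
	*out = c->p;
	c->p += n;
	return 0;
}

static int decode_opaque(struct xdr_cursor *c, const uint8_t **data, uint32_t *len)
{
	const uint8_t *q;

	if (read_buf(c, &q, 4))			/* step 1: just the length word */
		return -1;
	*len = ((uint32_t)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
	return read_buf(c, data, *len);		/* step 2: exactly the payload */
}

int main(void)
{
	uint8_t wire[] = { 0, 0, 0, 3, 'f', 'o', 'o' };
	struct xdr_cursor c = { wire, wire + sizeof(wire) };
	const uint8_t *data;
	uint32_t len;

	if (decode_opaque(&c, &data, &len) == 0)
		printf("%.*s\n", (int)len, data);
	return 0;
}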
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index d4c4453674c6..7d073b9b1553 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
21 u32 notify_types; 21 u32 notify_types;
22 22
23 __be32 (*proc_getdeviceinfo)(struct super_block *sb, 23 __be32 (*proc_getdeviceinfo)(struct super_block *sb,
24 struct nfs4_client *clp,
24 struct nfsd4_getdeviceinfo *gdevp); 25 struct nfsd4_getdeviceinfo *gdevp);
25 __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, 26 __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
26 struct nfsd4_getdeviceinfo *gdevp); 27 struct nfsd4_getdeviceinfo *gdevp);
@@ -32,10 +33,17 @@ struct nfsd4_layout_ops {
32 33
33 __be32 (*proc_layoutcommit)(struct inode *inode, 34 __be32 (*proc_layoutcommit)(struct inode *inode,
34 struct nfsd4_layoutcommit *lcp); 35 struct nfsd4_layoutcommit *lcp);
36
37 void (*fence_client)(struct nfs4_layout_stateid *ls);
35}; 38};
36 39
37extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; 40extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
41#ifdef CONFIG_NFSD_BLOCKLAYOUT
38extern const struct nfsd4_layout_ops bl_layout_ops; 42extern const struct nfsd4_layout_ops bl_layout_ops;
43#endif
44#ifdef CONFIG_NFSD_SCSILAYOUT
45extern const struct nfsd4_layout_ops scsi_layout_ops;
46#endif
39 47
40__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, 48__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
41 struct nfsd4_compound_state *cstate, stateid_t *stateid, 49 struct nfsd4_compound_state *cstate, stateid_t *stateid,
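
fence_client is deliberately optional in the ops table above; nfs4layouts.c probes it and falls back to the old recall-failure path when a layout type has no fencing method. A toy sketch of that optional-method dispatch (types and names are stand-ins):

#include <stdio.h>

struct layout_ops {
	void (*fence_client)(int client_id);	/* optional, may be NULL */
};

static void scsi_fence(int client_id)
{
	printf("preempting PR key of client %d\n", client_id);
}

static void generic_recall_fail(int client_id)
{
	printf("no fencing op, revoking state of client %d\n", client_id);
}

static void handle_recall_failure(const struct layout_ops *ops, int client_id)
{
	if (ops->fence_client)
		ops->fence_client(client_id);
	else
		generic_recall_fail(client_id);
}

int main(void)
{
	struct layout_ops scsi = { .fence_client = scsi_fence };
	struct layout_ops block = { 0 };

	handle_recall_failure(&scsi, 42);
	handle_recall_failure(&block, 42);
	return 0;
}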
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5d2a57e4c03a..d40010e4f1a9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -870,7 +870,7 @@ __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
870 870
871 oldfs = get_fs(); 871 oldfs = get_fs();
872 set_fs(KERNEL_DS); 872 set_fs(KERNEL_DS);
873 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); 873 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0);
874 set_fs(oldfs); 874 set_fs(oldfs);
875 return nfsd_finish_read(file, count, host_err); 875 return nfsd_finish_read(file, count, host_err);
876} 876}
@@ -957,7 +957,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
957 957
958 /* Write the data. */ 958 /* Write the data. */
959 oldfs = get_fs(); set_fs(KERNEL_DS); 959 oldfs = get_fs(); set_fs(KERNEL_DS);
960 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); 960 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0);
961 set_fs(oldfs); 961 set_fs(oldfs);
962 if (host_err < 0) 962 if (host_err < 0)
963 goto out_nfserr; 963 goto out_nfserr;
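
vfs_readv() and vfs_writev() grew a per-call flags argument in this cycle; nfsd passes 0 to keep the old semantics. Userspace sees the same shape through preadv2()/pwritev2(). A small sketch (needs glibc 2.26 or later; the file path is just a placeholder):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char a[8], b[8];
	struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	int fd = open("/etc/hostname", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return 1;
	n = preadv2(fd, vec, 2, 0, 0);	/* last argument is the flags word */
	if (n >= 0)
		printf("read %zd bytes\n", n);
	close(fd);
	return 0;
}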
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index c11ba316f23f..2d573ec057f8 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -139,4 +139,23 @@ static inline int nfsd_create_is_exclusive(int createmode)
139 || createmode == NFS4_CREATE_EXCLUSIVE4_1; 139 || createmode == NFS4_CREATE_EXCLUSIVE4_1;
140} 140}
141 141
142static inline bool nfsd_eof_on_read(long requested, long read,
143 loff_t offset, loff_t size)
144{
145 /* We assume a short read means eof: */
146 if (requested > read)
147 return true;
148 /*
149 * A non-short read might also reach end of file. The spec
150 * still requires us to set eof in that case.
151 *
152 * Further operations may have modified the file size since
153 * the read, so the following check is not atomic with the read.
154 * We've only seen that cause a problem for a client in the case
155 * where the read returned a count of 0 without setting eof.
156 * That case was fixed by the addition of the above check.
157 */
158 return (offset + read >= size);
159}
160
142#endif /* LINUX_NFSD_VFS_H */ 161#endif /* LINUX_NFSD_VFS_H */
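
The helper above encodes two rules: a short read implies EOF, and a full-length read that ends exactly at the file size must still report EOF (the spec requires it, per the comment). A standalone restatement with a few checks:

#include <stdbool.h>
#include <stdio.h>

static bool eof_on_read(long requested, long read, long long offset, long long size)
{
	if (requested > read)		/* short read: assume EOF */
		return true;
	return offset + read >= size;	/* full read landing on the file end */
}

int main(void)
{
	/* 100-byte file */
	printf("%d\n", eof_on_read(50, 30, 0, 100));	/* 1: short read */
	printf("%d\n", eof_on_read(50, 50, 50, 100));	/* 1: read hits EOF */
	printf("%d\n", eof_on_read(50, 50, 0, 100));	/* 0: more to read */
	return 0;
}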
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 45d650addd56..c20df77eff99 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -180,7 +180,7 @@ void nilfs_page_bug(struct page *page)
180 180
181 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " 181 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
182 "mapping=%p ino=%lu\n", 182 "mapping=%p ino=%lu\n",
183 page, atomic_read(&page->_count), 183 page, page_ref_count(page),
184 (unsigned long long)page->index, page->flags, m, ino); 184 (unsigned long long)page->index, page->flags, m, ino);
185 185
186 if (page_has_buffers(page)) { 186 if (page_has_buffers(page)) {
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index ce210d4951a1..e27e6527912b 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -41,7 +41,8 @@ ocfs2-objs := \
41 quota_local.o \ 41 quota_local.o \
42 quota_global.o \ 42 quota_global.o \
43 xattr.o \ 43 xattr.o \
44 acl.o 44 acl.o \
45 filecheck.o
45 46
46ocfs2_stackglue-objs := stackglue.o 47ocfs2_stackglue-objs := stackglue.o
47ocfs2_stack_o2cb-objs := stack_o2cb.o 48ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d002579c6f2b..70907d638b60 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2516 struct ocfs2_extent_block *eb; 2516 struct ocfs2_extent_block *eb;
2517 u32 range; 2517 u32 range;
2518 2518
2519 /*
2520 * In normal tree rotation process, we will never touch the
2521 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
2522 * doesn't reserve the credits for them either.
2523 *
2524 * But we do have a special case here which will update the rightmost
2525 * records for all the bh in the path.
2526 * So we have to allocate extra credits and access them.
2527 */
2528 ret = ocfs2_extend_trans(handle, subtree_index);
2529 if (ret) {
2530 mlog_errno(ret);
2531 goto out;
2532 }
2533
2534 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 2519 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2535 if (ret) { 2520 if (ret) {
2536 mlog_errno(ret); 2521 mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
2956 right_path->p_node[subtree_root].bh->b_blocknr, 2941 right_path->p_node[subtree_root].bh->b_blocknr,
2957 right_path->p_tree_depth); 2942 right_path->p_tree_depth);
2958 2943
2959 ret = ocfs2_extend_rotate_transaction(handle, subtree_root, 2944 ret = ocfs2_extend_rotate_transaction(handle, 0,
2960 orig_credits, left_path); 2945 orig_credits, left_path);
2961 if (ret) { 2946 if (ret) {
2962 mlog_errno(ret); 2947 mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
3029 struct ocfs2_extent_block *eb; 3014 struct ocfs2_extent_block *eb;
3030 struct ocfs2_extent_list *el; 3015 struct ocfs2_extent_list *el;
3031 3016
3032
3033 ret = ocfs2_et_sanity_check(et); 3017 ret = ocfs2_et_sanity_check(et);
3034 if (ret) 3018 if (ret)
3035 goto out; 3019 goto out;
3036 /*
3037 * There's two ways we handle this depending on
3038 * whether path is the only existing one.
3039 */
3040 ret = ocfs2_extend_rotate_transaction(handle, 0,
3041 handle->h_buffer_credits,
3042 path);
3043 if (ret) {
3044 mlog_errno(ret);
3045 goto out;
3046 }
3047 3020
3048 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 3021 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3049 if (ret) { 3022 if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3641 */ 3614 */
3642 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3615 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3643 le16_to_cpu(el->l_next_free_rec) == 1) { 3616 le16_to_cpu(el->l_next_free_rec) == 1) {
3617 /* extend credit for ocfs2_remove_rightmost_path */
3618 ret = ocfs2_extend_rotate_transaction(handle, 0,
3619 handle->h_buffer_credits,
3620 right_path);
3621 if (ret) {
3622 mlog_errno(ret);
3623 goto out;
3624 }
3644 3625
3645 ret = ocfs2_remove_rightmost_path(handle, et, 3626 ret = ocfs2_remove_rightmost_path(handle, et,
3646 right_path, 3627 right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3679 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3660 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3680 3661
3681 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { 3662 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3663 /* extend credit for ocfs2_remove_rightmost_path */
3664 ret = ocfs2_extend_rotate_transaction(handle, 0,
3665 handle->h_buffer_credits,
3666 path);
3667 if (ret) {
3668 mlog_errno(ret);
3669 goto out;
3670 }
3682 /* 3671 /*
3683 * The merge code will need to create an empty 3672 * The merge code will need to create an empty
3684 * extent to take the place of the newly 3673 * extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3727 */ 3716 */
3728 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3717 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3729 3718
3719 /* extend credit for ocfs2_remove_rightmost_path */
3720 ret = ocfs2_extend_rotate_transaction(handle, 0,
3721 handle->h_buffer_credits,
3722 path);
3723 if (ret) {
3724 mlog_errno(ret);
3725 goto out;
3726 }
3727
3730 /* The merge left us with an empty extent, remove it. */ 3728 /* The merge left us with an empty extent, remove it. */
3731 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3729 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3732 if (ret) { 3730 if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3748 goto out; 3746 goto out;
3749 } 3747 }
3750 3748
3749 /* extend credit for ocfs2_remove_rightmost_path */
3750 ret = ocfs2_extend_rotate_transaction(handle, 0,
3751 handle->h_buffer_credits,
3752 path);
3753 if (ret) {
3754 mlog_errno(ret);
3755 goto out;
3756 }
3757
3751 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3758 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3752 /* 3759 /*
3753 * Error from this last rotate is not critical, so 3760 * Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3783 } 3790 }
3784 3791
3785 if (ctxt->c_split_covers_rec) { 3792 if (ctxt->c_split_covers_rec) {
3793 /* extend credit for ocfs2_remove_rightmost_path */
3794 ret = ocfs2_extend_rotate_transaction(handle, 0,
3795 handle->h_buffer_credits,
3796 path);
3797 if (ret) {
3798 mlog_errno(ret);
3799 ret = 0;
3800 goto out;
3801 }
3802
3786 /* 3803 /*
3787 * The merge may have left an empty extent in 3804 * The merge may have left an empty extent in
3788 * our leaf. Try to rotate it away. 3805 * our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
5342 struct ocfs2_extent_block *eb; 5359 struct ocfs2_extent_block *eb;
5343 5360
5344 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 5361 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5362 /* extend credit for ocfs2_remove_rightmost_path */
5363 ret = ocfs2_extend_rotate_transaction(handle, 0,
5364 handle->h_buffer_credits,
5365 path);
5366 if (ret) {
5367 mlog_errno(ret);
5368 goto out;
5369 }
5370
5345 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 5371 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5346 if (ret) { 5372 if (ret) {
5347 mlog_errno(ret); 5373 mlog_errno(ret);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5928 5954
5929 ocfs2_journal_dirty(handle, tl_bh); 5955 ocfs2_journal_dirty(handle, tl_bh);
5930 5956
5931 /* TODO: Perhaps we can calculate the bulk of the
5932 * credits up front rather than extending like
5933 * this. */
5934 status = ocfs2_extend_trans(handle,
5935 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5936 if (status < 0) {
5937 mlog_errno(status);
5938 goto bail;
5939 }
5940
5941 rec = tl->tl_recs[i]; 5957 rec = tl->tl_recs[i];
5942 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 5958 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5943 le32_to_cpu(rec.t_start)); 5959 le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5958 goto bail; 5974 goto bail;
5959 } 5975 }
5960 } 5976 }
5977
5978 status = ocfs2_extend_trans(handle,
5979 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5980 if (status < 0) {
5981 mlog_errno(status);
5982 goto bail;
5983 }
5961 i--; 5984 i--;
5962 } 5985 }
5963 5986
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6016 goto out_mutex; 6039 goto out_mutex;
6017 } 6040 }
6018 6041
6019 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 6042 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
6020 if (IS_ERR(handle)) { 6043 if (IS_ERR(handle)) {
6021 status = PTR_ERR(handle); 6044 status = PTR_ERR(handle);
6022 mlog_errno(status); 6045 mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
6079 if (cancel) 6102 if (cancel)
6080 cancel_delayed_work(&osb->osb_truncate_log_wq); 6103 cancel_delayed_work(&osb->osb_truncate_log_wq);
6081 6104
6082 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 6105 queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
6083 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 6106 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
6084 } 6107 }
6085} 6108}
@@ -6253,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6253 6276
6254 if (tl_inode) { 6277 if (tl_inode) {
6255 cancel_delayed_work(&osb->osb_truncate_log_wq); 6278 cancel_delayed_work(&osb->osb_truncate_log_wq);
6256 flush_workqueue(ocfs2_wq); 6279 flush_workqueue(osb->ocfs2_wq);
6257 6280
6258 status = ocfs2_flush_truncate_log(osb); 6281 status = ocfs2_flush_truncate_log(osb);
6259 if (status < 0) 6282 if (status < 0)
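
A recurring shape in the alloc.c hunks above: instead of ocfs2_remove_rightmost_path() extending the journal transaction internally, each caller now tops up the handle's credits immediately before the call. A toy model of that caller-extends discipline; the handle and credit types are stand-ins for jbd2, not the real interfaces.

#include <stdio.h>

struct handle {
	int buffer_credits;	/* credits still available on this handle */
};

/* stand-in for ocfs2_extend_trans()/ocfs2_extend_rotate_transaction() */
static int extend_trans(struct handle *h, int extra)
{
	h->buffer_credits += extra;	/* the real code may restart the handle */
	return 0;
}

static int remove_rightmost_path(struct handle *h, int credits_needed)
{
	if (h->buffer_credits < credits_needed)
		return -1;		/* would overflow the transaction */
	h->buffer_credits -= credits_needed;
	return 0;
}

int main(void)
{
	struct handle h = { .buffer_credits = 4 };

	/* the caller extends before the call, as the hunks above now do */
	if (extend_trans(&h, 8) == 0 &&
	    remove_rightmost_path(&h, 10) == 0)
		printf("ok, %d credits left\n", h.buffer_credits);
	return 0;
}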
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index cda0361e95a4..1581240a7ca0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,153 +499,6 @@ bail:
499 return status; 499 return status;
500} 500}
501 501
502/*
503 * TODO: Make this into a generic get_blocks function.
504 *
505 * From do_direct_io in direct-io.c:
506 * "So what we do is to permit the ->get_blocks function to populate
507 * bh.b_size with the size of IO which is permitted at this offset and
508 * this i_blkbits."
509 *
510 * This function is called directly from get_more_blocks in direct-io.c.
511 *
512 * called like this: dio->get_blocks(dio->inode, fs_startblk,
513 * fs_count, map_bh, dio->rw == WRITE);
514 */
515static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
516 struct buffer_head *bh_result, int create)
517{
518 int ret;
519 u32 cpos = 0;
520 int alloc_locked = 0;
521 u64 p_blkno, inode_blocks, contig_blocks;
522 unsigned int ext_flags;
523 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
524 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
525 unsigned long len = bh_result->b_size;
526 unsigned int clusters_to_alloc = 0, contig_clusters = 0;
527
528 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
529
530 /* This function won't even be called if the request isn't all
531 * nicely aligned and of the right size, so there's no need
532 * for us to check any of that. */
533
534 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
535
536 down_read(&OCFS2_I(inode)->ip_alloc_sem);
537
538 /* This figures out the size of the next contiguous block, and
539 * our logical offset */
540 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
541 &contig_blocks, &ext_flags);
542 up_read(&OCFS2_I(inode)->ip_alloc_sem);
543
544 if (ret) {
545 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
546 (unsigned long long)iblock);
547 ret = -EIO;
548 goto bail;
549 }
550
551 /* We should already CoW the refcounted extent in case of create. */
552 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
553
554 /* allocate blocks if no p_blkno is found, and create == 1 */
555 if (!p_blkno && create) {
556 ret = ocfs2_inode_lock(inode, NULL, 1);
557 if (ret < 0) {
558 mlog_errno(ret);
559 goto bail;
560 }
561
562 alloc_locked = 1;
563
564 down_write(&OCFS2_I(inode)->ip_alloc_sem);
565
566 /* fill hole, allocate blocks can't be larger than the size
567 * of the hole */
568 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
569 contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
570 contig_blocks);
571 if (clusters_to_alloc > contig_clusters)
572 clusters_to_alloc = contig_clusters;
573
574 /* allocate extent and insert them into the extent tree */
575 ret = ocfs2_extend_allocation(inode, cpos,
576 clusters_to_alloc, 0);
577 if (ret < 0) {
578 up_write(&OCFS2_I(inode)->ip_alloc_sem);
579 mlog_errno(ret);
580 goto bail;
581 }
582
583 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
584 &contig_blocks, &ext_flags);
585 if (ret < 0) {
586 up_write(&OCFS2_I(inode)->ip_alloc_sem);
587 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
588 (unsigned long long)iblock);
589 ret = -EIO;
590 goto bail;
591 }
592 set_buffer_new(bh_result);
593 up_write(&OCFS2_I(inode)->ip_alloc_sem);
594 }
595
596 /*
597 * get_more_blocks() expects us to describe a hole by clearing
598 * the mapped bit on bh_result().
599 *
600 * Consider an unwritten extent as a hole.
601 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno);
604 else
605 clear_buffer_mapped(bh_result);
606
607 /* make sure we don't map more than max_blocks blocks here as
608 that's all the kernel will handle at this point. */
609 if (max_blocks < contig_blocks)
610 contig_blocks = max_blocks;
611 bh_result->b_size = contig_blocks << blocksize_bits;
612bail:
613 if (alloc_locked)
614 ocfs2_inode_unlock(inode, 1);
615 return ret;
616}
617
618/*
619 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
620 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
621 * to protect io on one node from truncation on another.
622 */
623static void ocfs2_dio_end_io(struct kiocb *iocb,
624 loff_t offset,
625 ssize_t bytes,
626 void *private)
627{
628 struct inode *inode = file_inode(iocb->ki_filp);
629 int level;
630
631 /* this io's submitter should not have unlocked this before we could */
632 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
633
634 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
635 ocfs2_iocb_clear_unaligned_aio(iocb);
636
637 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
638 }
639
640 /* Let rw unlock to be done later to protect append direct io write */
641 if (offset + bytes <= i_size_read(inode)) {
642 ocfs2_iocb_clear_rw_locked(iocb);
643
644 level = ocfs2_iocb_rw_locked_level(iocb);
645 ocfs2_rw_unlock(inode, level);
646 }
647}
648
649static int ocfs2_releasepage(struct page *page, gfp_t wait) 502static int ocfs2_releasepage(struct page *page, gfp_t wait)
650{ 503{
651 if (!page_has_buffers(page)) 504 if (!page_has_buffers(page))
@@ -653,363 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
653 return try_to_free_buffers(page); 506 return try_to_free_buffers(page);
654} 507}
655 508
656static int ocfs2_is_overwrite(struct ocfs2_super *osb,
657 struct inode *inode, loff_t offset)
658{
659 int ret = 0;
660 u32 v_cpos = 0;
661 u32 p_cpos = 0;
662 unsigned int num_clusters = 0;
663 unsigned int ext_flags = 0;
664
665 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
666 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
667 &num_clusters, &ext_flags);
668 if (ret < 0) {
669 mlog_errno(ret);
670 return ret;
671 }
672
673 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
674 return 1;
675
676 return 0;
677}
678
679static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
680 struct inode *inode, loff_t offset,
681 u64 zero_len, int cluster_align)
682{
683 u32 p_cpos = 0;
684 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
685 unsigned int num_clusters = 0;
686 unsigned int ext_flags = 0;
687 int ret = 0;
688
689 if (offset <= i_size_read(inode) || cluster_align)
690 return 0;
691
692 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
693 &ext_flags);
694 if (ret < 0) {
695 mlog_errno(ret);
696 return ret;
697 }
698
699 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
700 u64 s = i_size_read(inode);
701 sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
702 (do_div(s, osb->s_clustersize) >> 9);
703
704 ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
705 zero_len >> 9, GFP_NOFS, false);
706 if (ret < 0)
707 mlog_errno(ret);
708 }
709
710 return ret;
711}
712
713static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
714 struct inode *inode, loff_t offset)
715{
716 u64 zero_start, zero_len, total_zero_len;
717 u32 p_cpos = 0, clusters_to_add;
718 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
719 unsigned int num_clusters = 0;
720 unsigned int ext_flags = 0;
721 u32 size_div, offset_div;
722 int ret = 0;
723
724 {
725 u64 o = offset;
726 u64 s = i_size_read(inode);
727
728 offset_div = do_div(o, osb->s_clustersize);
729 size_div = do_div(s, osb->s_clustersize);
730 }
731
732 if (offset <= i_size_read(inode))
733 return 0;
734
735 clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
736 ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
737 total_zero_len = offset - i_size_read(inode);
738 if (clusters_to_add)
739 total_zero_len -= offset_div;
740
741 /* Allocate clusters to fill out holes, and this is only needed
742 * when we add more than one clusters. Otherwise the cluster will
743 * be allocated during direct IO */
744 if (clusters_to_add > 1) {
745 ret = ocfs2_extend_allocation(inode,
746 OCFS2_I(inode)->ip_clusters,
747 clusters_to_add - 1, 0);
748 if (ret) {
749 mlog_errno(ret);
750 goto out;
751 }
752 }
753
754 while (total_zero_len) {
755 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
756 &ext_flags);
757 if (ret < 0) {
758 mlog_errno(ret);
759 goto out;
760 }
761
762 zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
763 size_div;
764 zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
765 size_div;
766 zero_len = min(total_zero_len, zero_len);
767
768 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
769 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
770 zero_start >> 9, zero_len >> 9,
771 GFP_NOFS, false);
772 if (ret < 0) {
773 mlog_errno(ret);
774 goto out;
775 }
776 }
777
778 total_zero_len -= zero_len;
779 v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
780
781 /* Only at first iteration can be cluster not aligned.
782 * So set size_div to 0 for the rest */
783 size_div = 0;
784 }
785
786out:
787 return ret;
788}
789
790static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
791 struct iov_iter *iter,
792 loff_t offset)
793{
794 ssize_t ret = 0;
795 ssize_t written = 0;
796 bool orphaned = false;
797 int is_overwrite = 0;
798 struct file *file = iocb->ki_filp;
799 struct inode *inode = file_inode(file)->i_mapping->host;
800 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
801 struct buffer_head *di_bh = NULL;
802 size_t count = iter->count;
803 journal_t *journal = osb->journal->j_journal;
804 u64 zero_len_head, zero_len_tail;
805 int cluster_align_head, cluster_align_tail;
806 loff_t final_size = offset + count;
807 int append_write = offset >= i_size_read(inode) ? 1 : 0;
808 unsigned int num_clusters = 0;
809 unsigned int ext_flags = 0;
810
811 {
812 u64 o = offset;
813 u64 s = i_size_read(inode);
814
815 zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
816 cluster_align_head = !zero_len_head;
817
818 zero_len_tail = osb->s_clustersize -
819 do_div(s, osb->s_clustersize);
820 if ((offset - i_size_read(inode)) < zero_len_tail)
821 zero_len_tail = offset - i_size_read(inode);
822 cluster_align_tail = !zero_len_tail;
823 }
824
825 /*
826 * when final_size > inode->i_size, inode->i_size will be
827 * updated after direct write, so add the inode to orphan
828 * dir first.
829 */
830 if (final_size > i_size_read(inode)) {
831 ret = ocfs2_add_inode_to_orphan(osb, inode);
832 if (ret < 0) {
833 mlog_errno(ret);
834 goto out;
835 }
836 orphaned = true;
837 }
838
839 if (append_write) {
840 ret = ocfs2_inode_lock(inode, NULL, 1);
841 if (ret < 0) {
842 mlog_errno(ret);
843 goto clean_orphan;
844 }
845
846	/* zero out the tail of the previously allocated cluster
847	 * that has not yet been zeroed */
848 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
849 down_read(&OCFS2_I(inode)->ip_alloc_sem);
850 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
851 zero_len_tail, cluster_align_tail);
852 up_read(&OCFS2_I(inode)->ip_alloc_sem);
853 } else {
854 down_write(&OCFS2_I(inode)->ip_alloc_sem);
855 ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
856 offset);
857 up_write(&OCFS2_I(inode)->ip_alloc_sem);
858 }
859 if (ret < 0) {
860 mlog_errno(ret);
861 ocfs2_inode_unlock(inode, 1);
862 goto clean_orphan;
863 }
864
865 is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
866 if (is_overwrite < 0) {
867 mlog_errno(is_overwrite);
868 ret = is_overwrite;
869 ocfs2_inode_unlock(inode, 1);
870 goto clean_orphan;
871 }
872
873 ocfs2_inode_unlock(inode, 1);
874 }
875
876 written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
877 offset, ocfs2_direct_IO_get_blocks,
878 ocfs2_dio_end_io, NULL, 0);
879	/* an overwrite AIO may return -EIOCBQUEUED, which is not an error */
880 if ((written < 0) && (written != -EIOCBQUEUED)) {
881 loff_t i_size = i_size_read(inode);
882
883 if (offset + count > i_size) {
884 ret = ocfs2_inode_lock(inode, &di_bh, 1);
885 if (ret < 0) {
886 mlog_errno(ret);
887 goto clean_orphan;
888 }
889
890 if (i_size == i_size_read(inode)) {
891 ret = ocfs2_truncate_file(inode, di_bh,
892 i_size);
893 if (ret < 0) {
894 if (ret != -ENOSPC)
895 mlog_errno(ret);
896
897 ocfs2_inode_unlock(inode, 1);
898 brelse(di_bh);
899 di_bh = NULL;
900 goto clean_orphan;
901 }
902 }
903
904 ocfs2_inode_unlock(inode, 1);
905 brelse(di_bh);
906 di_bh = NULL;
907
908 ret = jbd2_journal_force_commit(journal);
909 if (ret < 0)
910 mlog_errno(ret);
911 }
912 } else if (written > 0 && append_write && !is_overwrite &&
913 !cluster_align_head) {
914 /* zeroing out the allocated cluster head */
915 u32 p_cpos = 0;
916 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
917
918 ret = ocfs2_inode_lock(inode, NULL, 0);
919 if (ret < 0) {
920 mlog_errno(ret);
921 goto clean_orphan;
922 }
923
924 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
925 &num_clusters, &ext_flags);
926 if (ret < 0) {
927 mlog_errno(ret);
928 ocfs2_inode_unlock(inode, 0);
929 goto clean_orphan;
930 }
931
932 BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
933
934 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
935 (u64)p_cpos << (osb->s_clustersize_bits - 9),
936 zero_len_head >> 9, GFP_NOFS, false);
937 if (ret < 0)
938 mlog_errno(ret);
939
940 ocfs2_inode_unlock(inode, 0);
941 }
942
943clean_orphan:
944 if (orphaned) {
945 int tmp_ret;
946 int update_isize = written > 0 ? 1 : 0;
947 loff_t end = update_isize ? offset + written : 0;
948
949 tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
950 if (tmp_ret < 0) {
951 ret = tmp_ret;
952 mlog_errno(ret);
953 goto out;
954 }
955
956 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
957 update_isize, end);
958 if (tmp_ret < 0) {
959 ocfs2_inode_unlock(inode, 1);
960 ret = tmp_ret;
961 mlog_errno(ret);
962 brelse(di_bh);
963 goto out;
964 }
965
966 ocfs2_inode_unlock(inode, 1);
967 brelse(di_bh);
968
969 tmp_ret = jbd2_journal_force_commit(journal);
970 if (tmp_ret < 0) {
971 ret = tmp_ret;
972 mlog_errno(tmp_ret);
973 }
974 }
975
976out:
977 if (ret >= 0)
978 ret = written;
979 return ret;
980}
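
/* A toy model of the crash-safety protocol above (userspace stand-ins
 * only): before a size-extending direct write, the inode is put on the
 * orphan list; on success it is removed and i_size updated, so a crash
 * mid-write leaves an orphan that recovery can truncate back. */

#include <stdio.h>

struct toy_inode { long i_size; int orphaned; };

static long toy_dio_write(struct toy_inode *ino, long off, long len, int crash)
{
	if (off + len > ino->i_size)
		ino->orphaned = 1;	/* record the "may grow" intent */

	if (crash)
		return -1;		/* recovery will see orphaned == 1 */

	if (off + len > ino->i_size)
		ino->i_size = off + len;
	ino->orphaned = 0;		/* fully committed */
	return len;
}

static void toy_recover(struct toy_inode *ino)
{
	if (ino->orphaned) {		/* crashed mid-extend */
		printf("recovery: truncating back to %ld\n", ino->i_size);
		ino->orphaned = 0;
	}
}

int main(void)
{
	struct toy_inode ino = { .i_size = 4096 };

	toy_dio_write(&ino, 4096, 4096, 1);	/* simulate a crash */
	toy_recover(&ino);
	return 0;
}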
981
982static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
983 loff_t offset)
984{
985 struct file *file = iocb->ki_filp;
986 struct inode *inode = file_inode(file)->i_mapping->host;
987 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
988 int full_coherency = !(osb->s_mount_opt &
989 OCFS2_MOUNT_COHERENCY_BUFFERED);
990
991 /*
992	 * Fall back to buffered I/O if we see an inode without
993 * extents.
994 */
995 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
996 return 0;
997
998	/* Fall back to buffered I/O if we are appending and
999 * concurrent O_DIRECT writes are allowed.
1000 */
1001 if (i_size_read(inode) <= offset && !full_coherency)
1002 return 0;
1003
1004 if (iov_iter_rw(iter) == READ)
1005 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
1006 iter, offset,
1007 ocfs2_direct_IO_get_blocks,
1008 ocfs2_dio_end_io, NULL, 0);
1009 else
1010 return ocfs2_direct_IO_write(iocb, iter, offset);
1011}
1012
1013static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 509static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
1014 u32 cpos, 510 u32 cpos,
1015 unsigned int *start, 511 unsigned int *start,
@@ -1196,6 +692,13 @@ next_bh:
1196 692
1197#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) 693#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
1198 694
695struct ocfs2_unwritten_extent {
696 struct list_head ue_node;
697 struct list_head ue_ip_node;
698 u32 ue_cpos;
699 u32 ue_phys;
700};
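
/* A userspace sketch of why the struct above carries two list_heads: a
 * single allocation is linked into the write context's list (ue_node)
 * and the inode's list (ue_ip_node) at once, so either side can find
 * and unlink it. A minimal intrusive list stands in for the kernel's
 * struct list_head here. */

#include <stdio.h>
#include <stddef.h>

struct node { struct node *prev, *next; };

static void list_init(struct node *h) { h->prev = h->next = h; }
static void list_add_tail(struct node *n, struct node *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct unwritten_extent {
	struct node ue_node;	/* on the write context's list */
	struct node ue_ip_node;	/* on the inode's list */
	unsigned cpos;
};

int main(void)
{
	struct node wc_list, ip_list;
	struct unwritten_extent ue = { .cpos = 42 };

	list_init(&wc_list); list_init(&ip_list);
	list_add_tail(&ue.ue_node, &wc_list);
	list_add_tail(&ue.ue_ip_node, &ip_list);

	/* walk the inode-side list and recover the containing extent */
	struct unwritten_extent *found =
		container_of(ip_list.next, struct unwritten_extent, ue_ip_node);
	printf("found cpos %u via ue_ip_node\n", found->cpos);
	return 0;
}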
701
1199/* 702/*
1200 * Describe the state of a single cluster to be written to. 703 * Describe the state of a single cluster to be written to.
1201 */ 704 */
@@ -1207,7 +710,7 @@ struct ocfs2_write_cluster_desc {
1207 * filled. 710 * filled.
1208 */ 711 */
1209 unsigned c_new; 712 unsigned c_new;
1210 unsigned c_unwritten; 713 unsigned c_clear_unwritten;
1211 unsigned c_needs_zero; 714 unsigned c_needs_zero;
1212}; 715};
1213 716
@@ -1219,6 +722,9 @@ struct ocfs2_write_ctxt {
1219 /* First cluster allocated in a nonsparse extend */ 722 /* First cluster allocated in a nonsparse extend */
1220 u32 w_first_new_cpos; 723 u32 w_first_new_cpos;
1221 724
725 /* Type of caller. Must be one of buffer, mmap, direct. */
726 ocfs2_write_type_t w_type;
727
1222 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; 728 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
1223 729
1224 /* 730 /*
@@ -1267,6 +773,8 @@ struct ocfs2_write_ctxt {
1267 struct buffer_head *w_di_bh; 773 struct buffer_head *w_di_bh;
1268 774
1269 struct ocfs2_cached_dealloc_ctxt w_dealloc; 775 struct ocfs2_cached_dealloc_ctxt w_dealloc;
776
777 struct list_head w_unwritten_list;
1270}; 778};
1271 779
1272void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) 780void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1305,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
1305 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 813 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
1306} 814}
1307 815
1308static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 816static void ocfs2_free_unwritten_list(struct inode *inode,
817 struct list_head *head)
1309{ 818{
819 struct ocfs2_inode_info *oi = OCFS2_I(inode);
820 struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
821
822 list_for_each_entry_safe(ue, tmp, head, ue_node) {
823 list_del(&ue->ue_node);
824 spin_lock(&oi->ip_lock);
825 list_del(&ue->ue_ip_node);
826 spin_unlock(&oi->ip_lock);
827 kfree(ue);
828 }
829}
830
831static void ocfs2_free_write_ctxt(struct inode *inode,
832 struct ocfs2_write_ctxt *wc)
833{
834 ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
1310 ocfs2_unlock_pages(wc); 835 ocfs2_unlock_pages(wc);
1311 brelse(wc->w_di_bh); 836 brelse(wc->w_di_bh);
1312 kfree(wc); 837 kfree(wc);
@@ -1314,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
1314 839
1315static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, 840static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1316 struct ocfs2_super *osb, loff_t pos, 841 struct ocfs2_super *osb, loff_t pos,
1317 unsigned len, struct buffer_head *di_bh) 842 unsigned len, ocfs2_write_type_t type,
843 struct buffer_head *di_bh)
1318{ 844{
1319 u32 cend; 845 u32 cend;
1320 struct ocfs2_write_ctxt *wc; 846 struct ocfs2_write_ctxt *wc;
@@ -1329,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1329 wc->w_clen = cend - wc->w_cpos + 1; 855 wc->w_clen = cend - wc->w_cpos + 1;
1330 get_bh(di_bh); 856 get_bh(di_bh);
1331 wc->w_di_bh = di_bh; 857 wc->w_di_bh = di_bh;
858 wc->w_type = type;
1332 859
1333 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 860 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1334 wc->w_large_pages = 1; 861 wc->w_large_pages = 1;
@@ -1336,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1336 wc->w_large_pages = 0; 863 wc->w_large_pages = 0;
1337 864
1338 ocfs2_init_dealloc_ctxt(&wc->w_dealloc); 865 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
866 INIT_LIST_HEAD(&wc->w_unwritten_list);
1339 867
1340 *wcp = wc; 868 *wcp = wc;
1341 869
@@ -1396,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
1396 to = user_pos + user_len; 924 to = user_pos + user_len;
1397 struct page *tmppage; 925 struct page *tmppage;
1398 926
1399 ocfs2_zero_new_buffers(wc->w_target_page, from, to); 927 if (wc->w_target_page)
928 ocfs2_zero_new_buffers(wc->w_target_page, from, to);
1400 929
1401 for(i = 0; i < wc->w_num_pages; i++) { 930 for(i = 0; i < wc->w_num_pages; i++) {
1402 tmppage = wc->w_pages[i]; 931 tmppage = wc->w_pages[i];
1403 932
1404 if (page_has_buffers(tmppage)) { 933 if (tmppage && page_has_buffers(tmppage)) {
1405 if (ocfs2_should_order_data(inode)) 934 if (ocfs2_should_order_data(inode))
1406 ocfs2_jbd2_file_inode(wc->w_handle, inode); 935 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1407 936
@@ -1531,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1531 wc->w_num_pages = 1; 1060 wc->w_num_pages = 1;
1532 start = target_index; 1061 start = target_index;
1533 } 1062 }
1063 end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
1534 1064
1535 for(i = 0; i < wc->w_num_pages; i++) { 1065 for(i = 0; i < wc->w_num_pages; i++) {
1536 index = start + i; 1066 index = start + i;
1537 1067
1538 if (index == target_index && mmap_page) { 1068 if (index >= target_index && index <= end_index &&
1069 wc->w_type == OCFS2_WRITE_MMAP) {
1539 /* 1070 /*
1540 * ocfs2_pagemkwrite() is a little different 1071 * ocfs2_pagemkwrite() is a little different
1541 * and wants us to directly use the page 1072 * and wants us to directly use the page
@@ -1554,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1554 page_cache_get(mmap_page); 1085 page_cache_get(mmap_page);
1555 wc->w_pages[i] = mmap_page; 1086 wc->w_pages[i] = mmap_page;
1556 wc->w_target_locked = true; 1087 wc->w_target_locked = true;
1088 } else if (index >= target_index && index <= end_index &&
1089 wc->w_type == OCFS2_WRITE_DIRECT) {
1090 /* Direct write has no mapping page. */
1091 wc->w_pages[i] = NULL;
1092 continue;
1557 } else { 1093 } else {
1558 wc->w_pages[i] = find_or_create_page(mapping, index, 1094 wc->w_pages[i] = find_or_create_page(mapping, index,
1559 GFP_NOFS); 1095 GFP_NOFS);
@@ -1578,19 +1114,20 @@ out:
1578 * Prepare a single cluster for write one cluster into the file. 1114 * Prepare a single cluster for write one cluster into the file.
1579 */ 1115 */
1580static int ocfs2_write_cluster(struct address_space *mapping, 1116static int ocfs2_write_cluster(struct address_space *mapping,
1581 u32 phys, unsigned int unwritten, 1117 u32 *phys, unsigned int new,
1118 unsigned int clear_unwritten,
1582 unsigned int should_zero, 1119 unsigned int should_zero,
1583 struct ocfs2_alloc_context *data_ac, 1120 struct ocfs2_alloc_context *data_ac,
1584 struct ocfs2_alloc_context *meta_ac, 1121 struct ocfs2_alloc_context *meta_ac,
1585 struct ocfs2_write_ctxt *wc, u32 cpos, 1122 struct ocfs2_write_ctxt *wc, u32 cpos,
1586 loff_t user_pos, unsigned user_len) 1123 loff_t user_pos, unsigned user_len)
1587{ 1124{
1588 int ret, i, new; 1125 int ret, i;
1589 u64 v_blkno, p_blkno; 1126 u64 p_blkno;
1590 struct inode *inode = mapping->host; 1127 struct inode *inode = mapping->host;
1591 struct ocfs2_extent_tree et; 1128 struct ocfs2_extent_tree et;
1129 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
1592 1130
1593 new = phys == 0 ? 1 : 0;
1594 if (new) { 1131 if (new) {
1595 u32 tmp_pos; 1132 u32 tmp_pos;
1596 1133
@@ -1600,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1600 */ 1137 */
1601 tmp_pos = cpos; 1138 tmp_pos = cpos;
1602 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, 1139 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1603 &tmp_pos, 1, 0, wc->w_di_bh, 1140 &tmp_pos, 1, !clear_unwritten,
1604 wc->w_handle, data_ac, 1141 wc->w_di_bh, wc->w_handle,
1605 meta_ac, NULL); 1142 data_ac, meta_ac, NULL);
1606 /* 1143 /*
1607 * This shouldn't happen because we must have already 1144 * This shouldn't happen because we must have already
1608 * calculated the correct meta data allocation required. The 1145 * calculated the correct meta data allocation required. The
@@ -1619,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1619 mlog_errno(ret); 1156 mlog_errno(ret);
1620 goto out; 1157 goto out;
1621 } 1158 }
1622 } else if (unwritten) { 1159 } else if (clear_unwritten) {
1623 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), 1160 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1624 wc->w_di_bh); 1161 wc->w_di_bh);
1625 ret = ocfs2_mark_extent_written(inode, &et, 1162 ret = ocfs2_mark_extent_written(inode, &et,
1626 wc->w_handle, cpos, 1, phys, 1163 wc->w_handle, cpos, 1, *phys,
1627 meta_ac, &wc->w_dealloc); 1164 meta_ac, &wc->w_dealloc);
1628 if (ret < 0) { 1165 if (ret < 0) {
1629 mlog_errno(ret); 1166 mlog_errno(ret);
@@ -1631,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1631 } 1168 }
1632 } 1169 }
1633 1170
1634 if (should_zero)
1635 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1636 else
1637 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1638
1639 /* 1171 /*
1640 * The only reason this should fail is due to an inability to 1172 * The only reason this should fail is due to an inability to
1641 * find the extent added. 1173 * find the extent added.
1642 */ 1174 */
1643 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1175 ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
1644 NULL);
1645 if (ret < 0) { 1176 if (ret < 0) {
1646 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " 1177 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
1647 "at logical block %llu", 1178 "at logical cluster %u",
1648 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1179 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
1649 (unsigned long long)v_blkno);
1650 goto out; 1180 goto out;
1651 } 1181 }
1652 1182
1653 BUG_ON(p_blkno == 0); 1183 BUG_ON(*phys == 0);
1184
1185 p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
1186 if (!should_zero)
1187 p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
1654 1188
1655 for(i = 0; i < wc->w_num_pages; i++) { 1189 for(i = 0; i < wc->w_num_pages; i++) {
1656 int tmpret; 1190 int tmpret;
1657 1191
1192 /* This is the direct io target page. */
1193 if (wc->w_pages[i] == NULL) {
1194 p_blkno++;
1195 continue;
1196 }
1197
1658 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, 1198 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1659 wc->w_pages[i], cpos, 1199 wc->w_pages[i], cpos,
1660 user_pos, user_len, 1200 user_pos, user_len,
@@ -1701,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1701 if ((cluster_off + local_len) > osb->s_clustersize) 1241 if ((cluster_off + local_len) > osb->s_clustersize)
1702 local_len = osb->s_clustersize - cluster_off; 1242 local_len = osb->s_clustersize - cluster_off;
1703 1243
1704 ret = ocfs2_write_cluster(mapping, desc->c_phys, 1244 ret = ocfs2_write_cluster(mapping, &desc->c_phys,
1705 desc->c_unwritten, 1245 desc->c_new,
1246 desc->c_clear_unwritten,
1706 desc->c_needs_zero, 1247 desc->c_needs_zero,
1707 data_ac, meta_ac, 1248 data_ac, meta_ac,
1708 wc, desc->c_cpos, pos, local_len); 1249 wc, desc->c_cpos, pos, local_len);
@@ -1773,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1773} 1314}
1774 1315
1775/* 1316/*
1317 * Check if this extent is marked UNWRITTEN by direct io. If so, we need not
1318 * do the zero work, and should not clear UNWRITTEN since it will be cleared
1319 * by the direct io procedure.
1320 * If this is a new extent allocated by direct io, we should mark it in
1321 * the ip_unwritten_list.
1322 */
1323static int ocfs2_unwritten_check(struct inode *inode,
1324 struct ocfs2_write_ctxt *wc,
1325 struct ocfs2_write_cluster_desc *desc)
1326{
1327 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1328 struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
1329 int ret = 0;
1330
1331 if (!desc->c_needs_zero)
1332 return 0;
1333
1334retry:
1335 spin_lock(&oi->ip_lock);
1336	/* No need to zero, whether buffered or direct: whoever claimed
1337	 * the cluster is doing the zeroing, and will clear unwritten after
1338	 * all cluster io has finished. */
1339 list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
1340 if (desc->c_cpos == ue->ue_cpos) {
1341 BUG_ON(desc->c_new);
1342 desc->c_needs_zero = 0;
1343 desc->c_clear_unwritten = 0;
1344 goto unlock;
1345 }
1346 }
1347
1348 if (wc->w_type != OCFS2_WRITE_DIRECT)
1349 goto unlock;
1350
1351 if (new == NULL) {
1352 spin_unlock(&oi->ip_lock);
1353 new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
1354 GFP_NOFS);
1355 if (new == NULL) {
1356 ret = -ENOMEM;
1357 goto out;
1358 }
1359 goto retry;
1360 }
1361	/* This direct write will do the zeroing. */
1362 new->ue_cpos = desc->c_cpos;
1363 new->ue_phys = desc->c_phys;
1364 desc->c_clear_unwritten = 0;
1365 list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
1366 list_add_tail(&new->ue_node, &wc->w_unwritten_list);
1367 new = NULL;
1368unlock:
1369 spin_unlock(&oi->ip_lock);
1370out:
1371 if (new)
1372 kfree(new);
1373 return ret;
1374}
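
/* A userspace sketch of the lock/drop/allocate/retry pattern above,
 * with a pthread mutex standing in for the ip_lock spinlock: we must
 * not allocate while holding a spinlock, so on a miss we drop the
 * lock, allocate, and redo the lookup, since the list may have changed
 * while the lock was released. */

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *entry;			/* stands in for the list search */

static int insert_once(void)
{
	void *new = NULL;

retry:
	pthread_mutex_lock(&lock);
	if (entry) {			/* someone beat us to it */
		pthread_mutex_unlock(&lock);
		free(new);
		return 0;
	}
	if (!new) {
		pthread_mutex_unlock(&lock);
		new = malloc(64);	/* may sleep; must not hold the lock */
		if (!new)
			return -1;
		goto retry;		/* re-check under the lock */
	}
	entry = new;			/* publish while holding the lock */
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	int ret = insert_once();

	printf("insert_once: %d, entry %s\n", ret, entry ? "set" : "unset");
	free(entry);
	return 0;
}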
1375
1376/*
1776 * Populate each single-cluster write descriptor in the write context 1377 * Populate each single-cluster write descriptor in the write context
1777 * with information about the i/o to be done. 1378 * with information about the i/o to be done.
1778 * 1379 *
@@ -1847,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
1847 if (phys == 0) { 1448 if (phys == 0) {
1848 desc->c_new = 1; 1449 desc->c_new = 1;
1849 desc->c_needs_zero = 1; 1450 desc->c_needs_zero = 1;
1451 desc->c_clear_unwritten = 1;
1850 *clusters_to_alloc = *clusters_to_alloc + 1; 1452 *clusters_to_alloc = *clusters_to_alloc + 1;
1851 } 1453 }
1852 1454
1853 if (ext_flags & OCFS2_EXT_UNWRITTEN) { 1455 if (ext_flags & OCFS2_EXT_UNWRITTEN) {
1854 desc->c_unwritten = 1; 1456 desc->c_clear_unwritten = 1;
1855 desc->c_needs_zero = 1; 1457 desc->c_needs_zero = 1;
1856 } 1458 }
1857 1459
1460 ret = ocfs2_unwritten_check(inode, wc, desc);
1461 if (ret) {
1462 mlog_errno(ret);
1463 goto out;
1464 }
1465
1858 num_clusters--; 1466 num_clusters--;
1859 } 1467 }
1860 1468
@@ -2017,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
2017 if (ret) 1625 if (ret)
2018 mlog_errno(ret); 1626 mlog_errno(ret);
2019 1627
2020 wc->w_first_new_cpos = 1628 /* There is no wc if this is call from direct. */
2021 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); 1629 if (wc)
1630 wc->w_first_new_cpos =
1631 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
2022 1632
2023 return ret; 1633 return ret;
2024} 1634}
@@ -2072,9 +1682,8 @@ out:
2072 return ret; 1682 return ret;
2073} 1683}
2074 1684
2075int ocfs2_write_begin_nolock(struct file *filp, 1685int ocfs2_write_begin_nolock(struct address_space *mapping,
2076 struct address_space *mapping, 1686 loff_t pos, unsigned len, ocfs2_write_type_t type,
2077 loff_t pos, unsigned len, unsigned flags,
2078 struct page **pagep, void **fsdata, 1687 struct page **pagep, void **fsdata,
2079 struct buffer_head *di_bh, struct page *mmap_page) 1688 struct buffer_head *di_bh, struct page *mmap_page)
2080{ 1689{
@@ -2091,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
2091 int try_free = 1, ret1; 1700 int try_free = 1, ret1;
2092 1701
2093try_again: 1702try_again:
2094 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1703 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
2095 if (ret) { 1704 if (ret) {
2096 mlog_errno(ret); 1705 mlog_errno(ret);
2097 return ret; 1706 return ret;
@@ -2110,14 +1719,17 @@ try_again:
2110 } 1719 }
2111 } 1720 }
2112 1721
2113 if (ocfs2_sparse_alloc(osb)) 1722 /* Direct io change i_size late, should not zero tail here. */
2114 ret = ocfs2_zero_tail(inode, di_bh, pos); 1723 if (type != OCFS2_WRITE_DIRECT) {
2115 else 1724 if (ocfs2_sparse_alloc(osb))
2116 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, 1725 ret = ocfs2_zero_tail(inode, di_bh, pos);
2117 wc); 1726 else
2118 if (ret) { 1727 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2119 mlog_errno(ret); 1728 len, wc);
2120 goto out; 1729 if (ret) {
1730 mlog_errno(ret);
1731 goto out;
1732 }
2121 } 1733 }
2122 1734
2123 ret = ocfs2_check_range_for_refcount(inode, pos, len); 1735 ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2148,7 +1760,7 @@ try_again:
2148 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1760 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2149 (long long)i_size_read(inode), 1761 (long long)i_size_read(inode),
2150 le32_to_cpu(di->i_clusters), 1762 le32_to_cpu(di->i_clusters),
2151 pos, len, flags, mmap_page, 1763 pos, len, type, mmap_page,
2152 clusters_to_alloc, extents_to_split); 1764 clusters_to_alloc, extents_to_split);
2153 1765
2154 /* 1766 /*
@@ -2178,17 +1790,17 @@ try_again:
2178 1790
2179 credits = ocfs2_calc_extend_credits(inode->i_sb, 1791 credits = ocfs2_calc_extend_credits(inode->i_sb,
2180 &di->id2.i_list); 1792 &di->id2.i_list);
2181 1793 } else if (type == OCFS2_WRITE_DIRECT)
2182 } 1794 /* direct write needs not to start trans if no extents alloc. */
1795 goto success;
2183 1796
2184 /* 1797 /*
2185 * We have to zero sparse allocated clusters, unwritten extent clusters, 1798 * We have to zero sparse allocated clusters, unwritten extent clusters,
2186 * and non-sparse clusters we just extended. For non-sparse writes, 1799 * and non-sparse clusters we just extended. For non-sparse writes,
2187 * we know zeros will only be needed in the first and/or last cluster. 1800 * we know zeros will only be needed in the first and/or last cluster.
2188 */ 1801 */
2189 if (clusters_to_alloc || extents_to_split || 1802 if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
2190 (wc->w_clen && (wc->w_desc[0].c_needs_zero || 1803 wc->w_desc[wc->w_clen - 1].c_needs_zero))
2191 wc->w_desc[wc->w_clen - 1].c_needs_zero)))
2192 cluster_of_pages = 1; 1804 cluster_of_pages = 1;
2193 else 1805 else
2194 cluster_of_pages = 0; 1806 cluster_of_pages = 0;
@@ -2255,7 +1867,8 @@ try_again:
2255 ocfs2_free_alloc_context(meta_ac); 1867 ocfs2_free_alloc_context(meta_ac);
2256 1868
2257success: 1869success:
2258 *pagep = wc->w_target_page; 1870 if (pagep)
1871 *pagep = wc->w_target_page;
2259 *fsdata = wc; 1872 *fsdata = wc;
2260 return 0; 1873 return 0;
2261out_quota: 1874out_quota:
@@ -2266,7 +1879,7 @@ out_commit:
2266 ocfs2_commit_trans(osb, handle); 1879 ocfs2_commit_trans(osb, handle);
2267 1880
2268out: 1881out:
2269 ocfs2_free_write_ctxt(wc); 1882 ocfs2_free_write_ctxt(inode, wc);
2270 1883
2271 if (data_ac) { 1884 if (data_ac) {
2272 ocfs2_free_alloc_context(data_ac); 1885 ocfs2_free_alloc_context(data_ac);
@@ -2318,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
2318 */ 1931 */
2319 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1932 down_write(&OCFS2_I(inode)->ip_alloc_sem);
2320 1933
2321 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, 1934 ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
2322 fsdata, di_bh, NULL); 1935 pagep, fsdata, di_bh, NULL);
2323 if (ret) { 1936 if (ret) {
2324 mlog_errno(ret); 1937 mlog_errno(ret);
2325 goto out_fail; 1938 goto out_fail;
@@ -2376,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2376 handle_t *handle = wc->w_handle; 1989 handle_t *handle = wc->w_handle;
2377 struct page *tmppage; 1990 struct page *tmppage;
2378 1991
2379 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, 1992 BUG_ON(!list_empty(&wc->w_unwritten_list));
2380 OCFS2_JOURNAL_ACCESS_WRITE); 1993
2381 if (ret) { 1994 if (handle) {
2382 copied = ret; 1995 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
2383 mlog_errno(ret); 1996 wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2384 goto out; 1997 if (ret) {
1998 copied = ret;
1999 mlog_errno(ret);
2000 goto out;
2001 }
2385 } 2002 }
2386 2003
2387 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2004 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2389,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2389 goto out_write_size; 2006 goto out_write_size;
2390 } 2007 }
2391 2008
2392 if (unlikely(copied < len)) { 2009 if (unlikely(copied < len) && wc->w_target_page) {
2393 if (!PageUptodate(wc->w_target_page)) 2010 if (!PageUptodate(wc->w_target_page))
2394 copied = 0; 2011 copied = 0;
2395 2012
2396 ocfs2_zero_new_buffers(wc->w_target_page, start+copied, 2013 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
2397 start+len); 2014 start+len);
2398 } 2015 }
2399 flush_dcache_page(wc->w_target_page); 2016 if (wc->w_target_page)
2017 flush_dcache_page(wc->w_target_page);
2400 2018
2401 for(i = 0; i < wc->w_num_pages; i++) { 2019 for(i = 0; i < wc->w_num_pages; i++) {
2402 tmppage = wc->w_pages[i]; 2020 tmppage = wc->w_pages[i];
2403 2021
2022 /* This is the direct io target page. */
2023 if (tmppage == NULL)
2024 continue;
2025
2404 if (tmppage == wc->w_target_page) { 2026 if (tmppage == wc->w_target_page) {
2405 from = wc->w_target_from; 2027 from = wc->w_target_from;
2406 to = wc->w_target_to; 2028 to = wc->w_target_to;
@@ -2419,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2419 } 2041 }
2420 2042
2421 if (page_has_buffers(tmppage)) { 2043 if (page_has_buffers(tmppage)) {
2422 if (ocfs2_should_order_data(inode)) 2044 if (handle && ocfs2_should_order_data(inode))
2423 ocfs2_jbd2_file_inode(wc->w_handle, inode); 2045 ocfs2_jbd2_file_inode(handle, inode);
2424 block_commit_write(tmppage, from, to); 2046 block_commit_write(tmppage, from, to);
2425 } 2047 }
2426 } 2048 }
2427 2049
2428out_write_size: 2050out_write_size:
2429 pos += copied; 2051 /* Direct io do not update i_size here. */
2430 if (pos > i_size_read(inode)) { 2052 if (wc->w_type != OCFS2_WRITE_DIRECT) {
2431 i_size_write(inode, pos); 2053 pos += copied;
2432 mark_inode_dirty(inode); 2054 if (pos > i_size_read(inode)) {
2433 } 2055 i_size_write(inode, pos);
2434 inode->i_blocks = ocfs2_inode_sector_count(inode); 2056 mark_inode_dirty(inode);
2435 di->i_size = cpu_to_le64((u64)i_size_read(inode)); 2057 }
2436 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2058 inode->i_blocks = ocfs2_inode_sector_count(inode);
2437 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 2059 di->i_size = cpu_to_le64((u64)i_size_read(inode));
2438 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 2060 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2439 ocfs2_update_inode_fsync_trans(handle, inode, 1); 2061 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
2440 ocfs2_journal_dirty(handle, wc->w_di_bh); 2062 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
2063 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2064 }
2065 if (handle)
2066 ocfs2_journal_dirty(handle, wc->w_di_bh);
2441 2067
2442out: 2068out:
2443 /* unlock pages before dealloc since it needs acquiring j_trans_barrier 2069 /* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2447,7 +2073,8 @@ out:
2447 */ 2073 */
2448 ocfs2_unlock_pages(wc); 2074 ocfs2_unlock_pages(wc);
2449 2075
2450 ocfs2_commit_trans(osb, handle); 2076 if (handle)
2077 ocfs2_commit_trans(osb, handle);
2451 2078
2452 ocfs2_run_deallocs(osb, &wc->w_dealloc); 2079 ocfs2_run_deallocs(osb, &wc->w_dealloc);
2453 2080
@@ -2472,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
2472 return ret; 2099 return ret;
2473} 2100}
2474 2101
2102struct ocfs2_dio_write_ctxt {
2103 struct list_head dw_zero_list;
2104 unsigned dw_zero_count;
2105 int dw_orphaned;
2106 pid_t dw_writer_pid;
2107};
2108
2109static struct ocfs2_dio_write_ctxt *
2110ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
2111{
2112 struct ocfs2_dio_write_ctxt *dwc = NULL;
2113
2114 if (bh->b_private)
2115 return bh->b_private;
2116
2117 dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
2118 if (dwc == NULL)
2119 return NULL;
2120 INIT_LIST_HEAD(&dwc->dw_zero_list);
2121 dwc->dw_zero_count = 0;
2122 dwc->dw_orphaned = 0;
2123 dwc->dw_writer_pid = task_pid_nr(current);
2124 bh->b_private = dwc;
2125 *alloc = 1;
2126
2127 return dwc;
2128}
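
/* A sketch of the get-or-create pattern above (userspace stand-ins
 * only): the write context rides in bh->b_private across the repeated
 * get_block calls of one direct IO, and the out-parameter tells the
 * caller whether this call created it, i.e. whether this is the first
 * get_block of the IO. */

#include <stdio.h>
#include <stdlib.h>

struct toy_bh { void *b_private; };
struct toy_dio_ctx { int zero_count; };

static struct toy_dio_ctx *get_or_create_ctx(struct toy_bh *bh, int *created)
{
	struct toy_dio_ctx *ctx;

	if (bh->b_private)
		return bh->b_private;	/* later call: reuse */

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;
	bh->b_private = ctx;
	*created = 1;			/* first call of this IO */
	return ctx;
}

int main(void)
{
	struct toy_bh bh = { 0 };
	int first = 0;

	get_or_create_ctx(&bh, &first);
	printf("first call created ctx: %d\n", first);
	first = 0;
	get_or_create_ctx(&bh, &first);
	printf("second call created ctx: %d\n", first);
	free(bh.b_private);
	return 0;
}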
2129
2130static void ocfs2_dio_free_write_ctx(struct inode *inode,
2131 struct ocfs2_dio_write_ctxt *dwc)
2132{
2133 ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
2134 kfree(dwc);
2135}
2136
2137/*
2138 * TODO: Make this into a generic get_blocks function.
2139 *
2140 * From do_direct_io in direct-io.c:
2141 * "So what we do is to permit the ->get_blocks function to populate
2142 * bh.b_size with the size of IO which is permitted at this offset and
2143 * this i_blkbits."
2144 *
2145 * This function is called directly from get_more_blocks in direct-io.c.
2146 *
2147 * called like this: dio->get_blocks(dio->inode, fs_startblk,
2148 * fs_count, map_bh, dio->rw == WRITE);
2149 */
2150static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
2151 struct buffer_head *bh_result, int create)
2152{
2153 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2154 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2155 struct ocfs2_write_ctxt *wc;
2156 struct ocfs2_write_cluster_desc *desc = NULL;
2157 struct ocfs2_dio_write_ctxt *dwc = NULL;
2158 struct buffer_head *di_bh = NULL;
2159 u64 p_blkno;
2160 loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
2161 unsigned len, total_len = bh_result->b_size;
2162 int ret = 0, first_get_block = 0;
2163
2164 len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
2165 len = min(total_len, len);
2166
2167 mlog(0, "get block of %lu at %llu:%u req %u\n",
2168 inode->i_ino, pos, len, total_len);
2169
2170 /*
2171	 * Because we may need to change the file size in ocfs2_dio_end_io_write(),
2172	 * or add the inode to the orphan dir, we cannot take the fast path
2173	 * when the file size will be changed.
2174 */
2175 if (pos + total_len <= i_size_read(inode)) {
2176 down_read(&oi->ip_alloc_sem);
2177		/* This is the fast path for rewrites. */
2178 ret = ocfs2_get_block(inode, iblock, bh_result, create);
2179
2180 up_read(&oi->ip_alloc_sem);
2181
2182 if (buffer_mapped(bh_result) &&
2183 !buffer_new(bh_result) &&
2184 ret == 0)
2185 goto out;
2186
2187 /* Clear state set by ocfs2_get_block. */
2188 bh_result->b_state = 0;
2189 }
2190
2191 dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
2192 if (unlikely(dwc == NULL)) {
2193 ret = -ENOMEM;
2194 mlog_errno(ret);
2195 goto out;
2196 }
2197
2198 if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
2199 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
2200 !dwc->dw_orphaned) {
2201 /*
2202	 * When we are going to allocate extents beyond the file size, add the
2203	 * inode to the orphan dir first, so we can reclaim that space if the
2204	 * system crashes during the write.
2205 */
2206 ret = ocfs2_add_inode_to_orphan(osb, inode);
2207 if (ret < 0) {
2208 mlog_errno(ret);
2209 goto out;
2210 }
2211 dwc->dw_orphaned = 1;
2212 }
2213
2214 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2215 if (ret) {
2216 mlog_errno(ret);
2217 goto out;
2218 }
2219
2220 down_write(&oi->ip_alloc_sem);
2221
2222 if (first_get_block) {
2223 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
2224 ret = ocfs2_zero_tail(inode, di_bh, pos);
2225 else
2226 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2227 total_len, NULL);
2228 if (ret < 0) {
2229 mlog_errno(ret);
2230 goto unlock;
2231 }
2232 }
2233
2234 ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
2235 OCFS2_WRITE_DIRECT, NULL,
2236 (void **)&wc, di_bh, NULL);
2237 if (ret) {
2238 mlog_errno(ret);
2239 goto unlock;
2240 }
2241
2242 desc = &wc->w_desc[0];
2243
2244 p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
2245 BUG_ON(p_blkno == 0);
2246 p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
2247
2248 map_bh(bh_result, inode->i_sb, p_blkno);
2249 bh_result->b_size = len;
2250 if (desc->c_needs_zero)
2251 set_buffer_new(bh_result);
2252
2253	/* end_io may sleep, which must not happen in irq context, so defer
2254	 * completion to the dio work queue. */
2255 set_buffer_defer_completion(bh_result);
2256
2257 if (!list_empty(&wc->w_unwritten_list)) {
2258 struct ocfs2_unwritten_extent *ue = NULL;
2259
2260 ue = list_first_entry(&wc->w_unwritten_list,
2261 struct ocfs2_unwritten_extent,
2262 ue_node);
2263 BUG_ON(ue->ue_cpos != desc->c_cpos);
2264		/* The physical address may still be 0; fill it in now. */
2265 ue->ue_phys = desc->c_phys;
2266
2267 list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
2268 dwc->dw_zero_count++;
2269 }
2270
2271 ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
2272 BUG_ON(ret != len);
2273 ret = 0;
2274unlock:
2275 up_write(&oi->ip_alloc_sem);
2276 ocfs2_inode_unlock(inode, 1);
2277 brelse(di_bh);
2278out:
2279 if (ret < 0)
2280 ret = -EIO;
2281 return ret;
2282}
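
/* A sketch of the block-mapping arithmetic in the function above:
 * translate a logical fs block into a physical block by mapping its
 * cluster and re-adding the block offset within the cluster. The
 * shifts are assumptions (4 KB blocks, 1 MB clusters); the kernel
 * derives them from the superblock. */

#include <stdio.h>
#include <stdint.h>

#define BLKBITS	12				/* assumed 4 KB blocks */
#define CLBITS	20				/* assumed 1 MB clusters */
#define BPC	(1u << (CLBITS - BLKBITS))	/* blocks per cluster */

int main(void)
{
	uint64_t iblock = 300;			/* logical block */
	uint32_t v_cluster = iblock >> (CLBITS - BLKBITS);
	uint32_t p_cluster = 7;			/* pretend extent lookup */

	uint64_t p_blkno = (uint64_t)p_cluster << (CLBITS - BLKBITS);
	p_blkno += iblock & (uint64_t)(BPC - 1);	/* offset inside cluster */

	printf("logical block %llu -> cluster %u -> physical block %llu\n",
	       (unsigned long long)iblock, v_cluster,
	       (unsigned long long)p_blkno);
	return 0;
}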
2283
2284static void ocfs2_dio_end_io_write(struct inode *inode,
2285 struct ocfs2_dio_write_ctxt *dwc,
2286 loff_t offset,
2287 ssize_t bytes)
2288{
2289 struct ocfs2_cached_dealloc_ctxt dealloc;
2290 struct ocfs2_extent_tree et;
2291 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2292 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2293 struct ocfs2_unwritten_extent *ue = NULL;
2294 struct buffer_head *di_bh = NULL;
2295 struct ocfs2_dinode *di;
2296 struct ocfs2_alloc_context *data_ac = NULL;
2297 struct ocfs2_alloc_context *meta_ac = NULL;
2298 handle_t *handle = NULL;
2299 loff_t end = offset + bytes;
2300 int ret = 0, credits = 0, locked = 0;
2301
2302 ocfs2_init_dealloc_ctxt(&dealloc);
2303
2304	/* We clear unwritten extents, delete the orphan entry, and change i_size
2305	 * here. If none of these is needed, we can skip all this. */
2306 if (list_empty(&dwc->dw_zero_list) &&
2307 end <= i_size_read(inode) &&
2308 !dwc->dw_orphaned)
2309 goto out;
2310
2311	/* ocfs2_file_write_iter will have taken i_mutex, so we need not lock
2312	 * again if we are in that context. */
2313 if (dwc->dw_writer_pid != task_pid_nr(current)) {
2314 mutex_lock(&inode->i_mutex);
2315 locked = 1;
2316 }
2317
2318 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2319 if (ret < 0) {
2320 mlog_errno(ret);
2321 goto out;
2322 }
2323
2324 down_write(&oi->ip_alloc_sem);
2325
2326	/* Delete the orphan entry before acquiring i_mutex. */
2327 if (dwc->dw_orphaned) {
2328 BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
2329
2330 end = end > i_size_read(inode) ? end : 0;
2331
2332 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
2333 !!end, end);
2334 if (ret < 0)
2335 mlog_errno(ret);
2336 }
2337
2338 di = (struct ocfs2_dinode *)di_bh;
2339
2340 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
2341
2342 ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
2343 &data_ac, &meta_ac);
2344 if (ret) {
2345 mlog_errno(ret);
2346 goto unlock;
2347 }
2348
2349 credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
2350
2351 handle = ocfs2_start_trans(osb, credits);
2352 if (IS_ERR(handle)) {
2353 ret = PTR_ERR(handle);
2354 mlog_errno(ret);
2355 goto unlock;
2356 }
2357 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2358 OCFS2_JOURNAL_ACCESS_WRITE);
2359 if (ret) {
2360 mlog_errno(ret);
2361 goto commit;
2362 }
2363
2364 list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
2365 ret = ocfs2_mark_extent_written(inode, &et, handle,
2366 ue->ue_cpos, 1,
2367 ue->ue_phys,
2368 meta_ac, &dealloc);
2369 if (ret < 0) {
2370 mlog_errno(ret);
2371 break;
2372 }
2373 }
2374
2375 if (end > i_size_read(inode)) {
2376 ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
2377 if (ret < 0)
2378 mlog_errno(ret);
2379 }
2380commit:
2381 ocfs2_commit_trans(osb, handle);
2382unlock:
2383 up_write(&oi->ip_alloc_sem);
2384 ocfs2_inode_unlock(inode, 1);
2385 brelse(di_bh);
2386out:
2387 if (data_ac)
2388 ocfs2_free_alloc_context(data_ac);
2389 if (meta_ac)
2390 ocfs2_free_alloc_context(meta_ac);
2391 ocfs2_run_deallocs(osb, &dealloc);
2392 if (locked)
2393 mutex_unlock(&inode->i_mutex);
2394 ocfs2_dio_free_write_ctx(inode, dwc);
2395}
2396
2397/*
2398 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
2399 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
2400 * to protect io on one node from truncation on another.
2401 */
2402static int ocfs2_dio_end_io(struct kiocb *iocb,
2403 loff_t offset,
2404 ssize_t bytes,
2405 void *private)
2406{
2407 struct inode *inode = file_inode(iocb->ki_filp);
2408 int level;
2409
2410 if (bytes <= 0)
2411 return 0;
2412
2413 /* this io's submitter should not have unlocked this before we could */
2414 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
2415
2416 if (private)
2417 ocfs2_dio_end_io_write(inode, private, offset, bytes);
2418
2419 ocfs2_iocb_clear_rw_locked(iocb);
2420
2421 level = ocfs2_iocb_rw_locked_level(iocb);
2422 ocfs2_rw_unlock(inode, level);
2423 return 0;
2424}
2425
2426static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
2427 loff_t offset)
2428{
2429 struct file *file = iocb->ki_filp;
2430 struct inode *inode = file_inode(file)->i_mapping->host;
2431 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2432 loff_t end = offset + iter->count;
2433 get_block_t *get_block;
2434
2435 /*
2436	 * Fall back to buffered I/O if we see an inode without
2437 * extents.
2438 */
2439 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2440 return 0;
2441
2442	/* Fall back to buffered I/O if we do not support append dio. */
2443 if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
2444 return 0;
2445
2446 if (iov_iter_rw(iter) == READ)
2447 get_block = ocfs2_get_block;
2448 else
2449 get_block = ocfs2_dio_get_block;
2450
2451 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
2452 iter, offset, get_block,
2453 ocfs2_dio_end_io, NULL, 0);
2454}
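
/* A sketch of the dispatch above (userspace stand-ins; names are
 * illustrative): reads keep the plain get_block while writes use the
 * direct-IO-aware one, and returning 0 from ->direct_IO tells the VFS
 * to fall back to buffered IO. */

#include <stdio.h>

typedef long (*get_block_t)(unsigned long iblock);

static long plain_get_block(unsigned long b) { return (long)b; }
static long toy_dio_get_block(unsigned long b) { return (long)b; }

static long toy_direct_io(int is_read, long end, long i_size, int append_dio)
{
	get_block_t get_block;

	if (end > i_size && !append_dio)
		return 0;		/* 0 => VFS falls back to buffered IO */

	get_block = is_read ? plain_get_block : toy_dio_get_block;
	return get_block(0) >= 0 ? end : -1;	/* would drive the block IO */
}

int main(void)
{
	printf("append without support: %ld (buffered fallback)\n",
	       toy_direct_io(0, 8192, 4096, 0));
	printf("append with support:    %ld\n",
	       toy_direct_io(0, 8192, 4096, 1));
	return 0;
}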
2455
2475const struct address_space_operations ocfs2_aops = { 2456const struct address_space_operations ocfs2_aops = {
2476 .readpage = ocfs2_readpage, 2457 .readpage = ocfs2_readpage,
2477 .readpages = ocfs2_readpages, 2458 .readpages = ocfs2_readpages,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 24e496d6bdcd..b1c9f28a57b1 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
47 loff_t pos, unsigned len, unsigned copied, 47 loff_t pos, unsigned len, unsigned copied,
48 struct page *page, void *fsdata); 48 struct page *page, void *fsdata);
49 49
50int ocfs2_write_begin_nolock(struct file *filp, 50typedef enum {
51 struct address_space *mapping, 51 OCFS2_WRITE_BUFFER = 0,
52 loff_t pos, unsigned len, unsigned flags, 52 OCFS2_WRITE_DIRECT,
53 OCFS2_WRITE_MMAP,
54} ocfs2_write_type_t;
55
56int ocfs2_write_begin_nolock(struct address_space *mapping,
57 loff_t pos, unsigned len, ocfs2_write_type_t type,
53 struct page **pagep, void **fsdata, 58 struct page **pagep, void **fsdata,
54 struct buffer_head *di_bh, struct page *mmap_page); 59 struct buffer_head *di_bh, struct page *mmap_page);
55 60
@@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
79enum ocfs2_iocb_lock_bits { 84enum ocfs2_iocb_lock_bits {
80 OCFS2_IOCB_RW_LOCK = 0, 85 OCFS2_IOCB_RW_LOCK = 0,
81 OCFS2_IOCB_RW_LOCK_LEVEL, 86 OCFS2_IOCB_RW_LOCK_LEVEL,
82 OCFS2_IOCB_UNALIGNED_IO,
83 OCFS2_IOCB_NUM_LOCKS 87 OCFS2_IOCB_NUM_LOCKS
84}; 88};
85 89
@@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits {
88#define ocfs2_iocb_rw_locked_level(iocb) \ 92#define ocfs2_iocb_rw_locked_level(iocb) \
89 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) 93 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
90 94
91#define ocfs2_iocb_set_unaligned_aio(iocb) \
92 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
93#define ocfs2_iocb_clear_unaligned_aio(iocb) \
94 clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
95#define ocfs2_iocb_is_unaligned_aio(iocb) \
96 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
97
98#endif /* OCFS2_FILE_H */ 95#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a76b9ea7722e..bd15929b5f92 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -287,7 +287,6 @@ struct o2hb_bio_wait_ctxt {
287static void o2hb_write_timeout(struct work_struct *work) 287static void o2hb_write_timeout(struct work_struct *work)
288{ 288{
289 int failed, quorum; 289 int failed, quorum;
290 unsigned long flags;
291 struct o2hb_region *reg = 290 struct o2hb_region *reg =
292 container_of(work, struct o2hb_region, 291 container_of(work, struct o2hb_region,
293 hr_write_timeout_work.work); 292 hr_write_timeout_work.work);
@@ -297,14 +296,14 @@ static void o2hb_write_timeout(struct work_struct *work)
297 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 296 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
298 297
299 if (o2hb_global_heartbeat_active()) { 298 if (o2hb_global_heartbeat_active()) {
300 spin_lock_irqsave(&o2hb_live_lock, flags); 299 spin_lock(&o2hb_live_lock);
301 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 300 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
302 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); 301 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
303 failed = bitmap_weight(o2hb_failed_region_bitmap, 302 failed = bitmap_weight(o2hb_failed_region_bitmap,
304 O2NM_MAX_REGIONS); 303 O2NM_MAX_REGIONS);
305 quorum = bitmap_weight(o2hb_quorum_region_bitmap, 304 quorum = bitmap_weight(o2hb_quorum_region_bitmap,
306 O2NM_MAX_REGIONS); 305 O2NM_MAX_REGIONS);
307 spin_unlock_irqrestore(&o2hb_live_lock, flags); 306 spin_unlock(&o2hb_live_lock);
308 307
309 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", 308 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
310 quorum, failed); 309 quorum, failed);
@@ -1445,8 +1444,8 @@ static void o2hb_region_release(struct config_item *item)
1445 debugfs_remove(reg->hr_debug_dir); 1444 debugfs_remove(reg->hr_debug_dir);
1446 kfree(reg->hr_db_livenodes); 1445 kfree(reg->hr_db_livenodes);
1447 kfree(reg->hr_db_regnum); 1446 kfree(reg->hr_db_regnum);
1448 kfree(reg->hr_debug_elapsed_time); 1447 kfree(reg->hr_db_elapsed_time);
1449 kfree(reg->hr_debug_pinned); 1448 kfree(reg->hr_db_pinned);
1450 1449
1451 spin_lock(&o2hb_live_lock); 1450 spin_lock(&o2hb_live_lock);
1452 list_del(&reg->hr_all_item); 1451 list_del(&reg->hr_all_item);
@@ -2425,11 +2424,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
2425int o2hb_check_node_heartbeating_no_sem(u8 node_num) 2424int o2hb_check_node_heartbeating_no_sem(u8 node_num)
2426{ 2425{
2427 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 2426 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
2428 unsigned long flags;
2429 2427
2430 spin_lock_irqsave(&o2hb_live_lock, flags); 2428 spin_lock(&o2hb_live_lock);
2431 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); 2429 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
2432 spin_unlock_irqrestore(&o2hb_live_lock, flags); 2430 spin_unlock(&o2hb_live_lock);
2433 if (!test_bit(node_num, testing_map)) { 2431 if (!test_bit(node_num, testing_map)) {
2434 mlog(ML_HEARTBEAT, 2432 mlog(ML_HEARTBEAT,
2435 "node (%u) does not have heartbeating enabled.\n", 2433 "node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ebe543894db0..b17d180bdc16 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -630,7 +630,6 @@ static void o2nm_cluster_release(struct config_item *item)
630{ 630{
631 struct o2nm_cluster *cluster = to_o2nm_cluster(item); 631 struct o2nm_cluster *cluster = to_o2nm_cluster(item);
632 632
633 kfree(cluster->cl_group.default_groups);
634 kfree(cluster); 633 kfree(cluster);
635} 634}
636 635
@@ -666,7 +665,6 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
666 struct o2nm_cluster *cluster = NULL; 665 struct o2nm_cluster *cluster = NULL;
667 struct o2nm_node_group *ns = NULL; 666 struct o2nm_node_group *ns = NULL;
668 struct config_group *o2hb_group = NULL, *ret = NULL; 667 struct config_group *o2hb_group = NULL, *ret = NULL;
669 void *defs = NULL;
670 668
671 /* this runs under the parent dir's i_mutex; there can be only 669 /* this runs under the parent dir's i_mutex; there can be only
672 * one caller in here at a time */ 670 * one caller in here at a time */
@@ -675,20 +673,18 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
675 673
676 cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); 674 cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
677 ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); 675 ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
678 defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
679 o2hb_group = o2hb_alloc_hb_set(); 676 o2hb_group = o2hb_alloc_hb_set();
680 if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) 677 if (cluster == NULL || ns == NULL || o2hb_group == NULL)
681 goto out; 678 goto out;
682 679
683 config_group_init_type_name(&cluster->cl_group, name, 680 config_group_init_type_name(&cluster->cl_group, name,
684 &o2nm_cluster_type); 681 &o2nm_cluster_type);
682 configfs_add_default_group(&ns->ns_group, &cluster->cl_group);
683
685 config_group_init_type_name(&ns->ns_group, "node", 684 config_group_init_type_name(&ns->ns_group, "node",
686 &o2nm_node_group_type); 685 &o2nm_node_group_type);
686 configfs_add_default_group(o2hb_group, &cluster->cl_group);
687 687
688 cluster->cl_group.default_groups = defs;
689 cluster->cl_group.default_groups[0] = &ns->ns_group;
690 cluster->cl_group.default_groups[1] = o2hb_group;
691 cluster->cl_group.default_groups[2] = NULL;
692 rwlock_init(&cluster->cl_nodes_lock); 688 rwlock_init(&cluster->cl_nodes_lock);
693 cluster->cl_node_ip_tree = RB_ROOT; 689 cluster->cl_node_ip_tree = RB_ROOT;
694 cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; 690 cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
@@ -704,7 +700,6 @@ out:
704 kfree(cluster); 700 kfree(cluster);
705 kfree(ns); 701 kfree(ns);
706 o2hb_free_hb_set(o2hb_group); 702 o2hb_free_hb_set(o2hb_group);
707 kfree(defs);
708 ret = ERR_PTR(-ENOMEM); 703 ret = ERR_PTR(-ENOMEM);
709 } 704 }
710 705
@@ -714,18 +709,11 @@ out:
714static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) 709static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
715{ 710{
716 struct o2nm_cluster *cluster = to_o2nm_cluster(item); 711 struct o2nm_cluster *cluster = to_o2nm_cluster(item);
717 int i;
718 struct config_item *killme;
719 712
720 BUG_ON(o2nm_single_cluster != cluster); 713 BUG_ON(o2nm_single_cluster != cluster);
721 o2nm_single_cluster = NULL; 714 o2nm_single_cluster = NULL;
722 715
723 for (i = 0; cluster->cl_group.default_groups[i]; i++) { 716 configfs_remove_default_groups(&cluster->cl_group);
724 killme = &cluster->cl_group.default_groups[i]->cg_item;
725 cluster->cl_group.default_groups[i] = NULL;
726 config_item_put(killme);
727 }
728
729 config_item_put(item); 717 config_item_put(item);
730} 718}
731 719
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 68c607e63ff6..004f2cbe8f71 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
282#define DLM_LOCK_RES_DROPPING_REF 0x00000040 282#define DLM_LOCK_RES_DROPPING_REF 0x00000040
283#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 283#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
284#define DLM_LOCK_RES_SETREF_INPROG 0x00002000 284#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
285#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
285 286
286/* max milliseconds to wait to sync up a network failure with a node death */ 287/* max milliseconds to wait to sync up a network failure with a node death */
287#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) 288#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -451,6 +452,7 @@ enum {
451 DLM_QUERY_REGION = 519, 452 DLM_QUERY_REGION = 519,
452 DLM_QUERY_NODEINFO = 520, 453 DLM_QUERY_NODEINFO = 520,
453 DLM_BEGIN_EXIT_DOMAIN_MSG = 521, 454 DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
455 DLM_DEREF_LOCKRES_DONE = 522,
454}; 456};
455 457
456struct dlm_reco_node_data 458struct dlm_reco_node_data
@@ -545,7 +547,7 @@ struct dlm_master_requery
545 * }; 547 * };
546 * 548 *
547 * from ../cluster/tcp.h 549 * from ../cluster/tcp.h
548 * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) 550 * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
549 * (roughly 4080 bytes) 551 * (roughly 4080 bytes)
550 * and sizeof(dlm_migratable_lockres) = 112 bytes 552 * and sizeof(dlm_migratable_lockres) = 112 bytes
551 * and sizeof(dlm_migratable_lock) = 16 bytes 553 * and sizeof(dlm_migratable_lock) = 16 bytes
@@ -586,7 +588,7 @@ struct dlm_migratable_lockres
586 588
587/* from above, 128 bytes 589/* from above, 128 bytes
588 * for some undetermined future use */ 590 * for some undetermined future use */
589#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ 591#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \
590 DLM_MIG_LOCKRES_MAX_LEN) 592 DLM_MIG_LOCKRES_MAX_LEN)
591 593
592struct dlm_create_lock 594struct dlm_create_lock
@@ -782,6 +784,20 @@ struct dlm_deref_lockres
782 u8 name[O2NM_MAX_NAME_LEN]; 784 u8 name[O2NM_MAX_NAME_LEN];
783}; 785};
784 786
787enum {
788 DLM_DEREF_RESPONSE_DONE = 0,
789 DLM_DEREF_RESPONSE_INPROG = 1,
790};
791
792struct dlm_deref_lockres_done {
793 u32 pad1;
794 u16 pad2;
795 u8 node_idx;
796 u8 namelen;
797
798 u8 name[O2NM_MAX_NAME_LEN];
799};
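
/* A C11 sketch checking the wire layout of the new message: the two
 * pad fields align node_idx/namelen so the fixed-size header is
 * exactly 8 bytes before the name. The 64-byte name length is an
 * assumption standing in for O2NM_MAX_NAME_LEN. */

#include <stdint.h>
#include <stddef.h>
#include <assert.h>

struct deref_lockres_done {
	uint32_t pad1;
	uint16_t pad2;
	uint8_t  node_idx;
	uint8_t  namelen;
	uint8_t  name[64];	/* assumed O2NM_MAX_NAME_LEN */
};

static_assert(offsetof(struct deref_lockres_done, name) == 8,
	      "header must be 8 bytes");
static_assert(sizeof(struct deref_lockres_done) == 72,
	      "no unexpected padding");

int main(void) { return 0; }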
800
785static inline enum dlm_status 801static inline enum dlm_status
786__dlm_lockres_state_to_status(struct dlm_lock_resource *res) 802__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
787{ 803{
@@ -789,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
789 805
790 assert_spin_locked(&res->spinlock); 806 assert_spin_locked(&res->spinlock);
791 807
792 if (res->state & DLM_LOCK_RES_RECOVERING) 808 if (res->state & (DLM_LOCK_RES_RECOVERING|
809 DLM_LOCK_RES_RECOVERY_WAITING))
793 status = DLM_RECOVERING; 810 status = DLM_RECOVERING;
794 else if (res->state & DLM_LOCK_RES_MIGRATING) 811 else if (res->state & DLM_LOCK_RES_MIGRATING)
795 status = DLM_MIGRATING; 812 status = DLM_MIGRATING;
@@ -968,6 +985,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
968void dlm_assert_master_post_handler(int status, void *data, void *ret_data); 985void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
969int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 986int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
970 void **ret_data); 987 void **ret_data);
988int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
989 void **ret_data);
971int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, 990int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
972 void **ret_data); 991 void **ret_data);
973int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 992int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -1009,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
1009{ 1028{
1010 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| 1029 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
1011 DLM_LOCK_RES_RECOVERING| 1030 DLM_LOCK_RES_RECOVERING|
1031 DLM_LOCK_RES_RECOVERY_WAITING|
1012 DLM_LOCK_RES_MIGRATING)); 1032 DLM_LOCK_RES_MIGRATING));
1013} 1033}
1014 1034
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63ff1783..cdeafb4e7ed6 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -212,6 +212,12 @@ grant:
212 if (lock->lksb->flags & DLM_LKSB_PUT_LVB) 212 if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
213 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); 213 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
214 214
215 /*
216 * Move the lock to the tail because it may be the only lock which has
217 * an invalid lvb.
218 */
219 list_move_tail(&lock->list, &res->granted);
220
215 status = DLM_NORMAL; 221 status = DLM_NORMAL;
216 *call_ast = 1; 222 *call_ast = 1;
217 goto unlock_exit; 223 goto unlock_exit;
@@ -262,6 +268,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
262 struct dlm_lock *lock, int flags, int type) 268 struct dlm_lock *lock, int flags, int type)
263{ 269{
264 enum dlm_status status; 270 enum dlm_status status;
271 u8 old_owner = res->owner;
265 272
266 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, 273 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
267 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); 274 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +294,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
287 status = DLM_DENIED; 294 status = DLM_DENIED;
288 goto bail; 295 goto bail;
289 } 296 }
297
298 if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
299 mlog(0, "last convert request returned DLM_RECOVERING, but "
300 "owner has already queued and sent ast to me. res %.*s, "
301 "(cookie=%u:%llu, type=%d, conv=%d)\n",
302 res->lockname.len, res->lockname.name,
303 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
304 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
305 lock->ml.type, lock->ml.convert_type);
306 status = DLM_NORMAL;
307 goto bail;
308 }
309
290 res->state |= DLM_LOCK_RES_IN_PROGRESS; 310 res->state |= DLM_LOCK_RES_IN_PROGRESS;
291 /* move lock to local convert queue */ 311 /* move lock to local convert queue */
292 /* do not alter lock refcount. switching lists. */ 312 /* do not alter lock refcount. switching lists. */
@@ -316,11 +336,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
316 spin_lock(&res->spinlock); 336 spin_lock(&res->spinlock);
317 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 337 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
318 lock->convert_pending = 0; 338 lock->convert_pending = 0;
319	/* if it failed, move it back to granted queue */ 339	/* if it failed, move it back to the granted queue.
340	 * If the master returned DLM_NORMAL and then went down before sending
341	 * the ast, the lock may already have been moved to the granted queue;
342	 * reset status to DLM_RECOVERING and retry the convert */
320 if (status != DLM_NORMAL) { 343 if (status != DLM_NORMAL) {
321 if (status != DLM_NOTQUEUED) 344 if (status != DLM_NOTQUEUED)
322 dlm_error(status); 345 dlm_error(status);
323 dlm_revert_pending_convert(res, lock); 346 dlm_revert_pending_convert(res, lock);
347 } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
348 (old_owner != res->owner)) {
349 mlog(0, "res %.*s is in recovering or has been recovered.\n",
350 res->lockname.len, res->lockname.name);
351 status = DLM_RECOVERING;
324 } 352 }
325bail: 353bail:
326 spin_unlock(&res->spinlock); 354 spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2ee7fe747cea..12e064b8be9a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
132 * - Message DLM_QUERY_NODEINFO added to allow online node removes 132 * - Message DLM_QUERY_NODEINFO added to allow online node removes
133 * New in version 1.2: 133 * New in version 1.2:
134 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain 134 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
135 * New in version 1.3:
136 * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the
137 * refmap is cleared
135 */ 138 */
136static const struct dlm_protocol_version dlm_protocol = { 139static const struct dlm_protocol_version dlm_protocol = {
137 .pv_major = 1, 140 .pv_major = 1,
138 .pv_minor = 2, 141 .pv_minor = 3,
139}; 142};
140 143
141#define DLM_DOMAIN_BACKOFF_MS 200 144#define DLM_DOMAIN_BACKOFF_MS 200
@@ -1396,7 +1399,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
1396 unsigned int map_size) 1399 unsigned int map_size)
1397{ 1400{
1398 int status, tmpstat; 1401 int status, tmpstat;
1399 unsigned int node; 1402 int node;
1400 1403
1401 if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * 1404 if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
1402 sizeof(unsigned long))) { 1405 sizeof(unsigned long))) {
@@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1853 sizeof(struct dlm_exit_domain), 1856 sizeof(struct dlm_exit_domain),
1854 dlm_begin_exit_domain_handler, 1857 dlm_begin_exit_domain_handler,
1855 dlm, NULL, &dlm->dlm_domain_handlers); 1858 dlm, NULL, &dlm->dlm_domain_handlers);
1859 if (status)
1860 goto bail;
1856 1861
1862 status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key,
1863 sizeof(struct dlm_deref_lockres_done),
1864 dlm_deref_lockres_done_handler,
1865 dlm, NULL, &dlm->dlm_domain_handlers);
1857bail: 1866bail:
1858 if (status) 1867 if (status)
1859 dlm_unregister_domain_handlers(dlm); 1868 dlm_unregister_domain_handlers(dlm);
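
Besides bumping the protocol minor version to 1.3, the dlmdomain.c hunk adds the missing error check after the begin-exit-domain registration and then registers the new DLM_DEREF_LOCKRES_DONE handler, relying on the single bail label to unwind every handler registered so far. A minimal userspace imitation of that register-then-bail shape follows; register_a, register_b and unregister_all are hypothetical stand-ins for the o2net calls.

#include <stdio.h>

static int register_a(void) { return 0; }       /* hypothetical handler 1 */
static int register_b(void) { return 0; }       /* hypothetical handler 2 */
static void unregister_all(void) { puts("unwinding all handlers"); }

static int register_handlers(void)
{
        int status;

        status = register_a();
        if (status)
                goto bail;              /* the check the patch adds */

        status = register_b();
bail:
        if (status)
                unregister_all();       /* one unwind path for every failure */
        return status;
}

int main(void)
{
        return register_handlers();
}
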
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9477d6e1de37..9aed6e202201 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2278 dlm_print_one_lock_resource(res); 2278 dlm_print_one_lock_resource(res);
2279 BUG(); 2279 BUG();
2280 } 2280 }
2281 return ret; 2281 return ret ? ret : r;
2282} 2282}
2283 2283
2284int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 2284int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2345 res->lockname.len, res->lockname.name, node); 2345 res->lockname.len, res->lockname.name, node);
2346 dlm_print_one_lock_resource(res); 2346 dlm_print_one_lock_resource(res);
2347 } 2347 }
2348 ret = 0; 2348 ret = DLM_DEREF_RESPONSE_DONE;
2349 goto done; 2349 goto done;
2350 } 2350 }
2351 2351
@@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2365 spin_unlock(&dlm->work_lock); 2365 spin_unlock(&dlm->work_lock);
2366 2366
2367 queue_work(dlm->dlm_worker, &dlm->dispatched_work); 2367 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2368 return 0; 2368 return DLM_DEREF_RESPONSE_INPROG;
2369 2369
2370done: 2370done:
2371 if (res) 2371 if (res)
@@ -2375,6 +2375,122 @@ done:
2375 return ret; 2375 return ret;
2376} 2376}
2377 2377
2378int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
2379 void **ret_data)
2380{
2381 struct dlm_ctxt *dlm = data;
2382 struct dlm_deref_lockres_done *deref
2383 = (struct dlm_deref_lockres_done *)msg->buf;
2384 struct dlm_lock_resource *res = NULL;
2385 char *name;
2386 unsigned int namelen;
2387 int ret = -EINVAL;
2388 u8 node;
2389 unsigned int hash;
2390
2391 if (!dlm_grab(dlm))
2392 return 0;
2393
2394 name = deref->name;
2395 namelen = deref->namelen;
2396 node = deref->node_idx;
2397
2398 if (namelen > DLM_LOCKID_NAME_MAX) {
2399 mlog(ML_ERROR, "Invalid name length!");
2400 goto done;
2401 }
2402 if (deref->node_idx >= O2NM_MAX_NODES) {
2403 mlog(ML_ERROR, "Invalid node number: %u\n", node);
2404 goto done;
2405 }
2406
2407 hash = dlm_lockid_hash(name, namelen);
2408
2409 spin_lock(&dlm->spinlock);
2410 res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2411 if (!res) {
2412 spin_unlock(&dlm->spinlock);
2413 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2414 dlm->name, namelen, name);
2415 goto done;
2416 }
2417
2418 spin_lock(&res->spinlock);
2419 BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
2420 if (!list_empty(&res->purge)) {
2421 mlog(0, "%s: Removing res %.*s from purgelist\n",
2422 dlm->name, res->lockname.len, res->lockname.name);
2423 list_del_init(&res->purge);
2424 dlm_lockres_put(res);
2425 dlm->purge_count--;
2426 }
2427
2428 if (!__dlm_lockres_unused(res)) {
2429 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
2430 dlm->name, res->lockname.len, res->lockname.name);
2431 __dlm_print_one_lock_resource(res);
2432 BUG();
2433 }
2434
2435 __dlm_unhash_lockres(dlm, res);
2436
2437 spin_lock(&dlm->track_lock);
2438 if (!list_empty(&res->tracking))
2439 list_del_init(&res->tracking);
2440 else {
2441 mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
2442 dlm->name, res->lockname.len, res->lockname.name);
2443 __dlm_print_one_lock_resource(res);
2444 }
2445 spin_unlock(&dlm->track_lock);
2446
2447 /* lockres is not in the hash now. drop the flag and wake up
2448 * any processes waiting in dlm_get_lock_resource.
2449 */
2450 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2451 spin_unlock(&res->spinlock);
2452 wake_up(&res->wq);
2453
2454 dlm_lockres_put(res);
2455
2456 spin_unlock(&dlm->spinlock);
2457
2458done:
2459 dlm_put(dlm);
2460 return ret;
2461}
2462
2463static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
2464 struct dlm_lock_resource *res, u8 node)
2465{
2466 struct dlm_deref_lockres_done deref;
2467 int ret = 0, r;
2468 const char *lockname;
2469 unsigned int namelen;
2470
2471 lockname = res->lockname.name;
2472 namelen = res->lockname.len;
2473 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2474
2475 memset(&deref, 0, sizeof(deref));
2476 deref.node_idx = dlm->node_num;
2477 deref.namelen = namelen;
2478 memcpy(deref.name, lockname, namelen);
2479
2480 ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
2481 &deref, sizeof(deref), node, &r);
2482 if (ret < 0) {
2483 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
2484 " to node %u\n", dlm->name, namelen,
2485 lockname, ret, node);
2486 } else if (r < 0) {
2487 /* ignore the error */
2488 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2489 dlm->name, namelen, lockname, node, r);
2490 dlm_print_one_lock_resource(res);
2491 }
2492}
2493
2378static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) 2494static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2379{ 2495{
2380 struct dlm_ctxt *dlm; 2496 struct dlm_ctxt *dlm;
@@ -2395,6 +2511,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2395 } 2511 }
2396 spin_unlock(&res->spinlock); 2512 spin_unlock(&res->spinlock);
2397 2513
2514 dlm_drop_lockres_ref_done(dlm, res, node);
2515
2398 if (cleared) { 2516 if (cleared) {
2399 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", 2517 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
2400 dlm->name, res->lockname.len, res->lockname.name, node); 2518 dlm->name, res->lockname.len, res->lockname.name, node);
@@ -2432,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2432 return 0; 2550 return 0;
2433 2551
 2434 /* delay migration when the lockres is in RECOVERING state */ 2552 /* delay migration when the lockres is in RECOVERING state */
2435 if (res->state & DLM_LOCK_RES_RECOVERING) 2553 if (res->state & (DLM_LOCK_RES_RECOVERING|
2554 DLM_LOCK_RES_RECOVERY_WAITING))
2436 return 0; 2555 return 0;
2437 2556
2438 if (res->owner != dlm->node_num) 2557 if (res->owner != dlm->node_num)
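
dlm_drop_lockres_ref_done(), called from the master's deref worker above, packs a fixed-size DLM_DEREF_LOCKRES_DONE message carrying the sender's node number and the lock resource name, telling the node that dropped its ref that the master has cleared its refmap bit; that node's dlm_deref_lockres_done_handler then pulls the resource off its purge list and unhashes it. The compilable sketch below shows only the packing step; the field names mirror the patch, but NAME_MAX_LEN and the struct layout are illustrative, not the kernel's on-wire format.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define NAME_MAX_LEN 64 /* illustrative; the kernel uses O2NM_MAX_NAME_LEN */

struct deref_done_msg {
        uint8_t node_idx;               /* sender's node number */
        uint8_t namelen;
        char    name[NAME_MAX_LEN];
};

static void fill_deref_done(struct deref_done_msg *m, uint8_t node,
                            const char *lockname, size_t namelen)
{
        assert(namelen <= NAME_MAX_LEN);
        memset(m, 0, sizeof(*m));       /* zero the padding, as the patch does */
        m->node_idx = node;
        m->namelen = (uint8_t)namelen;
        memcpy(m->name, lockname, namelen);
}

int main(void)
{
        struct deref_done_msg m;

        fill_deref_done(&m, 2, "M0000000000000000abc", 20);
        assert(m.node_idx == 2 && m.namelen == 20);
        return 0;
}
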
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b94a425f0175..f6b313898763 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1403,12 +1403,24 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
1403 * and RECOVERY flag changed when it completes. */ 1403 * and RECOVERY flag changed when it completes. */
1404 hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); 1404 hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
1405 spin_lock(&dlm->spinlock); 1405 spin_lock(&dlm->spinlock);
1406 res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len, 1406 res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
1407 hash); 1407 hash);
1408 if (res) { 1408 if (res) {
1409 /* this will get a ref on res */ 1409 /* this will get a ref on res */
1410 /* mark it as recovering/migrating and hash it */ 1410 /* mark it as recovering/migrating and hash it */
1411 spin_lock(&res->spinlock); 1411 spin_lock(&res->spinlock);
1412 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
1413 mlog(0, "%s: node is attempting to migrate "
1414 "lockres %.*s, but marked as dropping "
1415 " ref!\n", dlm->name,
1416 mres->lockname_len, mres->lockname);
1417 ret = -EINVAL;
1418 spin_unlock(&res->spinlock);
1419 spin_unlock(&dlm->spinlock);
1420 dlm_lockres_put(res);
1421 goto leave;
1422 }
1423
1412 if (mres->flags & DLM_MRES_RECOVERY) { 1424 if (mres->flags & DLM_MRES_RECOVERY) {
1413 res->state |= DLM_LOCK_RES_RECOVERING; 1425 res->state |= DLM_LOCK_RES_RECOVERING;
1414 } else { 1426 } else {
@@ -2071,7 +2083,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
2071 dlm_lock_get(lock); 2083 dlm_lock_get(lock);
2072 if (lock->convert_pending) { 2084 if (lock->convert_pending) {
2073 /* move converting lock back to granted */ 2085 /* move converting lock back to granted */
2074 BUG_ON(i != DLM_CONVERTING_LIST);
2075 mlog(0, "node died with convert pending " 2086 mlog(0, "node died with convert pending "
2076 "on %.*s. move back to granted list.\n", 2087 "on %.*s. move back to granted list.\n",
2077 res->lockname.len, res->lockname.name); 2088 res->lockname.len, res->lockname.name);
@@ -2163,6 +2174,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2163 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 2174 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
2164 bucket = dlm_lockres_hash(dlm, i); 2175 bucket = dlm_lockres_hash(dlm, i);
2165 hlist_for_each_entry(res, bucket, hash_node) { 2176 hlist_for_each_entry(res, bucket, hash_node) {
2177 if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
2178 spin_lock(&res->spinlock);
2179 res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
2180 spin_unlock(&res->spinlock);
2181 wake_up(&res->wq);
2182 }
2183
2166 if (!(res->state & DLM_LOCK_RES_RECOVERING)) 2184 if (!(res->state & DLM_LOCK_RES_RECOVERING))
2167 continue; 2185 continue;
2168 2186
@@ -2300,6 +2318,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2300 res->lockname.len, res->lockname.name, freed, dead_node); 2318 res->lockname.len, res->lockname.name, freed, dead_node);
2301 __dlm_print_one_lock_resource(res); 2319 __dlm_print_one_lock_resource(res);
2302 } 2320 }
2321 res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
2303 dlm_lockres_clear_refmap_bit(dlm, res, dead_node); 2322 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2304 } else if (test_bit(dead_node, res->refmap)) { 2323 } else if (test_bit(dead_node, res->refmap)) {
2305 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2324 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2377,14 +2396,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2377 dlm_revalidate_lvb(dlm, res, dead_node); 2396 dlm_revalidate_lvb(dlm, res, dead_node);
2378 if (res->owner == dead_node) { 2397 if (res->owner == dead_node) {
2379 if (res->state & DLM_LOCK_RES_DROPPING_REF) { 2398 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2380 mlog(ML_NOTICE, "%s: res %.*s, Skip " 2399 mlog(0, "%s:%.*s: owned by "
2381 "recovery as it is being freed\n", 2400 "dead node %u, this node was "
2382 dlm->name, res->lockname.len, 2401 "dropping its ref when it died. "
2383 res->lockname.name); 2402 "continue, dropping the flag.\n",
2384 } else 2403 dlm->name, res->lockname.len,
2385 dlm_move_lockres_to_recovery_list(dlm, 2404 res->lockname.name, dead_node);
2386 res); 2405 }
2387 2406 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2407 dlm_move_lockres_to_recovery_list(dlm,
2408 res);
2388 } else if (res->owner == dlm->node_num) { 2409 } else if (res->owner == dlm->node_num) {
2389 dlm_free_dead_locks(dlm, res, dead_node); 2410 dlm_free_dead_locks(dlm, res, dead_node);
2390 __dlm_lockres_calc_usage(dlm, res); 2411 __dlm_lockres_calc_usage(dlm, res);
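
The recovery hunks introduce DLM_LOCK_RES_RECOVERY_WAITING: dlm_free_dead_locks() sets it when it clears a dead node's refmap bit, the flag then blocks purging, migration and the dirty-list worker exactly as DLM_LOCK_RES_RECOVERING does (see the dlmmaster.c hunk above and the dlmthread.c hunks below), and dlm_finish_local_lockres_recovery() clears it and wakes any waiters. A small C sketch of that flag discipline; the flag values are arbitrary here, not the kernel's.

#include <assert.h>
#include <stdbool.h>

#define RES_RECOVERING       0x01       /* illustrative flag values */
#define RES_RECOVERY_WAITING 0x02

/* Mirrors the checks added to __dlm_lockres_unused() and dlm_thread(). */
static bool res_blocked_by_recovery(unsigned int state)
{
        return state & (RES_RECOVERING | RES_RECOVERY_WAITING);
}

int main(void)
{
        unsigned int state = 0;

        state |= RES_RECOVERY_WAITING;          /* dlm_free_dead_locks() */
        assert(res_blocked_by_recovery(state)); /* purge/migrate deferred */

        state &= ~RES_RECOVERY_WAITING;         /* finish_local_lockres_recovery() */
        assert(!res_blocked_by_recovery(state));
        return 0;
}
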
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index c5f6c241ecd7..68d239ba0c63 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) 106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
107 return 0; 107 return 0;
108 108
109 if (res->state & DLM_LOCK_RES_RECOVERING) 109 if (res->state & (DLM_LOCK_RES_RECOVERING|
110 DLM_LOCK_RES_RECOVERY_WAITING))
110 return 0; 111 return 0;
111 112
112 /* Another node has this resource with this node as the master */ 113 /* Another node has this resource with this node as the master */
@@ -202,6 +203,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
202 dlm->purge_count--; 203 dlm->purge_count--;
203 } 204 }
204 205
206 if (!master && ret != 0) {
207 mlog(0, "%s: deref %.*s in progress or master goes down\n",
208 dlm->name, res->lockname.len, res->lockname.name);
209 spin_unlock(&res->spinlock);
210 return;
211 }
212
205 if (!__dlm_lockres_unused(res)) { 213 if (!__dlm_lockres_unused(res)) {
206 mlog(ML_ERROR, "%s: res %.*s in use after deref\n", 214 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
207 dlm->name, res->lockname.len, res->lockname.name); 215 dlm->name, res->lockname.len, res->lockname.name);
@@ -700,7 +708,8 @@ static int dlm_thread(void *data)
700 * dirty for a short while. */ 708 * dirty for a short while. */
701 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 709 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
702 if (res->state & (DLM_LOCK_RES_IN_PROGRESS | 710 if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
703 DLM_LOCK_RES_RECOVERING)) { 711 DLM_LOCK_RES_RECOVERING |
712 DLM_LOCK_RES_RECOVERY_WAITING)) {
704 /* move it to the tail and keep going */ 713 /* move it to the tail and keep going */
705 res->state &= ~DLM_LOCK_RES_DIRTY; 714 res->state &= ~DLM_LOCK_RES_DIRTY;
706 spin_unlock(&res->spinlock); 715 spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7cb38fdca229..c18ab45f8d21 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1381,44 +1381,6 @@ out:
1381 return ret; 1381 return ret;
1382} 1382}
1383 1383
1384/*
1385 * Will look for holes and unwritten extents in the range starting at
1386 * pos for count bytes (inclusive).
1387 */
1388static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1389 size_t count)
1390{
1391 int ret = 0;
1392 unsigned int extent_flags;
1393 u32 cpos, clusters, extent_len, phys_cpos;
1394 struct super_block *sb = inode->i_sb;
1395
1396 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1397 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1398
1399 while (clusters) {
1400 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1401 &extent_flags);
1402 if (ret < 0) {
1403 mlog_errno(ret);
1404 goto out;
1405 }
1406
1407 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1408 ret = 1;
1409 break;
1410 }
1411
1412 if (extent_len > clusters)
1413 extent_len = clusters;
1414
1415 clusters -= extent_len;
1416 cpos += extent_len;
1417 }
1418out:
1419 return ret;
1420}
1421
1422static int ocfs2_write_remove_suid(struct inode *inode) 1384static int ocfs2_write_remove_suid(struct inode *inode)
1423{ 1385{
1424 int ret; 1386 int ret;
@@ -2129,18 +2091,12 @@ out:
2129 2091
2130static int ocfs2_prepare_inode_for_write(struct file *file, 2092static int ocfs2_prepare_inode_for_write(struct file *file,
2131 loff_t pos, 2093 loff_t pos,
2132 size_t count, 2094 size_t count)
2133 int appending,
2134 int *direct_io,
2135 int *has_refcount)
2136{ 2095{
2137 int ret = 0, meta_level = 0; 2096 int ret = 0, meta_level = 0;
2138 struct dentry *dentry = file->f_path.dentry; 2097 struct dentry *dentry = file->f_path.dentry;
2139 struct inode *inode = d_inode(dentry); 2098 struct inode *inode = d_inode(dentry);
2140 loff_t end; 2099 loff_t end;
2141 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2142 int full_coherency = !(osb->s_mount_opt &
2143 OCFS2_MOUNT_COHERENCY_BUFFERED);
2144 2100
2145 /* 2101 /*
2146 * We start with a read level meta lock and only jump to an ex 2102 * We start with a read level meta lock and only jump to an ex
@@ -2189,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2189 pos, 2145 pos,
2190 count, 2146 count,
2191 &meta_level); 2147 &meta_level);
2192 if (has_refcount)
2193 *has_refcount = 1;
2194 if (direct_io)
2195 *direct_io = 0;
2196 } 2148 }
2197 2149
2198 if (ret < 0) { 2150 if (ret < 0) {
@@ -2200,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2200 goto out_unlock; 2152 goto out_unlock;
2201 } 2153 }
2202 2154
2203 /*
2204 * Skip the O_DIRECT checks if we don't need
2205 * them.
2206 */
2207 if (!direct_io || !(*direct_io))
2208 break;
2209
2210 /*
2211 * There's no sane way to do direct writes to an inode
2212 * with inline data.
2213 */
2214 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2215 *direct_io = 0;
2216 break;
2217 }
2218
2219 /*
2220 * Allowing concurrent direct writes means
2221 * i_size changes wouldn't be synchronized, so
2222 * one node could wind up truncating another
2223 * nodes writes.
2224 */
2225 if (end > i_size_read(inode) && !full_coherency) {
2226 *direct_io = 0;
2227 break;
2228 }
2229
2230 /*
2231 * Fallback to old way if the feature bit is not set.
2232 */
2233 if (end > i_size_read(inode) &&
2234 !ocfs2_supports_append_dio(osb)) {
2235 *direct_io = 0;
2236 break;
2237 }
2238
2239 /*
2240 * We don't fill holes during direct io, so
2241 * check for them here. If any are found, the
2242 * caller will have to retake some cluster
2243 * locks and initiate the io as buffered.
2244 */
2245 ret = ocfs2_check_range_for_holes(inode, pos, count);
2246 if (ret == 1) {
2247 /*
2248 * Fallback to old way if the feature bit is not set.
2249 * Otherwise try dio first and then complete the rest
2250 * request through buffer io.
2251 */
2252 if (!ocfs2_supports_append_dio(osb))
2253 *direct_io = 0;
2254 ret = 0;
2255 } else if (ret < 0)
2256 mlog_errno(ret);
2257 break; 2155 break;
2258 } 2156 }
2259 2157
2260out_unlock: 2158out_unlock:
2261 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2159 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2262 pos, appending, count, 2160 pos, count);
2263 direct_io, has_refcount);
2264 2161
2265 if (meta_level >= 0) 2162 if (meta_level >= 0)
2266 ocfs2_inode_unlock(inode, meta_level); 2163 ocfs2_inode_unlock(inode, meta_level);
@@ -2272,18 +2169,16 @@ out:
2272static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, 2169static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2273 struct iov_iter *from) 2170 struct iov_iter *from)
2274{ 2171{
2275 int direct_io, appending, rw_level; 2172 int direct_io, rw_level;
2276 int can_do_direct, has_refcount = 0;
2277 ssize_t written = 0; 2173 ssize_t written = 0;
2278 ssize_t ret; 2174 ssize_t ret;
2279 size_t count = iov_iter_count(from), orig_count; 2175 size_t count = iov_iter_count(from);
2280 struct file *file = iocb->ki_filp; 2176 struct file *file = iocb->ki_filp;
2281 struct inode *inode = file_inode(file); 2177 struct inode *inode = file_inode(file);
2282 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2283 int full_coherency = !(osb->s_mount_opt & 2179 int full_coherency = !(osb->s_mount_opt &
2284 OCFS2_MOUNT_COHERENCY_BUFFERED); 2180 OCFS2_MOUNT_COHERENCY_BUFFERED);
2285 int unaligned_dio = 0; 2181 void *saved_ki_complete = NULL;
2286 int dropped_dio = 0;
2287 int append_write = ((iocb->ki_pos + count) >= 2182 int append_write = ((iocb->ki_pos + count) >=
2288 i_size_read(inode) ? 1 : 0); 2183 i_size_read(inode) ? 1 : 0);
2289 2184
@@ -2296,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2296 if (count == 0) 2191 if (count == 0)
2297 return 0; 2192 return 0;
2298 2193
2299 appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
2300 direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; 2194 direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2301 2195
2302 inode_lock(inode); 2196 inode_lock(inode);
2303 2197
2304relock:
2305 /* 2198 /*
2306 * Concurrent O_DIRECT writes are allowed with 2199 * Concurrent O_DIRECT writes are allowed with
2307 * mount_option "coherency=buffered". 2200 * mount_option "coherency=buffered".
@@ -2334,7 +2227,6 @@ relock:
2334 ocfs2_inode_unlock(inode, 1); 2227 ocfs2_inode_unlock(inode, 1);
2335 } 2228 }
2336 2229
2337 orig_count = iov_iter_count(from);
2338 ret = generic_write_checks(iocb, from); 2230 ret = generic_write_checks(iocb, from);
2339 if (ret <= 0) { 2231 if (ret <= 0) {
2340 if (ret) 2232 if (ret)
@@ -2343,41 +2235,18 @@ relock:
2343 } 2235 }
2344 count = ret; 2236 count = ret;
2345 2237
2346 can_do_direct = direct_io; 2238 ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
2347 ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
2348 &can_do_direct, &has_refcount);
2349 if (ret < 0) { 2239 if (ret < 0) {
2350 mlog_errno(ret); 2240 mlog_errno(ret);
2351 goto out; 2241 goto out;
2352 } 2242 }
2353 2243
2354 if (direct_io && !is_sync_kiocb(iocb)) 2244 if (direct_io && !is_sync_kiocb(iocb) &&
2355 unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); 2245 ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
2356
2357 /*
2358 * We can't complete the direct I/O as requested, fall back to
2359 * buffered I/O.
2360 */
2361 if (direct_io && !can_do_direct) {
2362 ocfs2_rw_unlock(inode, rw_level);
2363
2364 rw_level = -1;
2365
2366 direct_io = 0;
2367 iocb->ki_flags &= ~IOCB_DIRECT;
2368 iov_iter_reexpand(from, orig_count);
2369 dropped_dio = 1;
2370 goto relock;
2371 }
2372
2373 if (unaligned_dio) {
2374 /* 2246 /*
2375 * Wait on previous unaligned aio to complete before 2247 * Make it a sync io if it's an unaligned aio.
2376 * proceeding.
2377 */ 2248 */
2378 mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); 2249 saved_ki_complete = xchg(&iocb->ki_complete, NULL);
2379 /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
2380 ocfs2_iocb_set_unaligned_aio(iocb);
2381 } 2250 }
2382 2251
2383 /* communicate with ocfs2_dio_end_io */ 2252 /* communicate with ocfs2_dio_end_io */
@@ -2398,14 +2267,13 @@ relock:
2398 */ 2267 */
2399 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2268 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2400 rw_level = -1; 2269 rw_level = -1;
2401 unaligned_dio = 0;
2402 } 2270 }
2403 2271
2404 if (unlikely(written <= 0)) 2272 if (unlikely(written <= 0))
2405 goto no_sync; 2273 goto out;
2406 2274
2407 if (((file->f_flags & O_DSYNC) && !direct_io) || 2275 if (((file->f_flags & O_DSYNC) && !direct_io) ||
2408 IS_SYNC(inode) || dropped_dio) { 2276 IS_SYNC(inode)) {
2409 ret = filemap_fdatawrite_range(file->f_mapping, 2277 ret = filemap_fdatawrite_range(file->f_mapping,
2410 iocb->ki_pos - written, 2278 iocb->ki_pos - written,
2411 iocb->ki_pos - 1); 2279 iocb->ki_pos - 1);
@@ -2424,13 +2292,10 @@ relock:
2424 iocb->ki_pos - 1); 2292 iocb->ki_pos - 1);
2425 } 2293 }
2426 2294
2427no_sync:
2428 if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
2429 ocfs2_iocb_clear_unaligned_aio(iocb);
2430 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
2431 }
2432
2433out: 2295out:
2296 if (saved_ki_complete)
2297 xchg(&iocb->ki_complete, saved_ki_complete);
2298
2434 if (rw_level != -1) 2299 if (rw_level != -1)
2435 ocfs2_rw_unlock(inode, rw_level); 2300 ocfs2_rw_unlock(inode, rw_level);
2436 2301
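
The ocfs2_file_write_iter() rewrite drops the whole buffered-I/O fallback dance (the relock label, iov_iter_reexpand(), the ip_unaligned_aio mutex) and instead turns an unaligned AIO into a synchronous one by swapping iocb->ki_complete out with xchg() and restoring it at out:, since an iocb without a completion callback is waited on. The same save/restore shape in portable userspace C, with GCC/Clang __atomic builtins standing in for the kernel's xchg():

#include <assert.h>
#include <stddef.h>

typedef void (*complete_fn)(int result);

static void my_complete(int result) { (void)result; }

struct fake_iocb { complete_fn ki_complete; };  /* trimmed stand-in */

int main(void)
{
        struct fake_iocb iocb = { my_complete };
        complete_fn saved;

        /* Clear the hook: with no completion callback the I/O is synchronous. */
        saved = __atomic_exchange_n(&iocb.ki_complete, (complete_fn)NULL,
                                    __ATOMIC_SEQ_CST);
        assert(iocb.ki_complete == NULL && saved == my_complete);

        /* ... issue the write and wait for it here ... */

        if (saved)      /* restore on the way out, as the patch does at out: */
                __atomic_exchange_n(&iocb.ki_complete, saved, __ATOMIC_SEQ_CST);
        assert(iocb.ki_complete == my_complete);
        return 0;
}
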
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
new file mode 100644
index 000000000000..2cabbcf2f28e
--- /dev/null
+++ b/fs/ocfs2/filecheck.c
@@ -0,0 +1,606 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * filecheck.c
5 *
6 * Code which implements online file check.
7 *
8 * Copyright (C) 2016 SuSE. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/list.h>
21#include <linux/spinlock.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/kmod.h>
25#include <linux/fs.h>
26#include <linux/kobject.h>
27#include <linux/sysfs.h>
28#include <linux/sysctl.h>
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32#include "ocfs2_fs.h"
33#include "stackglue.h"
34#include "inode.h"
35
36#include "filecheck.h"
37
38
39/* File check error strings,
 40 * must correspond with the error numbers in the header file.
41 */
42static const char * const ocfs2_filecheck_errs[] = {
43 "SUCCESS",
44 "FAILED",
45 "INPROGRESS",
46 "READONLY",
47 "INJBD",
48 "INVALIDINO",
49 "BLOCKECC",
50 "BLOCKNO",
51 "VALIDFLAG",
52 "GENERATION",
53 "UNSUPPORTED"
54};
55
56static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
57static LIST_HEAD(ocfs2_filecheck_sysfs_list);
58
59struct ocfs2_filecheck {
60 struct list_head fc_head; /* File check entry list head */
61 spinlock_t fc_lock;
 62 unsigned int fc_max; /* Maximum number of entries in the list */
63 unsigned int fc_size; /* Current entry count in list */
64 unsigned int fc_done; /* Finished entry count in list */
65};
66
67struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */
68 struct list_head fs_list;
69 atomic_t fs_count;
70 struct super_block *fs_sb;
71 struct kset *fs_devicekset;
72 struct kset *fs_fcheckkset;
73 struct ocfs2_filecheck *fs_fcheck;
74};
75
76#define OCFS2_FILECHECK_MAXSIZE 100
77#define OCFS2_FILECHECK_MINSIZE 10
78
79/* File check operation type */
80enum {
81 OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
82 OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
83 OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
84};
85
86struct ocfs2_filecheck_entry {
87 struct list_head fe_list;
88 unsigned long fe_ino;
89 unsigned int fe_type;
90 unsigned int fe_done:1;
91 unsigned int fe_status:31;
92};
93
94struct ocfs2_filecheck_args {
95 unsigned int fa_type;
96 union {
97 unsigned long fa_ino;
98 unsigned int fa_len;
99 };
100};
101
102static const char *
103ocfs2_filecheck_error(int errno)
104{
105 if (!errno)
106 return ocfs2_filecheck_errs[errno];
107
108 BUG_ON(errno < OCFS2_FILECHECK_ERR_START ||
109 errno > OCFS2_FILECHECK_ERR_END);
110 return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1];
111}
112
113static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
114 struct kobj_attribute *attr,
115 char *buf);
116static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
117 struct kobj_attribute *attr,
118 const char *buf, size_t count);
119static struct kobj_attribute ocfs2_attr_filecheck_chk =
120 __ATTR(check, S_IRUSR | S_IWUSR,
121 ocfs2_filecheck_show,
122 ocfs2_filecheck_store);
123static struct kobj_attribute ocfs2_attr_filecheck_fix =
124 __ATTR(fix, S_IRUSR | S_IWUSR,
125 ocfs2_filecheck_show,
126 ocfs2_filecheck_store);
127static struct kobj_attribute ocfs2_attr_filecheck_set =
128 __ATTR(set, S_IRUSR | S_IWUSR,
129 ocfs2_filecheck_show,
130 ocfs2_filecheck_store);
131
132static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
133{
134 schedule();
135 return 0;
136}
137
138static void
139ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
140{
141 struct ocfs2_filecheck_entry *p;
142
143 if (!atomic_dec_and_test(&entry->fs_count))
144 wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
145 TASK_UNINTERRUPTIBLE);
146
147 spin_lock(&entry->fs_fcheck->fc_lock);
148 while (!list_empty(&entry->fs_fcheck->fc_head)) {
149 p = list_first_entry(&entry->fs_fcheck->fc_head,
150 struct ocfs2_filecheck_entry, fe_list);
151 list_del(&p->fe_list);
 152 BUG_ON(!p->fe_done); /* Never free an undone file check entry */
153 kfree(p);
154 }
155 spin_unlock(&entry->fs_fcheck->fc_lock);
156
157 kset_unregister(entry->fs_fcheckkset);
158 kset_unregister(entry->fs_devicekset);
159 kfree(entry->fs_fcheck);
160 kfree(entry);
161}
162
163static void
164ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
165{
166 spin_lock(&ocfs2_filecheck_sysfs_lock);
167 list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
168 spin_unlock(&ocfs2_filecheck_sysfs_lock);
169}
170
171static int ocfs2_filecheck_sysfs_del(const char *devname)
172{
173 struct ocfs2_filecheck_sysfs_entry *p;
174
175 spin_lock(&ocfs2_filecheck_sysfs_lock);
176 list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
177 if (!strcmp(p->fs_sb->s_id, devname)) {
178 list_del(&p->fs_list);
179 spin_unlock(&ocfs2_filecheck_sysfs_lock);
180 ocfs2_filecheck_sysfs_free(p);
181 return 0;
182 }
183 }
184 spin_unlock(&ocfs2_filecheck_sysfs_lock);
185 return 1;
186}
187
188static void
189ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
190{
191 if (atomic_dec_and_test(&entry->fs_count))
192 wake_up_atomic_t(&entry->fs_count);
193}
194
195static struct ocfs2_filecheck_sysfs_entry *
196ocfs2_filecheck_sysfs_get(const char *devname)
197{
198 struct ocfs2_filecheck_sysfs_entry *p = NULL;
199
200 spin_lock(&ocfs2_filecheck_sysfs_lock);
201 list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
202 if (!strcmp(p->fs_sb->s_id, devname)) {
203 atomic_inc(&p->fs_count);
204 spin_unlock(&ocfs2_filecheck_sysfs_lock);
205 return p;
206 }
207 }
208 spin_unlock(&ocfs2_filecheck_sysfs_lock);
209 return NULL;
210}
211
212int ocfs2_filecheck_create_sysfs(struct super_block *sb)
213{
214 int ret = 0;
215 struct kset *device_kset = NULL;
216 struct kset *fcheck_kset = NULL;
217 struct ocfs2_filecheck *fcheck = NULL;
218 struct ocfs2_filecheck_sysfs_entry *entry = NULL;
219 struct attribute **attrs = NULL;
220 struct attribute_group attrgp;
221
222 if (!ocfs2_kset)
223 return -ENOMEM;
224
225 attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS);
226 if (!attrs) {
227 ret = -ENOMEM;
228 goto error;
229 } else {
230 attrs[0] = &ocfs2_attr_filecheck_chk.attr;
231 attrs[1] = &ocfs2_attr_filecheck_fix.attr;
232 attrs[2] = &ocfs2_attr_filecheck_set.attr;
233 attrs[3] = NULL;
234 memset(&attrgp, 0, sizeof(attrgp));
235 attrgp.attrs = attrs;
236 }
237
238 fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
239 if (!fcheck) {
240 ret = -ENOMEM;
241 goto error;
242 } else {
243 INIT_LIST_HEAD(&fcheck->fc_head);
244 spin_lock_init(&fcheck->fc_lock);
245 fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
246 fcheck->fc_size = 0;
247 fcheck->fc_done = 0;
248 }
249
250 if (strlen(sb->s_id) <= 0) {
251 mlog(ML_ERROR,
252 "Cannot get device basename when create filecheck sysfs\n");
253 ret = -ENODEV;
254 goto error;
255 }
256
257 device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj);
258 if (!device_kset) {
259 ret = -ENOMEM;
260 goto error;
261 }
262
263 fcheck_kset = kset_create_and_add("filecheck", NULL,
264 &device_kset->kobj);
265 if (!fcheck_kset) {
266 ret = -ENOMEM;
267 goto error;
268 }
269
270 ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp);
271 if (ret)
272 goto error;
273
274 entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS);
275 if (!entry) {
276 ret = -ENOMEM;
277 goto error;
278 } else {
279 atomic_set(&entry->fs_count, 1);
280 entry->fs_sb = sb;
281 entry->fs_devicekset = device_kset;
282 entry->fs_fcheckkset = fcheck_kset;
283 entry->fs_fcheck = fcheck;
284 ocfs2_filecheck_sysfs_add(entry);
285 }
286
287 kfree(attrs);
288 return 0;
289
290error:
291 kfree(attrs);
292 kfree(entry);
293 kfree(fcheck);
294 kset_unregister(fcheck_kset);
295 kset_unregister(device_kset);
296 return ret;
297}
298
299int ocfs2_filecheck_remove_sysfs(struct super_block *sb)
300{
301 return ocfs2_filecheck_sysfs_del(sb->s_id);
302}
303
304static int
305ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
306 unsigned int count);
307static int
308ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
309 unsigned int len)
310{
311 int ret;
312
313 if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE))
314 return -EINVAL;
315
316 spin_lock(&ent->fs_fcheck->fc_lock);
317 if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
318 mlog(ML_ERROR,
319 "Cannot set online file check maximum entry number "
320 "to %u due to too many pending entries(%u)\n",
321 len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
322 ret = -EBUSY;
323 } else {
324 if (len < ent->fs_fcheck->fc_size)
325 BUG_ON(!ocfs2_filecheck_erase_entries(ent,
326 ent->fs_fcheck->fc_size - len));
327
328 ent->fs_fcheck->fc_max = len;
329 ret = 0;
330 }
331 spin_unlock(&ent->fs_fcheck->fc_lock);
332
333 return ret;
334}
335
336#define OCFS2_FILECHECK_ARGS_LEN 24
337static int
338ocfs2_filecheck_args_get_long(const char *buf, size_t count,
339 unsigned long *val)
340{
341 char buffer[OCFS2_FILECHECK_ARGS_LEN];
342
343 memcpy(buffer, buf, count);
344 buffer[count] = '\0';
345
346 if (kstrtoul(buffer, 0, val))
347 return 1;
348
349 return 0;
350}
351
352static int
353ocfs2_filecheck_type_parse(const char *name, unsigned int *type)
354{
355 if (!strncmp(name, "fix", 4))
356 *type = OCFS2_FILECHECK_TYPE_FIX;
357 else if (!strncmp(name, "check", 6))
358 *type = OCFS2_FILECHECK_TYPE_CHK;
359 else if (!strncmp(name, "set", 4))
360 *type = OCFS2_FILECHECK_TYPE_SET;
361 else
362 return 1;
363
364 return 0;
365}
366
367static int
368ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count,
369 struct ocfs2_filecheck_args *args)
370{
371 unsigned long val = 0;
372 unsigned int type;
373
374 /* too short/long args length */
375 if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN))
376 return 1;
377
378 if (ocfs2_filecheck_type_parse(name, &type))
379 return 1;
380 if (ocfs2_filecheck_args_get_long(buf, count, &val))
381 return 1;
382
383 if (val <= 0)
384 return 1;
385
386 args->fa_type = type;
387 if (type == OCFS2_FILECHECK_TYPE_SET)
388 args->fa_len = (unsigned int)val;
389 else
390 args->fa_ino = val;
391
392 return 0;
393}
394
395static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
396 struct kobj_attribute *attr,
397 char *buf)
398{
399
400 ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
401 unsigned int type;
402 struct ocfs2_filecheck_entry *p;
403 struct ocfs2_filecheck_sysfs_entry *ent;
404
405 if (ocfs2_filecheck_type_parse(attr->attr.name, &type))
406 return -EINVAL;
407
408 ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
409 if (!ent) {
410 mlog(ML_ERROR,
411 "Cannot get the corresponding entry via device basename %s\n",
 412 kobj->parent->name);
413 return -ENODEV;
414 }
415
416 if (type == OCFS2_FILECHECK_TYPE_SET) {
417 spin_lock(&ent->fs_fcheck->fc_lock);
418 total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max);
419 spin_unlock(&ent->fs_fcheck->fc_lock);
420 goto exit;
421 }
422
423 ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n");
424 total += ret;
425 remain -= ret;
426 spin_lock(&ent->fs_fcheck->fc_lock);
427 list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
428 if (p->fe_type != type)
429 continue;
430
431 ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n",
432 p->fe_ino, p->fe_done,
433 ocfs2_filecheck_error(p->fe_status));
434 if (ret < 0) {
435 total = ret;
436 break;
437 }
438 if (ret == remain) {
439 /* snprintf() didn't fit */
440 total = -E2BIG;
441 break;
442 }
443 total += ret;
444 remain -= ret;
445 }
446 spin_unlock(&ent->fs_fcheck->fc_lock);
447
448exit:
449 ocfs2_filecheck_sysfs_put(ent);
450 return total;
451}
452
453static int
454ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
455{
456 struct ocfs2_filecheck_entry *p;
457
458 list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
459 if (p->fe_done) {
460 list_del(&p->fe_list);
461 kfree(p);
462 ent->fs_fcheck->fc_size--;
463 ent->fs_fcheck->fc_done--;
464 return 1;
465 }
466 }
467
468 return 0;
469}
470
471static int
472ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
473 unsigned int count)
474{
475 unsigned int i = 0;
476 unsigned int ret = 0;
477
478 while (i++ < count) {
479 if (ocfs2_filecheck_erase_entry(ent))
480 ret++;
481 else
482 break;
483 }
484
485 return (ret == count ? 1 : 0);
486}
487
488static void
489ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
490 struct ocfs2_filecheck_entry *entry)
491{
492 entry->fe_done = 1;
493 spin_lock(&ent->fs_fcheck->fc_lock);
494 ent->fs_fcheck->fc_done++;
495 spin_unlock(&ent->fs_fcheck->fc_lock);
496}
497
498static unsigned int
499ocfs2_filecheck_handle(struct super_block *sb,
500 unsigned long ino, unsigned int flags)
501{
502 unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS;
503 struct inode *inode = NULL;
504 int rc;
505
506 inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0);
507 if (IS_ERR(inode)) {
508 rc = (int)(-(long)inode);
509 if (rc >= OCFS2_FILECHECK_ERR_START &&
510 rc < OCFS2_FILECHECK_ERR_END)
511 ret = rc;
512 else
513 ret = OCFS2_FILECHECK_ERR_FAILED;
514 } else
515 iput(inode);
516
517 return ret;
518}
519
520static void
521ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
522 struct ocfs2_filecheck_entry *entry)
523{
524 if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK)
525 entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
526 entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
527 else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX)
528 entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
529 entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
530 else
531 entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
532
533 ocfs2_filecheck_done_entry(ent, entry);
534}
535
536static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
537 struct kobj_attribute *attr,
538 const char *buf, size_t count)
539{
540 struct ocfs2_filecheck_args args;
541 struct ocfs2_filecheck_entry *entry;
542 struct ocfs2_filecheck_sysfs_entry *ent;
543 ssize_t ret = 0;
544
545 if (count == 0)
546 return count;
547
548 if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) {
549 mlog(ML_ERROR, "Invalid arguments for online file check\n");
550 return -EINVAL;
551 }
552
553 ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
554 if (!ent) {
555 mlog(ML_ERROR,
556 "Cannot get the corresponding entry via device basename %s\n",
557 kobj->parent->name);
558 return -ENODEV;
559 }
560
561 if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
562 ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
563 goto exit;
564 }
565
566 entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS);
567 if (!entry) {
568 ret = -ENOMEM;
569 goto exit;
570 }
571
572 spin_lock(&ent->fs_fcheck->fc_lock);
573 if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
574 (ent->fs_fcheck->fc_done == 0)) {
575 mlog(ML_ERROR,
576 "Cannot do more file check "
577 "since file check queue(%u) is full now\n",
578 ent->fs_fcheck->fc_max);
579 ret = -EBUSY;
580 kfree(entry);
581 } else {
582 if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
583 (ent->fs_fcheck->fc_done > 0)) {
 584 /* Delete the oldest finished entry to make
 585 * sure the number of entries in the list does
 586 * not exceed the maximum value
587 */
588 BUG_ON(!ocfs2_filecheck_erase_entry(ent));
589 }
590
591 entry->fe_ino = args.fa_ino;
592 entry->fe_type = args.fa_type;
593 entry->fe_done = 0;
594 entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS;
595 list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head);
596 ent->fs_fcheck->fc_size++;
597 }
598 spin_unlock(&ent->fs_fcheck->fc_lock);
599
600 if (!ret)
601 ocfs2_filecheck_handle_entry(ent, entry);
602
603exit:
604 ocfs2_filecheck_sysfs_put(ent);
605 return (!ret ? count : ret);
606}
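
ocfs2_filecheck_store() keeps at most fc_max entries per mount: a full queue with nothing finished rejects the request with -EBUSY, otherwise the oldest finished entry is evicted to make room for the new one. The policy in isolation, using a plain array instead of the kernel's spinlock-protected list:

#include <assert.h>
#include <stdbool.h>

#define QMAX 4                  /* stands in for fc_max */

struct entry { unsigned long ino; bool done; bool used; };

static struct entry queue[QMAX];

/* 0 on success, -1 when every slot still holds an unfinished check (EBUSY). */
static int enqueue_check(unsigned long ino)
{
        int i, victim = -1;

        for (i = 0; i < QMAX; i++) {
                if (!queue[i].used) { victim = i; break; }
                if (queue[i].done && victim < 0)
                        victim = i;     /* first finished entry found */
        }
        if (victim < 0)
                return -1;

        queue[victim] = (struct entry){ .ino = ino, .used = true };
        return 0;
}

int main(void)
{
        for (unsigned long ino = 1; ino <= QMAX; ino++)
                assert(enqueue_check(ino) == 0);
        assert(enqueue_check(99) == -1);        /* full, nothing done yet */
        queue[0].done = true;
        assert(enqueue_check(99) == 0);         /* evicts the finished entry */
        return 0;
}
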
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
new file mode 100644
index 000000000000..e5cd002a2c09
--- /dev/null
+++ b/fs/ocfs2/filecheck.h
@@ -0,0 +1,49 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * filecheck.h
5 *
6 * Online file check.
7 *
8 * Copyright (C) 2016 SuSE. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20
21#ifndef FILECHECK_H
22#define FILECHECK_H
23
24#include <linux/types.h>
25#include <linux/list.h>
26
27
28/* File check errno */
29enum {
30 OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */
31 OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */
32 OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */
33 OCFS2_FILECHECK_ERR_READONLY, /* Read only */
34 OCFS2_FILECHECK_ERR_INJBD, /* Buffer in jbd */
35 OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */
36 OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */
37 OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */
38 OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */
39 OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */
40 OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */
41};
42
43#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
44#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
45
46int ocfs2_filecheck_create_sysfs(struct super_block *sb);
47int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
48
49#endif /* FILECHECK_H */
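
The private error numbers start at 1000 so they can travel through the kernel's negative-errno conventions without colliding with real errnos, and ocfs2_filecheck_error() in filecheck.c maps them back to the string table, with index 0 reserved for "SUCCESS". The same mapping, standalone and with the list shortened:

#include <assert.h>
#include <string.h>

enum {
        FC_ERR_SUCCESS = 0,
        FC_ERR_FAILED = 1000,   /* range start, as in filecheck.h */
        FC_ERR_INPROGRESS,
        FC_ERR_UNSUPPORTED,     /* range end (list shortened here) */
};

static const char * const fc_errs[] = {
        "SUCCESS", "FAILED", "INPROGRESS", "UNSUPPORTED",
};

static const char *fc_error(int err)
{
        if (!err)
                return fc_errs[0];
        assert(err >= FC_ERR_FAILED && err <= FC_ERR_UNSUPPORTED);
        return fc_errs[err - FC_ERR_FAILED + 1];        /* +1 skips SUCCESS */
}

int main(void)
{
        assert(strcmp(fc_error(0), "SUCCESS") == 0);
        assert(strcmp(fc_error(FC_ERR_INPROGRESS), "INPROGRESS") == 0);
        return 0;
}
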
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 36294446d960..12f4a9e9800f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
53#include "xattr.h" 53#include "xattr.h"
54#include "refcounttree.h" 54#include "refcounttree.h"
55#include "ocfs2_trace.h" 55#include "ocfs2_trace.h"
56#include "filecheck.h"
56 57
57#include "buffer_head_io.h" 58#include "buffer_head_io.h"
58 59
@@ -74,6 +75,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
74 struct inode *inode, 75 struct inode *inode,
75 struct buffer_head *fe_bh); 76 struct buffer_head *fe_bh);
76 77
78static int ocfs2_filecheck_read_inode_block_full(struct inode *inode,
79 struct buffer_head **bh,
80 int flags, int type);
81static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
82 struct buffer_head *bh);
83static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
84 struct buffer_head *bh);
85
77void ocfs2_set_inode_flags(struct inode *inode) 86void ocfs2_set_inode_flags(struct inode *inode)
78{ 87{
79 unsigned int flags = OCFS2_I(inode)->ip_attr; 88 unsigned int flags = OCFS2_I(inode)->ip_attr;
@@ -127,6 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, 136struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
128 int sysfile_type) 137 int sysfile_type)
129{ 138{
139 int rc = 0;
130 struct inode *inode = NULL; 140 struct inode *inode = NULL;
131 struct super_block *sb = osb->sb; 141 struct super_block *sb = osb->sb;
132 struct ocfs2_find_inode_args args; 142 struct ocfs2_find_inode_args args;
@@ -161,12 +171,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
161 } 171 }
162 trace_ocfs2_iget5_locked(inode->i_state); 172 trace_ocfs2_iget5_locked(inode->i_state);
163 if (inode->i_state & I_NEW) { 173 if (inode->i_state & I_NEW) {
164 ocfs2_read_locked_inode(inode, &args); 174 rc = ocfs2_read_locked_inode(inode, &args);
165 unlock_new_inode(inode); 175 unlock_new_inode(inode);
166 } 176 }
167 if (is_bad_inode(inode)) { 177 if (is_bad_inode(inode)) {
168 iput(inode); 178 iput(inode);
169 inode = ERR_PTR(-ESTALE); 179 if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
180 (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
181 /* Return OCFS2_FILECHECK_ERR_XXX related errno */
182 inode = ERR_PTR(rc);
183 else
184 inode = ERR_PTR(-ESTALE);
170 goto bail; 185 goto bail;
171 } 186 }
172 187
@@ -410,7 +425,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
410 struct ocfs2_super *osb; 425 struct ocfs2_super *osb;
411 struct ocfs2_dinode *fe; 426 struct ocfs2_dinode *fe;
412 struct buffer_head *bh = NULL; 427 struct buffer_head *bh = NULL;
413 int status, can_lock; 428 int status, can_lock, lock_level = 0;
414 u32 generation = 0; 429 u32 generation = 0;
415 430
416 status = -EINVAL; 431 status = -EINVAL;
@@ -478,7 +493,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
478 mlog_errno(status); 493 mlog_errno(status);
479 return status; 494 return status;
480 } 495 }
481 status = ocfs2_inode_lock(inode, NULL, 0); 496 status = ocfs2_inode_lock(inode, NULL, lock_level);
482 if (status) { 497 if (status) {
483 make_bad_inode(inode); 498 make_bad_inode(inode);
484 mlog_errno(status); 499 mlog_errno(status);
@@ -495,16 +510,32 @@ static int ocfs2_read_locked_inode(struct inode *inode,
495 } 510 }
496 511
497 if (can_lock) { 512 if (can_lock) {
498 status = ocfs2_read_inode_block_full(inode, &bh, 513 if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
499 OCFS2_BH_IGNORE_CACHE); 514 status = ocfs2_filecheck_read_inode_block_full(inode,
515 &bh, OCFS2_BH_IGNORE_CACHE, 0);
516 else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
517 status = ocfs2_filecheck_read_inode_block_full(inode,
518 &bh, OCFS2_BH_IGNORE_CACHE, 1);
519 else
520 status = ocfs2_read_inode_block_full(inode,
521 &bh, OCFS2_BH_IGNORE_CACHE);
500 } else { 522 } else {
501 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 523 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
502 /* 524 /*
503 * If buffer is in jbd, then its checksum may not have been 525 * If buffer is in jbd, then its checksum may not have been
504 * computed as yet. 526 * computed as yet.
505 */ 527 */
506 if (!status && !buffer_jbd(bh)) 528 if (!status && !buffer_jbd(bh)) {
507 status = ocfs2_validate_inode_block(osb->sb, bh); 529 if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
530 status = ocfs2_filecheck_validate_inode_block(
531 osb->sb, bh);
532 else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
533 status = ocfs2_filecheck_repair_inode_block(
534 osb->sb, bh);
535 else
536 status = ocfs2_validate_inode_block(
537 osb->sb, bh);
538 }
508 } 539 }
509 if (status < 0) { 540 if (status < 0) {
510 mlog_errno(status); 541 mlog_errno(status);
@@ -532,11 +563,24 @@ static int ocfs2_read_locked_inode(struct inode *inode,
532 563
533 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 564 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
534 565
566 if (buffer_dirty(bh) && !buffer_jbd(bh)) {
567 if (can_lock) {
568 ocfs2_inode_unlock(inode, lock_level);
569 lock_level = 1;
570 ocfs2_inode_lock(inode, NULL, lock_level);
571 }
572 status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
573 if (status < 0) {
574 mlog_errno(status);
575 goto bail;
576 }
577 }
578
535 status = 0; 579 status = 0;
536 580
537bail: 581bail:
538 if (can_lock) 582 if (can_lock)
539 ocfs2_inode_unlock(inode, 0); 583 ocfs2_inode_unlock(inode, lock_level);
540 584
541 if (status < 0) 585 if (status < 0)
542 make_bad_inode(inode); 586 make_bad_inode(inode);
@@ -1126,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode)
1126 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), 1170 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
1127 "Clear inode of %llu, inode has io markers\n", 1171 "Clear inode of %llu, inode has io markers\n",
1128 (unsigned long long)oi->ip_blkno); 1172 (unsigned long long)oi->ip_blkno);
1173 mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
1174 "Clear inode of %llu, inode has unwritten extents\n",
1175 (unsigned long long)oi->ip_blkno);
1129 1176
1130 ocfs2_extent_map_trunc(inode, 0); 1177 ocfs2_extent_map_trunc(inode, 0);
1131 1178
@@ -1397,6 +1444,169 @@ bail:
1397 return rc; 1444 return rc;
1398} 1445}
1399 1446
1447static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
1448 struct buffer_head *bh)
1449{
1450 int rc = 0;
1451 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1452
1453 trace_ocfs2_filecheck_validate_inode_block(
1454 (unsigned long long)bh->b_blocknr);
1455
1456 BUG_ON(!buffer_uptodate(bh));
1457
1458 /*
 1459 * Call ocfs2_validate_meta_ecc() first, since it can repair the ecc,
 1460 * but do not return an error immediately when ecc validation
 1461 * fails, because the likely cause is an invalid inode number
 1462 * being passed in.
1463 */
1464 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
1465 if (rc) {
1466 mlog(ML_ERROR,
1467 "Filecheck: checksum failed for dinode %llu\n",
1468 (unsigned long long)bh->b_blocknr);
1469 rc = -OCFS2_FILECHECK_ERR_BLOCKECC;
1470 }
1471
1472 if (!OCFS2_IS_VALID_DINODE(di)) {
1473 mlog(ML_ERROR,
1474 "Filecheck: invalid dinode #%llu: signature = %.*s\n",
1475 (unsigned long long)bh->b_blocknr, 7, di->i_signature);
1476 rc = -OCFS2_FILECHECK_ERR_INVALIDINO;
1477 goto bail;
1478 } else if (rc)
1479 goto bail;
1480
1481 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
1482 mlog(ML_ERROR,
1483 "Filecheck: invalid dinode #%llu: i_blkno is %llu\n",
1484 (unsigned long long)bh->b_blocknr,
1485 (unsigned long long)le64_to_cpu(di->i_blkno));
1486 rc = -OCFS2_FILECHECK_ERR_BLOCKNO;
1487 goto bail;
1488 }
1489
1490 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
1491 mlog(ML_ERROR,
1492 "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL "
1493 "not set\n",
1494 (unsigned long long)bh->b_blocknr);
1495 rc = -OCFS2_FILECHECK_ERR_VALIDFLAG;
1496 goto bail;
1497 }
1498
1499 if (le32_to_cpu(di->i_fs_generation) !=
1500 OCFS2_SB(sb)->fs_generation) {
1501 mlog(ML_ERROR,
1502 "Filecheck: invalid dinode #%llu: fs_generation is %u\n",
1503 (unsigned long long)bh->b_blocknr,
1504 le32_to_cpu(di->i_fs_generation));
1505 rc = -OCFS2_FILECHECK_ERR_GENERATION;
1506 goto bail;
1507 }
1508
1509bail:
1510 return rc;
1511}
1512
1513static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
1514 struct buffer_head *bh)
1515{
1516 int changed = 0;
1517 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1518
1519 if (!ocfs2_filecheck_validate_inode_block(sb, bh))
1520 return 0;
1521
1522 trace_ocfs2_filecheck_repair_inode_block(
1523 (unsigned long long)bh->b_blocknr);
1524
1525 if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) ||
1526 ocfs2_is_soft_readonly(OCFS2_SB(sb))) {
1527 mlog(ML_ERROR,
1528 "Filecheck: cannot repair dinode #%llu "
1529 "on readonly filesystem\n",
1530 (unsigned long long)bh->b_blocknr);
1531 return -OCFS2_FILECHECK_ERR_READONLY;
1532 }
1533
1534 if (buffer_jbd(bh)) {
1535 mlog(ML_ERROR,
1536 "Filecheck: cannot repair dinode #%llu, "
1537 "its buffer is in jbd\n",
1538 (unsigned long long)bh->b_blocknr);
1539 return -OCFS2_FILECHECK_ERR_INJBD;
1540 }
1541
1542 if (!OCFS2_IS_VALID_DINODE(di)) {
1543 /* Cannot fix invalid inode block */
1544 return -OCFS2_FILECHECK_ERR_INVALIDINO;
1545 }
1546
1547 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
1548 /* Cannot just add VALID_FL flag back as a fix,
1549 * need more things to check here.
1550 */
1551 return -OCFS2_FILECHECK_ERR_VALIDFLAG;
1552 }
1553
1554 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
1555 di->i_blkno = cpu_to_le64(bh->b_blocknr);
1556 changed = 1;
1557 mlog(ML_ERROR,
1558 "Filecheck: reset dinode #%llu: i_blkno to %llu\n",
1559 (unsigned long long)bh->b_blocknr,
1560 (unsigned long long)le64_to_cpu(di->i_blkno));
1561 }
1562
1563 if (le32_to_cpu(di->i_fs_generation) !=
1564 OCFS2_SB(sb)->fs_generation) {
1565 di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1566 changed = 1;
1567 mlog(ML_ERROR,
1568 "Filecheck: reset dinode #%llu: fs_generation to %u\n",
1569 (unsigned long long)bh->b_blocknr,
1570 le32_to_cpu(di->i_fs_generation));
1571 }
1572
1573 if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) {
1574 ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
1575 mark_buffer_dirty(bh);
1576 mlog(ML_ERROR,
1577 "Filecheck: reset dinode #%llu: compute meta ecc\n",
1578 (unsigned long long)bh->b_blocknr);
1579 }
1580
1581 return 0;
1582}
1583
1584static int
1585ocfs2_filecheck_read_inode_block_full(struct inode *inode,
1586 struct buffer_head **bh,
1587 int flags, int type)
1588{
1589 int rc;
1590 struct buffer_head *tmp = *bh;
1591
1592 if (!type) /* Check inode block */
1593 rc = ocfs2_read_blocks(INODE_CACHE(inode),
1594 OCFS2_I(inode)->ip_blkno,
1595 1, &tmp, flags,
1596 ocfs2_filecheck_validate_inode_block);
1597 else /* Repair inode block */
1598 rc = ocfs2_read_blocks(INODE_CACHE(inode),
1599 OCFS2_I(inode)->ip_blkno,
1600 1, &tmp, flags,
1601 ocfs2_filecheck_repair_inode_block);
1602
1603 /* If ocfs2_read_blocks() got us a new bh, pass it up. */
1604 if (!rc && !*bh)
1605 *bh = tmp;
1606
1607 return rc;
1608}
1609
1400int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, 1610int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
1401 int flags) 1611 int flags)
1402{ 1612{
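
ocfs2_filecheck_repair_inode_block() only rewrites what it can prove from context: a wrong i_blkno is reset from the buffer's block number, a stale fs_generation from the superblock, and the metadata ECC is recomputed when anything changed; an invalid signature or a cleared OCFS2_VALID_FL is refused rather than guessed at. A sketch of that decide-then-rewrite flow over a trimmed stand-in dinode:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct dinode {                 /* trimmed stand-in for struct ocfs2_dinode */
        bool valid_sig, valid_fl;
        uint64_t blkno;
        uint32_t gen;
};

/* 0 if clean or repaired, -1 if the block is unfixable. */
static int repair_dinode(struct dinode *di, uint64_t real_blkno,
                         uint32_t fs_gen, bool *changed)
{
        *changed = false;
        if (!di->valid_sig || !di->valid_fl)
                return -1;                      /* refuse to guess */
        if (di->blkno != real_blkno) {
                di->blkno = real_blkno;         /* provable from location */
                *changed = true;
        }
        if (di->gen != fs_gen) {
                di->gen = fs_gen;               /* provable from superblock */
                *changed = true;
        }
        /* the caller recomputes the metadata ECC when *changed is set */
        return 0;
}

int main(void)
{
        struct dinode di = { true, true, 5, 7 };
        bool changed;

        assert(repair_dinode(&di, 9, 7, &changed) == 0 && changed);
        assert(di.blkno == 9);
        di.valid_fl = false;
        assert(repair_dinode(&di, 9, 7, &changed) == -1);
        return 0;
}
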
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index aac8b86f312e..d8f3fc8d2551 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,9 +43,6 @@ struct ocfs2_inode_info
43 /* protects extended attribute changes on this inode */ 43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem; 44 struct rw_semaphore ip_xattr_sem;
45 45
46 /* Number of outstanding AIO's which are not page aligned */
47 struct mutex ip_unaligned_aio;
48
49 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
50 spinlock_t ip_lock; 47 spinlock_t ip_lock;
51 u32 ip_open_count; 48 u32 ip_open_count;
@@ -57,6 +54,9 @@ struct ocfs2_inode_info
57 u32 ip_flags; /* see below */ 54 u32 ip_flags; /* see below */
58 u32 ip_attr; /* inode attributes */ 55 u32 ip_attr; /* inode attributes */
59 56
57 /* Record unwritten extents during direct io. */
58 struct list_head ip_unwritten_list;
59
60 /* protected by recovery_lock. */ 60 /* protected by recovery_lock. */
61 struct inode *ip_next_orphan; 61 struct inode *ip_next_orphan;
62 62
@@ -139,6 +139,9 @@ int ocfs2_drop_inode(struct inode *inode);
139/* Flags for ocfs2_iget() */ 139/* Flags for ocfs2_iget() */
140#define OCFS2_FI_FLAG_SYSFILE 0x1 140#define OCFS2_FI_FLAG_SYSFILE 0x1
141#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 141#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
142#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4
143#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8
144
142struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); 145struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
143struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, 146struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
144 int sysfile_type); 147 int sysfile_type);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 61b833b721d8..e607419cdfa4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
231 /* At this point, we know that no more recovery threads can be 231 /* At this point, we know that no more recovery threads can be
232 * launched, so wait for any recovery completion work to 232 * launched, so wait for any recovery completion work to
233 * complete. */ 233 * complete. */
234 flush_workqueue(ocfs2_wq); 234 flush_workqueue(osb->ocfs2_wq);
235 235
236 /* 236 /*
237 * Now that recovery is shut down, and the osb is about to be 237 * Now that recovery is shut down, and the osb is about to be
@@ -1326,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1326 1326
1327 spin_lock(&journal->j_lock); 1327 spin_lock(&journal->j_lock);
1328 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1328 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
1329 queue_work(ocfs2_wq, &journal->j_recovery_work); 1329 queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
1330 spin_unlock(&journal->j_lock); 1330 spin_unlock(&journal->j_lock);
1331} 1331}
1332 1332
@@ -1968,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
1968 mutex_lock(&os->os_lock); 1968 mutex_lock(&os->os_lock);
1969 ocfs2_queue_orphan_scan(osb); 1969 ocfs2_queue_orphan_scan(osb);
1970 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) 1970 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
1971 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, 1971 queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
1972 ocfs2_orphan_scan_timeout()); 1972 ocfs2_orphan_scan_timeout());
1973 mutex_unlock(&os->os_lock); 1973 mutex_unlock(&os->os_lock);
1974} 1974}
@@ -2008,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
2008 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); 2008 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
2009 else { 2009 else {
2010 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); 2010 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
2011 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, 2011 queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
2012 ocfs2_orphan_scan_timeout()); 2012 ocfs2_orphan_scan_timeout());
2013 } 2013 }
2014} 2014}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 7d62c43a2c3e..fe0d1f9571bb 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -386,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
386 struct ocfs2_dinode *alloc = NULL; 386 struct ocfs2_dinode *alloc = NULL;
387 387
388 cancel_delayed_work(&osb->la_enable_wq); 388 cancel_delayed_work(&osb->la_enable_wq);
389 flush_workqueue(ocfs2_wq); 389 flush_workqueue(osb->ocfs2_wq);
390 390
391 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 391 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
392 goto out; 392 goto out;
@@ -1085,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
1085 } else { 1085 } else {
1086 osb->local_alloc_state = OCFS2_LA_DISABLED; 1086 osb->local_alloc_state = OCFS2_LA_DISABLED;
1087 } 1087 }
1088 queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, 1088 queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
1089 OCFS2_LA_ENABLE_INTERVAL); 1089 OCFS2_LA_ENABLE_INTERVAL);
1090 goto out_unlock; 1090 goto out_unlock;
1091 } 1091 }
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 77ebc2bc1cca..9ea081f4e6e4 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
104 if (page->index == last_index) 104 if (page->index == last_index)
105 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; 105 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
106 106
107 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, 107 ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
108 &fsdata, di_bh, page); 108 &locked_page, &fsdata, di_bh, page);
109 if (ret) { 109 if (ret) {
110 if (ret != -ENOSPC) 110 if (ret != -ENOSPC)
111 mlog_errno(ret); 111 mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7a0126267847..6cf6538a0651 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -464,6 +464,14 @@ struct ocfs2_super
464 struct ocfs2_refcount_tree *osb_ref_tree_lru; 464 struct ocfs2_refcount_tree *osb_ref_tree_lru;
465 465
466 struct mutex system_file_mutex; 466 struct mutex system_file_mutex;
467
468 /*
469 * OCFS2 needs to schedule several different types of work which
470 * require cluster locking, disk I/O, recovery waits, etc. Since these
471 * types of work tend to be heavy we avoid using the kernel events
472 * workqueue and schedule on our own.
473 */
474 struct workqueue_struct *ocfs2_wq;
467}; 475};
468 476
469#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 477#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
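The hunk above moves the workqueue out of a module-wide global and into struct ocfs2_super, so each mount owns its queue and unmount can flush only its own pending work. A minimal sketch of the resulting call pattern (illustrative only; the patch open-codes queue_work() at every call site rather than adding a helper):

    /* Hypothetical helper, not part of the patch: all deferred work
     * for a mount now funnels through its per-superblock queue. */
    static void ocfs2_queue_osb_work(struct ocfs2_super *osb,
                                     struct work_struct *work)
    {
            queue_work(osb->ocfs2_wq, work);
    }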
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 6cb019b7c6a8..f8f5fc5e6c05 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
1450 1450
1451TRACE_EVENT(ocfs2_prepare_inode_for_write, 1451TRACE_EVENT(ocfs2_prepare_inode_for_write,
1452 TP_PROTO(unsigned long long ino, unsigned long long saved_pos, 1452 TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
1453 int appending, unsigned long count, 1453 unsigned long count),
1454 int *direct_io, int *has_refcount), 1454 TP_ARGS(ino, saved_pos, count),
1455 TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
1456 TP_STRUCT__entry( 1455 TP_STRUCT__entry(
1457 __field(unsigned long long, ino) 1456 __field(unsigned long long, ino)
1458 __field(unsigned long long, saved_pos) 1457 __field(unsigned long long, saved_pos)
1459 __field(int, appending)
1460 __field(unsigned long, count) 1458 __field(unsigned long, count)
1461 __field(int, direct_io)
1462 __field(int, has_refcount)
1463 ), 1459 ),
1464 TP_fast_assign( 1460 TP_fast_assign(
1465 __entry->ino = ino; 1461 __entry->ino = ino;
1466 __entry->saved_pos = saved_pos; 1462 __entry->saved_pos = saved_pos;
1467 __entry->appending = appending;
1468 __entry->count = count; 1463 __entry->count = count;
1469 __entry->direct_io = direct_io ? *direct_io : -1;
1470 __entry->has_refcount = has_refcount ? *has_refcount : -1;
1471 ), 1464 ),
1472 TP_printk("%llu %llu %d %lu %d %d", __entry->ino, 1465 TP_printk("%llu %llu %lu", __entry->ino,
1473 __entry->saved_pos, __entry->appending, __entry->count, 1466 __entry->saved_pos, __entry->count)
1474 __entry->direct_io, __entry->has_refcount)
1475); 1467);
1476 1468
1477DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); 1469DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
@@ -1540,6 +1532,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
1540DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); 1532DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
1541 1533
1542DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); 1534DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
1535DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block);
1536DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block);
1543 1537
1544TRACE_EVENT(ocfs2_inode_is_valid_to_delete, 1538TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
1545 TP_PROTO(void *task, void *dc_task, unsigned long long ino, 1539 TP_PROTO(void *task, void *dc_task, unsigned long long ino,
@@ -2035,6 +2029,8 @@ DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
2035 2029
2036DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot); 2030DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
2037 2031
2032DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_get_next_id);
2033
2038DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty); 2034DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
2039 2035
2040/* End of trace events for fs/ocfs2/quota_global.c. */ 2036/* End of trace events for fs/ocfs2/quota_global.c. */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 9c9dd30bc945..3892f3c079ca 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
726 dqgrab(dquot); 726 dqgrab(dquot);
727 /* First entry on list -> queue work */ 727 /* First entry on list -> queue work */
728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) 728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
729 queue_work(ocfs2_wq, &osb->dquot_drop_work); 729 queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
730 goto out; 730 goto out;
731 } 731 }
732 status = ocfs2_lock_global_qf(oinfo, 1); 732 status = ocfs2_lock_global_qf(oinfo, 1);
@@ -860,6 +860,30 @@ out:
860 return status; 860 return status;
861} 861}
862 862
863static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid)
864{
865 int type = qid->type;
866 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
867 int status = 0;
868
869 trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type);
870 status = ocfs2_lock_global_qf(info, 0);
871 if (status < 0)
872 goto out;
873 status = ocfs2_qinfo_lock(info, 0);
874 if (status < 0)
875 goto out_global;
876 status = qtree_get_next_id(&info->dqi_gi, qid);
877 ocfs2_qinfo_unlock(info, 0);
878out_global:
879 ocfs2_unlock_global_qf(info, 0);
880out:
881 /* Avoid logging ENOENT since it just means there is no next ID */
882 if (status && status != -ENOENT)
883 mlog_errno(status);
884 return status;
885}
886
863static int ocfs2_mark_dquot_dirty(struct dquot *dquot) 887static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
864{ 888{
865 unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | 889 unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
@@ -968,4 +992,5 @@ const struct dquot_operations ocfs2_quota_operations = {
968 .write_info = ocfs2_write_info, 992 .write_info = ocfs2_write_info,
969 .alloc_dquot = ocfs2_alloc_dquot, 993 .alloc_dquot = ocfs2_alloc_dquot,
970 .destroy_dquot = ocfs2_destroy_dquot, 994 .destroy_dquot = ocfs2_destroy_dquot,
995 .get_next_id = ocfs2_get_next_id,
971}; 996};
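With ->get_next_id wired into ocfs2_quota_operations, the generic quota layer can now answer Q_GETNEXTQUOTA on ocfs2. A hedged userspace sketch of what that enables (the helper and its error handling are illustrative, not part of the patch):

    #include <stdio.h>
    #include <sys/quota.h>
    #include <linux/quota.h>

    /* Walk the allocated user-quota IDs on an ocfs2 volume; each
     * query is routed through ocfs2_get_next_id() above. */
    static void walk_user_quotas(const char *blkdev)
    {
            struct if_nextdqblk nd;
            unsigned int id = 0;

            while (quotactl(QCMD(Q_GETNEXTQUOTA, USRQUOTA), blkdev,
                            id, (void *)&nd) == 0) {
                    printf("quota set for id %u\n", nd.dqb_id);
                    id = nd.dqb_id + 1;
            }
            /* loop terminates with ENOENT once no higher ID exists */
    }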
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 576b9a04873f..18451e0fab81 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -196,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
196 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { 196 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
197 blkno = ocfs2_backup_super_blkno(inode->i_sb, i); 197 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
198 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); 198 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
199 if (cluster > clusters) 199 if (cluster >= clusters)
200 break; 200 break;
201 201
202 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); 202 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
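The relaxed comparison is an off-by-one fix: clusters is the new total cluster count, so valid cluster indices run 0 through clusters - 1, and a backup superblock whose cluster index equals clusters lies past the end of the resized volume. A worked instance with made-up numbers:

    /* Illustrative values only: a new size of 1000 clusters means
     * valid indices 0..999, so a backup super at cluster 1000 is
     * out of range and the loop must stop before touching it. */
    u32 clusters = 1000;
    u32 cluster = 1000;

    if (cluster >= clusters)        /* the old ">" test missed this case */
            break;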
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 5d965e83bd43..13219ed73e1d 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = {
629 .attrs = ocfs2_attrs, 629 .attrs = ocfs2_attrs,
630}; 630};
631 631
632static struct kset *ocfs2_kset; 632struct kset *ocfs2_kset;
633EXPORT_SYMBOL_GPL(ocfs2_kset);
633 634
634static void ocfs2_sysfs_exit(void) 635static void ocfs2_sysfs_exit(void)
635{ 636{
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 66334a30cea8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
298int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); 298int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
299void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); 299void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
300 300
301extern struct kset *ocfs2_kset;
302
301#endif /* STACKGLUE_H */ 303#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index faa1365097bc..7db631e1c8b0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -74,17 +74,12 @@
74#include "suballoc.h" 74#include "suballoc.h"
75 75
76#include "buffer_head_io.h" 76#include "buffer_head_io.h"
77#include "filecheck.h"
77 78
78static struct kmem_cache *ocfs2_inode_cachep; 79static struct kmem_cache *ocfs2_inode_cachep;
79struct kmem_cache *ocfs2_dquot_cachep; 80struct kmem_cache *ocfs2_dquot_cachep;
80struct kmem_cache *ocfs2_qf_chunk_cachep; 81struct kmem_cache *ocfs2_qf_chunk_cachep;
81 82
82/* OCFS2 needs to schedule several different types of work which
83 * require cluster locking, disk I/O, recovery waits, etc. Since these
84 * types of work tend to be heavy we avoid using the kernel events
85 * workqueue and schedule on our own. */
86struct workqueue_struct *ocfs2_wq = NULL;
87
88static struct dentry *ocfs2_debugfs_root; 83static struct dentry *ocfs2_debugfs_root;
89 84
90MODULE_AUTHOR("Oracle"); 85MODULE_AUTHOR("Oracle");
@@ -236,6 +231,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
236 struct ocfs2_recovery_map *rm = osb->recovery_map; 231 struct ocfs2_recovery_map *rm = osb->recovery_map;
237 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; 232 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
238 int i, out = 0; 233 int i, out = 0;
234 unsigned long flags;
239 235
240 out += snprintf(buf + out, len - out, 236 out += snprintf(buf + out, len - out,
241 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", 237 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -271,14 +267,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
271 cconn->cc_version.pv_minor); 267 cconn->cc_version.pv_minor);
272 } 268 }
273 269
274 spin_lock(&osb->dc_task_lock); 270 spin_lock_irqsave(&osb->dc_task_lock, flags);
275 out += snprintf(buf + out, len - out, 271 out += snprintf(buf + out, len - out,
276 "%10s => Pid: %d Count: %lu WakeSeq: %lu " 272 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
277 "WorkSeq: %lu\n", "DownCnvt", 273 "WorkSeq: %lu\n", "DownCnvt",
278 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), 274 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
279 osb->blocked_lock_count, osb->dc_wake_sequence, 275 osb->blocked_lock_count, osb->dc_wake_sequence,
280 osb->dc_work_sequence); 276 osb->dc_work_sequence);
281 spin_unlock(&osb->dc_task_lock); 277 spin_unlock_irqrestore(&osb->dc_task_lock, flags);
282 278
283 spin_lock(&osb->osb_lock); 279 spin_lock(&osb->osb_lock);
284 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", 280 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
@@ -1204,6 +1200,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1204 /* Start this when the mount is almost sure of being successful */ 1200 /* Start this when the mount is almost sure of being successful */
1205 ocfs2_orphan_scan_start(osb); 1201 ocfs2_orphan_scan_start(osb);
1206 1202
1203 /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
1204 ocfs2_filecheck_create_sysfs(sb);
1205
1207 return status; 1206 return status;
1208 1207
1209read_super_error: 1208read_super_error:
@@ -1608,33 +1607,25 @@ static int __init ocfs2_init(void)
1608 if (status < 0) 1607 if (status < 0)
1609 goto out2; 1608 goto out2;
1610 1609
1611 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
1612 if (!ocfs2_wq) {
1613 status = -ENOMEM;
1614 goto out3;
1615 }
1616
1617 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1610 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1618 if (!ocfs2_debugfs_root) { 1611 if (!ocfs2_debugfs_root) {
1619 status = -ENOMEM; 1612 status = -ENOMEM;
1620 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1613 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1621 goto out4; 1614 goto out3;
1622 } 1615 }
1623 1616
1624 ocfs2_set_locking_protocol(); 1617 ocfs2_set_locking_protocol();
1625 1618
1626 status = register_quota_format(&ocfs2_quota_format); 1619 status = register_quota_format(&ocfs2_quota_format);
1627 if (status < 0) 1620 if (status < 0)
1628 goto out4; 1621 goto out3;
1629 status = register_filesystem(&ocfs2_fs_type); 1622 status = register_filesystem(&ocfs2_fs_type);
1630 if (!status) 1623 if (!status)
1631 return 0; 1624 return 0;
1632 1625
1633 unregister_quota_format(&ocfs2_quota_format); 1626 unregister_quota_format(&ocfs2_quota_format);
1634out4:
1635 destroy_workqueue(ocfs2_wq);
1636 debugfs_remove(ocfs2_debugfs_root);
1637out3: 1627out3:
1628 debugfs_remove(ocfs2_debugfs_root);
1638 ocfs2_free_mem_caches(); 1629 ocfs2_free_mem_caches();
1639out2: 1630out2:
1640 exit_ocfs2_uptodate_cache(); 1631 exit_ocfs2_uptodate_cache();
@@ -1645,11 +1636,6 @@ out1:
1645 1636
1646static void __exit ocfs2_exit(void) 1637static void __exit ocfs2_exit(void)
1647{ 1638{
1648 if (ocfs2_wq) {
1649 flush_workqueue(ocfs2_wq);
1650 destroy_workqueue(ocfs2_wq);
1651 }
1652
1653 unregister_quota_format(&ocfs2_quota_format); 1639 unregister_quota_format(&ocfs2_quota_format);
1654 1640
1655 debugfs_remove(ocfs2_debugfs_root); 1641 debugfs_remove(ocfs2_debugfs_root);
@@ -1667,6 +1653,7 @@ static void ocfs2_put_super(struct super_block *sb)
1667 1653
1668 ocfs2_sync_blockdev(sb); 1654 ocfs2_sync_blockdev(sb);
1669 ocfs2_dismount_volume(sb, 0); 1655 ocfs2_dismount_volume(sb, 0);
1656 ocfs2_filecheck_remove_sysfs(sb);
1670} 1657}
1671 1658
1672static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) 1659static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1739,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data)
1739 spin_lock_init(&oi->ip_lock); 1726 spin_lock_init(&oi->ip_lock);
1740 ocfs2_extent_map_init(&oi->vfs_inode); 1727 ocfs2_extent_map_init(&oi->vfs_inode);
1741 INIT_LIST_HEAD(&oi->ip_io_markers); 1728 INIT_LIST_HEAD(&oi->ip_io_markers);
1729 INIT_LIST_HEAD(&oi->ip_unwritten_list);
1742 oi->ip_dir_start_lookup = 0; 1730 oi->ip_dir_start_lookup = 0;
1743 mutex_init(&oi->ip_unaligned_aio);
1744 init_rwsem(&oi->ip_alloc_sem); 1731 init_rwsem(&oi->ip_alloc_sem);
1745 init_rwsem(&oi->ip_xattr_sem); 1732 init_rwsem(&oi->ip_xattr_sem);
1746 mutex_init(&oi->ip_io_mutex); 1733 mutex_init(&oi->ip_io_mutex);
@@ -2343,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
2343 } 2330 }
2344 cleancache_init_shared_fs(sb); 2331 cleancache_init_shared_fs(sb);
2345 2332
2333 osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
2334 if (!osb->ocfs2_wq) {
2335 status = -ENOMEM;
2336 mlog_errno(status);
2337 }
2338
2346bail: 2339bail:
2347 return status; 2340 return status;
2348} 2341}
@@ -2530,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2530{ 2523{
2531 /* This function assumes that the caller has the main osb resource */ 2524 /* This function assumes that the caller has the main osb resource */
2532 2525
2526 /* ocfs2_initialize_super() has already created this workqueue */
2527 if (osb->ocfs2_wq) {
2528 flush_workqueue(osb->ocfs2_wq);
2529 destroy_workqueue(osb->ocfs2_wq);
2530 }
2531
2533 ocfs2_free_slot_info(osb); 2532 ocfs2_free_slot_info(osb);
2534 2533
2535 kfree(osb->osb_orphan_wipes); 2534 kfree(osb->osb_orphan_wipes);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b477d0b1c7b6..b023e4f3d740 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,8 +26,6 @@
26#ifndef OCFS2_SUPER_H 26#ifndef OCFS2_SUPER_H
27#define OCFS2_SUPER_H 27#define OCFS2_SUPER_H
28 28
29extern struct workqueue_struct *ocfs2_wq;
30
31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, 29int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num); 30 int node_num);
33 31
diff --git a/fs/open.c b/fs/open.c
index 55bdc75e2172..17cb6b1dab75 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -992,14 +992,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
992EXPORT_SYMBOL(filp_open); 992EXPORT_SYMBOL(filp_open);
993 993
994struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, 994struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
995 const char *filename, int flags) 995 const char *filename, int flags, umode_t mode)
996{ 996{
997 struct open_flags op; 997 struct open_flags op;
998 int err = build_open_flags(flags, 0, &op); 998 int err = build_open_flags(flags, mode, &op);
999 if (err) 999 if (err)
1000 return ERR_PTR(err); 1000 return ERR_PTR(err);
1001 if (flags & O_CREAT)
1002 return ERR_PTR(-EINVAL);
1003 return do_file_open_root(dentry, mnt, filename, &op); 1001 return do_file_open_root(dentry, mnt, filename, &op);
1004} 1002}
1005EXPORT_SYMBOL(file_open_root); 1003EXPORT_SYMBOL(file_open_root);
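The signature change lets in-kernel callers create files through file_open_root(): the unconditional -EINVAL on O_CREAT is gone and the new mode argument now feeds build_open_flags(). A hedged example of the updated call (the dentry/mount variables, path, and mode are illustrative):

    struct file *filp;

    filp = file_open_root(root_dentry, root_mnt,
                          "trace/output.log", O_WRONLY | O_CREAT, 0600);
    if (IS_ERR(filp))
            return PTR_ERR(filp);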
diff --git a/fs/orangefs/Kconfig b/fs/orangefs/Kconfig
new file mode 100644
index 000000000000..1554c02489de
--- /dev/null
+++ b/fs/orangefs/Kconfig
@@ -0,0 +1,6 @@
1config ORANGEFS_FS
2 tristate "ORANGEFS (Powered by PVFS) support"
3 select FS_POSIX_ACL
4 help
5 Orange is a parallel file system designed for use on high end
6 computing (HEC) systems.
diff --git a/fs/orangefs/Makefile b/fs/orangefs/Makefile
new file mode 100644
index 000000000000..a9d6a968fe6d
--- /dev/null
+++ b/fs/orangefs/Makefile
@@ -0,0 +1,10 @@
1#
2# Makefile for the ORANGEFS filesystem.
3#
4
5obj-$(CONFIG_ORANGEFS_FS) += orangefs.o
6
7orangefs-objs := acl.o file.o orangefs-cache.o orangefs-utils.o xattr.o \
8 dcache.o inode.o orangefs-sysfs.o orangefs-mod.o super.o \
9 devorangefs-req.o namei.o symlink.o dir.o orangefs-bufmap.o \
10 orangefs-debugfs.o waitqueue.o
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
new file mode 100644
index 000000000000..03f89dbb2512
--- /dev/null
+++ b/fs/orangefs/acl.c
@@ -0,0 +1,175 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#include "protocol.h"
8#include "orangefs-kernel.h"
9#include "orangefs-bufmap.h"
10#include <linux/posix_acl_xattr.h>
11#include <linux/fs_struct.h>
12
13struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
14{
15 struct posix_acl *acl;
16 int ret;
17 char *key = NULL, *value = NULL;
18
19 switch (type) {
20 case ACL_TYPE_ACCESS:
21 key = ORANGEFS_XATTR_NAME_ACL_ACCESS;
22 break;
23 case ACL_TYPE_DEFAULT:
24 key = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
25 break;
26 default:
27 gossip_err("orangefs_get_acl: bogus value of type %d\n", type);
28 return ERR_PTR(-EINVAL);
29 }
30 /*
31 * Rather than incurring a network call just to determine the exact
32 * length of the attribute, I just allocate a max length to save on
33 * the network call. Conceivably, we could pass NULL to
34 * orangefs_inode_getxattr() to probe the length of the value, but
35 * I don't do that for now.
36 */
37 value = kmalloc(ORANGEFS_MAX_XATTR_VALUELEN, GFP_KERNEL);
38 if (value == NULL)
39 return ERR_PTR(-ENOMEM);
40
41 gossip_debug(GOSSIP_ACL_DEBUG,
42 "inode %pU, key %s, type %d\n",
43 get_khandle_from_ino(inode),
44 key,
45 type);
46 ret = orangefs_inode_getxattr(inode,
47 "",
48 key,
49 value,
50 ORANGEFS_MAX_XATTR_VALUELEN);
51 /* if the key exists, convert it to an in-memory rep */
52 if (ret > 0) {
53 acl = posix_acl_from_xattr(&init_user_ns, value, ret);
54 } else if (ret == -ENODATA || ret == -ENOSYS) {
55 acl = NULL;
56 } else {
57 gossip_err("inode %pU retrieving acl's failed with error %d\n",
58 get_khandle_from_ino(inode),
59 ret);
60 acl = ERR_PTR(ret);
61 }
62 /* kfree(NULL) is safe, so don't worry if value ever got used */
63 kfree(value);
64 return acl;
65}
66
67int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
68{
69 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
70 int error = 0;
71 void *value = NULL;
72 size_t size = 0;
73 const char *name = NULL;
74
75 switch (type) {
76 case ACL_TYPE_ACCESS:
77 name = ORANGEFS_XATTR_NAME_ACL_ACCESS;
78 if (acl) {
79 umode_t mode = inode->i_mode;
80 /*
81 * can we represent this with the traditional file
82 * mode permission bits?
83 */
84 error = posix_acl_equiv_mode(acl, &mode);
85 if (error < 0) {
86 gossip_err("%s: posix_acl_equiv_mode err: %d\n",
87 __func__,
88 error);
89 return error;
90 }
91
92 if (inode->i_mode != mode)
93 SetModeFlag(orangefs_inode);
94 inode->i_mode = mode;
95 mark_inode_dirty_sync(inode);
96 if (error == 0)
97 acl = NULL;
98 }
99 break;
100 case ACL_TYPE_DEFAULT:
101 name = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
102 break;
103 default:
104 gossip_err("%s: invalid type %d!\n", __func__, type);
105 return -EINVAL;
106 }
107
108 gossip_debug(GOSSIP_ACL_DEBUG,
109 "%s: inode %pU, key %s type %d\n",
110 __func__, get_khandle_from_ino(inode),
111 name,
112 type);
113
114 if (acl) {
115 size = posix_acl_xattr_size(acl->a_count);
116 value = kmalloc(size, GFP_KERNEL);
117 if (!value)
118 return -ENOMEM;
119
120 error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
121 if (error < 0)
122 goto out;
123 }
124
125 gossip_debug(GOSSIP_ACL_DEBUG,
126 "%s: name %s, value %p, size %zd, acl %p\n",
127 __func__, name, value, size, acl);
128 /*
129 * Go ahead and set the extended attribute now. NOTE: if acl is
130 * NULL, then value will be NULL and size will be 0, and that
131 * will translate to a removexattr. However, we don't want
132 * removexattr to complain if the attribute does not exist.
133 */
134 error = orangefs_inode_setxattr(inode, "", name, value, size, 0);
135
136out:
137 kfree(value);
138 if (!error)
139 set_cached_acl(inode, type, acl);
140 return error;
141}
142
143int orangefs_init_acl(struct inode *inode, struct inode *dir)
144{
145 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
146 struct posix_acl *default_acl, *acl;
147 umode_t mode = inode->i_mode;
148 int error = 0;
149
150 ClearModeFlag(orangefs_inode);
151
152 error = posix_acl_create(dir, &mode, &default_acl, &acl);
153 if (error)
154 return error;
155
156 if (default_acl) {
157 error = orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
158 posix_acl_release(default_acl);
159 }
160
161 if (acl) {
162 if (!error)
163 error = orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
164 posix_acl_release(acl);
165 }
166
167 /* If the mode of the inode was changed, then do a forcible ->setattr */
168 if (mode != inode->i_mode) {
169 SetModeFlag(orangefs_inode);
170 inode->i_mode = mode;
171 orangefs_flush_inode(inode);
172 }
173
174 return error;
175}
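The ACL_TYPE_ACCESS branch in orangefs_set_acl() above leans on the three-way return of posix_acl_equiv_mode(); annotated for reference (this restates the kernel API contract, not new patch behavior):

    error = posix_acl_equiv_mode(acl, &mode);
    if (error < 0)
            return error;   /* malformed ACL */
    if (error == 0)
            acl = NULL;     /* fully representable as mode bits, so no
                             * xattr is stored; a NULL value later maps
                             * to a removexattr */
    /* error > 0: extended entries remain, keep the ACL xattr */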
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
new file mode 100644
index 000000000000..5dfc4f3cfe68
--- /dev/null
+++ b/fs/orangefs/dcache.c
@@ -0,0 +1,138 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Implementation of dentry (directory cache) functions.
9 */
10
11#include "protocol.h"
12#include "orangefs-kernel.h"
13
14/* Returns 1 if dentry can still be trusted, else 0. */
15static int orangefs_revalidate_lookup(struct dentry *dentry)
16{
17 struct dentry *parent_dentry = dget_parent(dentry);
18 struct inode *parent_inode = parent_dentry->d_inode;
19 struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode);
20 struct inode *inode = dentry->d_inode;
21 struct orangefs_kernel_op_s *new_op;
22 int ret = 0;
23 int err = 0;
24
25 gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
26
27 new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
28 if (!new_op)
29 goto out_put_parent;
30
31 new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
32 new_op->upcall.req.lookup.parent_refn = parent->refn;
33 strncpy(new_op->upcall.req.lookup.d_name,
34 dentry->d_name.name,
35 ORANGEFS_NAME_MAX);
36
37 gossip_debug(GOSSIP_DCACHE_DEBUG,
38 "%s:%s:%d interrupt flag [%d]\n",
39 __FILE__,
40 __func__,
41 __LINE__,
42 get_interruptible_flag(parent_inode));
43
44 err = service_operation(new_op, "orangefs_lookup",
45 get_interruptible_flag(parent_inode));
46
47 /* Positive dentry: reject if error or not the same inode. */
48 if (inode) {
49 if (err) {
50 gossip_debug(GOSSIP_DCACHE_DEBUG,
51 "%s:%s:%d lookup failure.\n",
52 __FILE__, __func__, __LINE__);
53 goto out_drop;
54 }
55 if (!match_handle(new_op->downcall.resp.lookup.refn.khandle,
56 inode)) {
57 gossip_debug(GOSSIP_DCACHE_DEBUG,
58 "%s:%s:%d no match.\n",
59 __FILE__, __func__, __LINE__);
60 goto out_drop;
61 }
62
63 /* Negative dentry: reject if success or error other than ENOENT. */
64 } else {
65 gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: negative dentry.\n",
66 __func__);
67 if (!err || err != -ENOENT) {
68 if (new_op->downcall.status != 0)
69 gossip_debug(GOSSIP_DCACHE_DEBUG,
70 "%s:%s:%d lookup failure.\n",
71 __FILE__, __func__, __LINE__);
72 goto out_drop;
73 }
74 }
75
76 ret = 1;
77out_release_op:
78 op_release(new_op);
79out_put_parent:
80 dput(parent_dentry);
81 return ret;
82out_drop:
83 gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n",
84 __FILE__, __func__, __LINE__);
85 goto out_release_op;
86}
87
88/*
89 * Verify that dentry is valid.
90 *
91 * Should return 1 if dentry can still be trusted, else 0.
92 */
93static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
94{
95 int ret;
96
97 if (flags & LOOKUP_RCU)
98 return -ECHILD;
99
100 gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: called on dentry %p.\n",
101 __func__, dentry);
102
103 /* skip root handle lookups. */
104 if (dentry->d_inode && is_root_handle(dentry->d_inode))
105 return 1;
106
107 /*
108 * If this passes, the positive dentry still exists or the negative
109 * dentry still does not exist.
110 */
111 if (!orangefs_revalidate_lookup(dentry))
112 return 0;
113
114 /* We do not need to continue with negative dentries. */
115 if (!dentry->d_inode)
116 goto out;
117
118 /* Now we must perform a getattr to validate the inode contents. */
119
120 ret = orangefs_inode_check_changed(dentry->d_inode);
121 if (ret < 0) {
122 gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d getattr failure.\n",
123 __FILE__, __func__, __LINE__);
124 return 0;
125 }
126 if (ret == 0)
127 return 0;
128
129out:
130 gossip_debug(GOSSIP_DCACHE_DEBUG,
131 "%s: negative dentry or positive dentry and inode valid.\n",
132 __func__);
133 return 1;
134}
135
136const struct dentry_operations orangefs_dentry_operations = {
137 .d_revalidate = orangefs_d_revalidate,
138};
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
new file mode 100644
index 000000000000..db170beba797
--- /dev/null
+++ b/fs/orangefs/devorangefs-req.c
@@ -0,0 +1,943 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * Changes by Acxiom Corporation to add protocol version to kernel
5 * communication, Copyright Acxiom Corporation, 2005.
6 *
7 * See COPYING in top-level directory.
8 */
9
10#include "protocol.h"
11#include "orangefs-kernel.h"
12#include "orangefs-dev-proto.h"
13#include "orangefs-bufmap.h"
14
15#include <linux/debugfs.h>
16#include <linux/slab.h>
17
18/* this file implements the /dev/pvfs2-req device node */
19
20static int open_access_count;
21
22#define DUMP_DEVICE_ERROR() \
23do { \
24 gossip_err("*****************************************************\n");\
25 gossip_err("ORANGEFS Device Error: You cannot open the device file "); \
26 gossip_err("\n/dev/%s more than once. Please make sure that\nthere " \
27 "are no ", ORANGEFS_REQDEVICE_NAME); \
28 gossip_err("instances of a program using this device\ncurrently " \
29 "running. (You must verify this!)\n"); \
30 gossip_err("For example, you can use the lsof program as follows:\n");\
31 gossip_err("'lsof | grep %s' (run this as root)\n", \
32 ORANGEFS_REQDEVICE_NAME); \
33 gossip_err(" open_access_count = %d\n", open_access_count); \
34 gossip_err("*****************************************************\n");\
35} while (0)
36
37static int hash_func(__u64 tag, int table_size)
38{
39 return do_div(tag, (unsigned int)table_size);
40}
41
42static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
43{
44 int index = hash_func(op->tag, hash_table_size);
45
46 list_add_tail(&op->list, &htable_ops_in_progress[index]);
47}
48
49/*
50 * find the op with this tag and remove it from the in progress
51 * hash table.
52 */
53static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
54{
55 struct orangefs_kernel_op_s *op, *next;
56 int index;
57
58 index = hash_func(tag, hash_table_size);
59
60 spin_lock(&htable_ops_in_progress_lock);
61 list_for_each_entry_safe(op,
62 next,
63 &htable_ops_in_progress[index],
64 list) {
65 if (op->tag == tag && !op_state_purged(op) &&
66 !op_state_given_up(op)) {
67 list_del_init(&op->list);
68 spin_unlock(&htable_ops_in_progress_lock);
69 return op;
70 }
71 }
72
73 spin_unlock(&htable_ops_in_progress_lock);
74 return NULL;
75}
76
77/* Returns whether any FS are still pending remounted */
78static int mark_all_pending_mounts(void)
79{
80 int unmounted = 1;
81 struct orangefs_sb_info_s *orangefs_sb = NULL;
82
83 spin_lock(&orangefs_superblocks_lock);
84 list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
85 /* All of these file systems require a remount */
86 orangefs_sb->mount_pending = 1;
87 unmounted = 0;
88 }
89 spin_unlock(&orangefs_superblocks_lock);
90 return unmounted;
91}
92
93/*
94 * Determine if a given file system needs to be remounted or not
95 * Returns -1 on error
96 * 0 if already mounted
97 * 1 if needs remount
98 */
99static int fs_mount_pending(__s32 fsid)
100{
101 int mount_pending = -1;
102 struct orangefs_sb_info_s *orangefs_sb = NULL;
103
104 spin_lock(&orangefs_superblocks_lock);
105 list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
106 if (orangefs_sb->fs_id == fsid) {
107 mount_pending = orangefs_sb->mount_pending;
108 break;
109 }
110 }
111 spin_unlock(&orangefs_superblocks_lock);
112 return mount_pending;
113}
114
115static int orangefs_devreq_open(struct inode *inode, struct file *file)
116{
117 int ret = -EINVAL;
118
119 if (!(file->f_flags & O_NONBLOCK)) {
120 gossip_err("%s: device cannot be opened in blocking mode\n",
121 __func__);
122 goto out;
123 }
124 ret = -EACCES;
125 gossip_debug(GOSSIP_DEV_DEBUG, "client-core: opening device\n");
126 mutex_lock(&devreq_mutex);
127
128 if (open_access_count == 0) {
129 open_access_count = 1;
130 ret = 0;
131 } else {
132 DUMP_DEVICE_ERROR();
133 }
134 mutex_unlock(&devreq_mutex);
135
136out:
137
138 gossip_debug(GOSSIP_DEV_DEBUG,
139 "pvfs2-client-core: open device complete (ret = %d)\n",
140 ret);
141 return ret;
142}
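Taken together, the open path admits exactly one opener, and only in non-blocking mode; that opener is the client-core daemon. A hypothetical client-side open (the literal device path follows the /dev/pvfs2-req node named at the top of this file):

    #include <fcntl.h>
    #include <stdio.h>

    /* The driver rejects blocking opens outright and returns -EACCES
     * to any second opener. */
    int devfd = open("/dev/pvfs2-req", O_RDWR | O_NONBLOCK);
    if (devfd < 0)
            perror("open /dev/pvfs2-req");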
143
144/* Function for read() callers into the device */
145static ssize_t orangefs_devreq_read(struct file *file,
146 char __user *buf,
147 size_t count, loff_t *offset)
148{
149 struct orangefs_kernel_op_s *op, *temp;
150 __s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION;
151 static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
152 struct orangefs_kernel_op_s *cur_op = NULL;
153 unsigned long ret;
154
155 /* We do not support blocking IO. */
156 if (!(file->f_flags & O_NONBLOCK)) {
157 gossip_err("%s: blocking read from client-core.\n",
158 __func__);
159 return -EINVAL;
160 }
161
162 /*
163 * The client will do an ioctl to find MAX_DEV_REQ_UPSIZE, then
164 * always read with that size buffer.
165 */
166 if (count != MAX_DEV_REQ_UPSIZE) {
167 gossip_err("orangefs: client-core tried to read wrong size\n");
168 return -EINVAL;
169 }
170
171restart:
172 /* Get next op (if any) from top of list. */
173 spin_lock(&orangefs_request_list_lock);
174 list_for_each_entry_safe(op, temp, &orangefs_request_list, list) {
175 __s32 fsid;
176 /* This lock is held past the end of the loop when we break. */
177 spin_lock(&op->lock);
178 if (unlikely(op_state_purged(op) || op_state_given_up(op))) {
179 spin_unlock(&op->lock);
180 continue;
181 }
182
183 fsid = fsid_of_op(op);
184 if (fsid != ORANGEFS_FS_ID_NULL) {
185 int ret;
186 /* Skip ops whose filesystem needs to be mounted. */
187 ret = fs_mount_pending(fsid);
188 if (ret == 1) {
189 gossip_debug(GOSSIP_DEV_DEBUG,
190 "%s: mount pending, skipping op tag "
191 "%llu %s\n",
192 __func__,
193 llu(op->tag),
194 get_opname_string(op));
195 spin_unlock(&op->lock);
196 continue;
197 /*
198 * Skip ops whose filesystem we don't know about unless
199 * it is being mounted.
200 */
201 /* XXX: is there a better way to detect this? */
202 } else if (ret == -1 &&
203 !(op->upcall.type ==
204 ORANGEFS_VFS_OP_FS_MOUNT ||
205 op->upcall.type ==
206 ORANGEFS_VFS_OP_GETATTR)) {
207 gossip_debug(GOSSIP_DEV_DEBUG,
208 "orangefs: skipping op tag %llu %s\n",
209 llu(op->tag), get_opname_string(op));
210 gossip_err(
211 "orangefs: ERROR: fs_mount_pending %d\n",
212 fsid);
213 spin_unlock(&op->lock);
214 continue;
215 }
216 }
217 /*
218 * Either this op does not pertain to a filesystem, is mounting
219 * a filesystem, or pertains to a mounted filesystem. Let it
220 * through.
221 */
222 cur_op = op;
223 break;
224 }
225
226 /*
227 * At this point we either have a valid op and can continue or have not
228 * found an op and must ask the client to try again later.
229 */
230 if (!cur_op) {
231 spin_unlock(&orangefs_request_list_lock);
232 return -EAGAIN;
233 }
234
235 gossip_debug(GOSSIP_DEV_DEBUG, "%s: reading op tag %llu %s\n",
236 __func__,
237 llu(cur_op->tag),
238 get_opname_string(cur_op));
239
240 /*
241 * Such an op should never be on the list in the first place. If so, we
242 * will abort.
243 */
244 if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
245 gossip_err("orangefs: ERROR: Current op already queued.\n");
246 list_del_init(&cur_op->list);
247 spin_unlock(&cur_op->lock);
248 spin_unlock(&orangefs_request_list_lock);
249 return -EAGAIN;
250 }
251
252 list_del_init(&cur_op->list);
253 spin_unlock(&orangefs_request_list_lock);
254
255 spin_unlock(&cur_op->lock);
256
257 /* Push the upcall out. */
258 ret = copy_to_user(buf, &proto_ver, sizeof(__s32));
259 if (ret != 0)
260 goto error;
261 ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32));
262 if (ret != 0)
263 goto error;
264 ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64));
265 if (ret != 0)
266 goto error;
267 ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall,
268 sizeof(struct orangefs_upcall_s));
269 if (ret != 0)
270 goto error;
271
272 spin_lock(&htable_ops_in_progress_lock);
273 spin_lock(&cur_op->lock);
274 if (unlikely(op_state_given_up(cur_op))) {
275 spin_unlock(&cur_op->lock);
276 spin_unlock(&htable_ops_in_progress_lock);
277 complete(&cur_op->waitq);
278 goto restart;
279 }
280
281 /*
282 * Set the operation to be in progress and move it between lists since
283 * it has been sent to the client.
284 */
285 set_op_state_inprogress(cur_op);
286 gossip_debug(GOSSIP_DEV_DEBUG,
287 "%s: 1 op:%s: op_state:%d: process:%s:\n",
288 __func__,
289 get_opname_string(cur_op),
290 cur_op->op_state,
291 current->comm);
292 orangefs_devreq_add_op(cur_op);
293 spin_unlock(&cur_op->lock);
294 spin_unlock(&htable_ops_in_progress_lock);
295
296 /* The client only asks to read one size buffer. */
297 return MAX_DEV_REQ_UPSIZE;
298error:
299 /*
300 * We were unable to copy the op data to the client. Put the op back in
301 * list. If client has crashed, the op will be purged later when the
302 * device is released.
303 */
304 gossip_err("orangefs: Failed to copy data to user space\n");
305 spin_lock(&orangefs_request_list_lock);
306 spin_lock(&cur_op->lock);
307 if (likely(!op_state_given_up(cur_op))) {
308 set_op_state_waiting(cur_op);
309 gossip_debug(GOSSIP_DEV_DEBUG,
310 "%s: 2 op:%s: op_state:%d: process:%s:\n",
311 __func__,
312 get_opname_string(cur_op),
313 cur_op->op_state,
314 current->comm);
315 list_add(&cur_op->list, &orangefs_request_list);
316 spin_unlock(&cur_op->lock);
317 } else {
318 spin_unlock(&cur_op->lock);
319 complete(&cur_op->waitq);
320 }
321 spin_unlock(&orangefs_request_list_lock);
322 return -EFAULT;
323}
324
325/*
326 * Function for writev() callers into the device.
327 *
328 * Userspace should have written:
329 * - __u32 version
330 * - __u32 magic
331 * - __u64 tag
332 * - struct orangefs_downcall_s
333 * - trailer buffer (in the case of READDIR operations)
334 */
335static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
336 struct iov_iter *iter)
337{
338 ssize_t ret;
339 struct orangefs_kernel_op_s *op = NULL;
340 struct {
341 __u32 version;
342 __u32 magic;
343 __u64 tag;
344 } head;
345 int total = ret = iov_iter_count(iter);
346 int n;
347 int downcall_size = sizeof(struct orangefs_downcall_s);
348 int head_size = sizeof(head);
349
350 gossip_debug(GOSSIP_DEV_DEBUG, "%s: total:%d: ret:%zd:\n",
351 __func__,
352 total,
353 ret);
354
355 if (total < MAX_DEV_REQ_DOWNSIZE) {
356 gossip_err("%s: total:%d: must be at least:%u:\n",
357 __func__,
358 total,
359 (unsigned int) MAX_DEV_REQ_DOWNSIZE);
360 return -EFAULT;
361 }
362
363 n = copy_from_iter(&head, head_size, iter);
364 if (n < head_size) {
365 gossip_err("%s: failed to copy head.\n", __func__);
366 return -EFAULT;
367 }
368
369 if (head.version < ORANGEFS_MINIMUM_USERSPACE_VERSION) {
370 gossip_err("%s: userspace claims version"
371 "%d, minimum version required: %d.\n",
372 __func__,
373 head.version,
374 ORANGEFS_MINIMUM_USERSPACE_VERSION);
375 return -EPROTO;
376 }
377
378 if (head.magic != ORANGEFS_DEVREQ_MAGIC) {
379 gossip_err("Error: Device magic number does not match.\n");
380 return -EPROTO;
381 }
382
383 /* remove the op from the in progress hash table */
384 op = orangefs_devreq_remove_op(head.tag);
385 if (!op) {
386 gossip_err("WARNING: No one's waiting for tag %llu\n",
387 llu(head.tag));
388 return ret;
389 }
390
391 n = copy_from_iter(&op->downcall, downcall_size, iter);
392 if (n != downcall_size) {
393 gossip_err("%s: failed to copy downcall.\n", __func__);
394 goto Efault;
395 }
396
397 if (op->downcall.status)
398 goto wakeup;
399
400 /*
401 * We've successfully peeled off the head and the downcall.
402 * Something has gone awry if total doesn't equal the
403 * sum of head_size, downcall_size and trailer_size.
404 */
405 if ((head_size + downcall_size + op->downcall.trailer_size) != total) {
406 gossip_err("%s: funky write, head_size:%d"
407 ": downcall_size:%d: trailer_size:%lld"
408 ": total size:%d:\n",
409 __func__,
410 head_size,
411 downcall_size,
412 op->downcall.trailer_size,
413 total);
414 goto Efault;
415 }
416
417 /* Only READDIR operations should have trailers. */
418 if ((op->downcall.type != ORANGEFS_VFS_OP_READDIR) &&
419 (op->downcall.trailer_size != 0)) {
420 gossip_err("%s: %x operation with trailer.",
421 __func__,
422 op->downcall.type);
423 goto Efault;
424 }
425
426 /* READDIR operations should always have trailers. */
427 if ((op->downcall.type == ORANGEFS_VFS_OP_READDIR) &&
428 (op->downcall.trailer_size == 0)) {
429 gossip_err("%s: %x operation with no trailer.",
430 __func__,
431 op->downcall.type);
432 goto Efault;
433 }
434
435 if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
436 goto wakeup;
437
438 op->downcall.trailer_buf =
439 vmalloc(op->downcall.trailer_size);
440 if (op->downcall.trailer_buf == NULL) {
441 gossip_err("%s: failed trailer vmalloc.\n",
442 __func__);
443 goto Enomem;
444 }
445 memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
446 n = copy_from_iter(op->downcall.trailer_buf,
447 op->downcall.trailer_size,
448 iter);
449 if (n != op->downcall.trailer_size) {
450 gossip_err("%s: failed to copy trailer.\n", __func__);
451 vfree(op->downcall.trailer_buf);
452 goto Efault;
453 }
454
455wakeup:
456 /*
457 * Return to vfs waitqueue, and back to service_operation
458 * through wait_for_matching_downcall.
459 */
460 spin_lock(&op->lock);
461 if (unlikely(op_is_cancel(op))) {
462 spin_unlock(&op->lock);
463 put_cancel(op);
464 } else if (unlikely(op_state_given_up(op))) {
465 spin_unlock(&op->lock);
466 complete(&op->waitq);
467 } else {
468 set_op_state_serviced(op);
469 gossip_debug(GOSSIP_DEV_DEBUG,
470 "%s: op:%s: op_state:%d: process:%s:\n",
471 __func__,
472 get_opname_string(op),
473 op->op_state,
474 current->comm);
475 spin_unlock(&op->lock);
476 }
477 return ret;
478
479Efault:
480 op->downcall.status = -(ORANGEFS_ERROR_BIT | 9);
481 ret = -EFAULT;
482 goto wakeup;
483
484Enomem:
485 op->downcall.status = -(ORANGEFS_ERROR_BIT | 8);
486 ret = -ENOMEM;
487 goto wakeup;
488}
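For reference, the byte stream this function peels apart is the header described in the comment above, then the downcall, then an optional trailer. A sketch of the frame as the client-core would lay it out (packing and naming are assumptions; the kernel reads the head and downcall with separate copy_from_iter() calls, so no such struct need exist in userspace):

    struct downcall_frame {
            __u32 version;  /* >= ORANGEFS_MINIMUM_USERSPACE_VERSION */
            __u32 magic;    /* ORANGEFS_DEVREQ_MAGIC                 */
            __u64 tag;      /* ties the reply to an in-progress op   */
            struct orangefs_downcall_s downcall;
            /* READDIR only: downcall.trailer_size trailer bytes follow */
    };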
489
490/*
491 * NOTE: gets called when the last reference to this device is dropped.
492 * Using the open_access_count variable, we enforce a reference count
493 * on this file so that it can be opened by only one process at a time.
494 * The devreq_mutex is used to make sure all I/O has completed
495 * before we call orangefs_bufmap_finalize, and to cover similar
496 * tricky situations.
497 */
498static int orangefs_devreq_release(struct inode *inode, struct file *file)
499{
500 int unmounted = 0;
501
502 gossip_debug(GOSSIP_DEV_DEBUG,
503 "%s:pvfs2-client-core: exiting, closing device\n",
504 __func__);
505
506 mutex_lock(&devreq_mutex);
507 orangefs_bufmap_finalize();
508
509 open_access_count = -1;
510
511 unmounted = mark_all_pending_mounts();
512 gossip_debug(GOSSIP_DEV_DEBUG, "ORANGEFS Device Close: Filesystem(s) %s\n",
513 (unmounted ? "UNMOUNTED" : "MOUNTED"));
514
515 purge_waiting_ops();
516 purge_inprogress_ops();
517
518 orangefs_bufmap_run_down();
519
520 gossip_debug(GOSSIP_DEV_DEBUG,
521 "pvfs2-client-core: device close complete\n");
522 open_access_count = 0;
523 mutex_unlock(&devreq_mutex);
524 return 0;
525}
526
527int is_daemon_in_service(void)
528{
529 int in_service;
530
531 /*
532 * This function checks whether the client-core is alive, based
533 * on the access count we maintain on the device.
534 */
535 mutex_lock(&devreq_mutex);
536 in_service = open_access_count == 1 ? 0 : -EIO;
537 mutex_unlock(&devreq_mutex);
538 return in_service;
539}
540
541bool __is_daemon_in_service(void)
542{
543 return open_access_count == 1;
544}
545
546static inline long check_ioctl_command(unsigned int command)
547{
548 /* Check for valid ioctl codes */
549 if (_IOC_TYPE(command) != ORANGEFS_DEV_MAGIC) {
550 gossip_err("device ioctl magic numbers don't match! Did you rebuild pvfs2-client-core/libpvfs2? [cmd %x, magic %x != %x]\n",
551 command,
552 _IOC_TYPE(command),
553 ORANGEFS_DEV_MAGIC);
554 return -EINVAL;
555 }
556 /* and valid ioctl commands */
557 if (_IOC_NR(command) >= ORANGEFS_DEV_MAXNR || _IOC_NR(command) <= 0) {
558 gossip_err("Invalid ioctl command number [%d >= %d]\n",
559 _IOC_NR(command), ORANGEFS_DEV_MAXNR);
560 return -ENOIOCTLCMD;
561 }
562 return 0;
563}
564
565static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
566{
567 static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
568 static __s32 max_up_size = MAX_DEV_REQ_UPSIZE;
569 static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE;
570 struct ORANGEFS_dev_map_desc user_desc;
571 int ret = 0;
572 struct dev_mask_info_s mask_info = { 0 };
573 struct dev_mask2_info_s mask2_info = { 0, 0 };
574 int upstream_kmod = 1;
575 struct orangefs_sb_info_s *orangefs_sb;
576
577 /* mtmoore: add locking here */
578
579 switch (command) {
580 case ORANGEFS_DEV_GET_MAGIC:
581 return ((put_user(magic, (__s32 __user *) arg) == -EFAULT) ?
582 -EIO :
583 0);
584 case ORANGEFS_DEV_GET_MAX_UPSIZE:
585 return ((put_user(max_up_size,
586 (__s32 __user *) arg) == -EFAULT) ?
587 -EIO :
588 0);
589 case ORANGEFS_DEV_GET_MAX_DOWNSIZE:
590 return ((put_user(max_down_size,
591 (__s32 __user *) arg) == -EFAULT) ?
592 -EIO :
593 0);
594 case ORANGEFS_DEV_MAP:
595 ret = copy_from_user(&user_desc,
596 (struct ORANGEFS_dev_map_desc __user *)
597 arg,
598 sizeof(struct ORANGEFS_dev_map_desc));
599 /* WTF -EIO and not -EFAULT? */
600 return ret ? -EIO : orangefs_bufmap_initialize(&user_desc);
601 case ORANGEFS_DEV_REMOUNT_ALL:
602 gossip_debug(GOSSIP_DEV_DEBUG,
603 "%s: got ORANGEFS_DEV_REMOUNT_ALL\n",
604 __func__);
605
606 /*
607 * remount all mounted orangefs volumes to regain the lost
608 * dynamic mount tables (if any) -- NOTE: this is done
609 * without keeping the superblock list locked due to the
610 * upcall/downcall waiting. also, the request mutex is
611 * used to ensure that no operations will be serviced until
612 * all of the remounts are serviced (to avoid ops between
613 * mounts to fail)
614 */
615 ret = mutex_lock_interruptible(&request_mutex);
616 if (ret < 0)
617 return ret;
618 gossip_debug(GOSSIP_DEV_DEBUG,
619 "%s: priority remount in progress\n",
620 __func__);
621 spin_lock(&orangefs_superblocks_lock);
622 list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
623 /*
624 * We have to drop the spinlock, so entries can be
625 * removed. They can't be freed, though, so we just
626 * keep the forward pointers and zero the back ones -
627 * that way we can get to the rest of the list.
628 */
629 if (!orangefs_sb->list.prev)
630 continue;
631 gossip_debug(GOSSIP_DEV_DEBUG,
632 "%s: Remounting SB %p\n",
633 __func__,
634 orangefs_sb);
635
636 spin_unlock(&orangefs_superblocks_lock);
637 ret = orangefs_remount(orangefs_sb);
638 spin_lock(&orangefs_superblocks_lock);
639 if (ret) {
640 gossip_debug(GOSSIP_DEV_DEBUG,
641 "SB %p remount failed\n",
642 orangefs_sb);
643 break;
644 }
645 }
646 spin_unlock(&orangefs_superblocks_lock);
647 gossip_debug(GOSSIP_DEV_DEBUG,
648 "%s: priority remount complete\n",
649 __func__);
650 mutex_unlock(&request_mutex);
651 return ret;
652
653 case ORANGEFS_DEV_UPSTREAM:
654 ret = copy_to_user((void __user *)arg,
655 &upstream_kmod,
656 sizeof(upstream_kmod));
657
658 if (ret != 0)
659 return -EIO;
660 else
661 return ret;
662
663 case ORANGEFS_DEV_CLIENT_MASK:
664 ret = copy_from_user(&mask2_info,
665 (void __user *)arg,
666 sizeof(struct dev_mask2_info_s));
667
668 if (ret != 0)
669 return -EIO;
670
671 client_debug_mask.mask1 = mask2_info.mask1_value;
672 client_debug_mask.mask2 = mask2_info.mask2_value;
673
674 pr_info("%s: client debug mask has been received "
675 ":%llx: :%llx:\n",
676 __func__,
677 (unsigned long long)client_debug_mask.mask1,
678 (unsigned long long)client_debug_mask.mask2);
679
680 return ret;
681
682 case ORANGEFS_DEV_CLIENT_STRING:
683 ret = copy_from_user(&client_debug_array_string,
684 (void __user *)arg,
685 ORANGEFS_MAX_DEBUG_STRING_LEN);
686 /*
687 * The real client-core makes an effort to ensure that
688 * the strings it sends are short enough to fit in this
689 * buffer. We're going to use string functions on the
690 * data we got, so we make this extra effort to keep
691 * from running off the end of the buffer when we use
692 * those string functions, even if the data we ended up
693 * with here is somehow garbage.
694 *
695 */
696 client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
697 '\0';
698
699 if (ret != 0) {
700 pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
701 __func__);
702 return -EIO;
703 }
704
705 pr_info("%s: client debug array string has been received.\n",
706 __func__);
707
708 if (!help_string_initialized) {
709
710 /* Free the "we don't know yet" default string... */
711 kfree(debug_help_string);
712
713 /* build a proper debug help string */
714 if (orangefs_prepare_debugfs_help_string(0)) {
715 gossip_err("%s: no debug help string\n",
716 __func__);
717 return -EIO;
718 }
719
720 /* Replace the boilerplate boot-time debug-help file. */
721 debugfs_remove(help_file_dentry);
722
723 help_file_dentry =
724 debugfs_create_file(
725 ORANGEFS_KMOD_DEBUG_HELP_FILE,
726 0444,
727 debug_dir,
728 debug_help_string,
729 &debug_help_fops);
730
731 if (!help_file_dentry) {
732 gossip_err("%s: debugfs_create_file failed for"
733 " :%s:!\n",
734 __func__,
735 ORANGEFS_KMOD_DEBUG_HELP_FILE);
736 return -EIO;
737 }
738 }
739
740 debug_mask_to_string(&client_debug_mask, 1);
741
742 debugfs_remove(client_debug_dentry);
743
744 orangefs_client_debug_init();
745
746 help_string_initialized++;
747
748 return ret;
749
750 case ORANGEFS_DEV_DEBUG:
751 ret = copy_from_user(&mask_info,
752 (void __user *)arg,
753 sizeof(mask_info));
754
755 if (ret != 0)
756 return -EIO;
757
758 if (mask_info.mask_type == KERNEL_MASK) {
759 if ((mask_info.mask_value == 0)
760 && (kernel_mask_set_mod_init)) {
761 /*
762 * the kernel debug mask was set when the
763 * kernel module was loaded; don't override
764 * it if the client-core was started without
765 * a value for ORANGEFS_KMODMASK.
766 */
767 return 0;
768 }
769 debug_mask_to_string(&mask_info.mask_value,
770 mask_info.mask_type);
771 gossip_debug_mask = mask_info.mask_value;
772 pr_info("%s: kernel debug mask has been modified to "
773 ":%s: :%llx:\n",
774 __func__,
775 kernel_debug_string,
776 (unsigned long long)gossip_debug_mask);
777 } else if (mask_info.mask_type == CLIENT_MASK) {
778 debug_mask_to_string(&mask_info.mask_value,
779 mask_info.mask_type);
780 pr_info("%s: client debug mask has been modified to "
781 ":%s: :%llx:\n",
782 __func__,
783 client_debug_string,
784 llu(mask_info.mask_value));
785 } else {
786 gossip_lerr("Invalid mask type....\n");
787 return -EINVAL;
788 }
789
790 return ret;
791
792 default:
793 return -ENOIOCTLCMD;
794 }
795 return -ENOIOCTLCMD;
796}
797
798static long orangefs_devreq_ioctl(struct file *file,
799 unsigned int command, unsigned long arg)
800{
801 long ret;
802
803 /* Check for properly constructed commands */
804 ret = check_ioctl_command(command);
805 if (ret < 0)
806 return (int)ret;
807
808 return (int)dispatch_ioctl_command(command, arg);
809}
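A plausible client-core startup handshake against these ioctls (hedged: the ordering and error handling are assumptions, but each GET command takes a pointer to an __s32, as the put_user() calls above show):

    __s32 magic, up_size, down_size;

    if (ioctl(devfd, ORANGEFS_DEV_GET_MAGIC, &magic) < 0 ||
        ioctl(devfd, ORANGEFS_DEV_GET_MAX_UPSIZE, &up_size) < 0 ||
        ioctl(devfd, ORANGEFS_DEV_GET_MAX_DOWNSIZE, &down_size) < 0)
            return -1;
    /* every subsequent read() must pass a buffer of exactly up_size */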
810
811#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */
812
813/* Compat structure for the ORANGEFS_DEV_MAP ioctl */
814struct ORANGEFS_dev_map_desc32 {
815 compat_uptr_t ptr;
816 __s32 total_size;
817 __s32 size;
818 __s32 count;
819};
820
821static unsigned long translate_dev_map26(unsigned long args, long *error)
822{
823 struct ORANGEFS_dev_map_desc32 __user *p32 = (void __user *)args;
824 /*
825 * Depending on the architecture, allocate some space on the
826 * user-call-stack based on our expected layout.
827 */
828 struct ORANGEFS_dev_map_desc __user *p =
829 compat_alloc_user_space(sizeof(*p));
830 compat_uptr_t addr;
831
832 *error = 0;
833 /* get the ptr from the 32 bit user-space */
834 if (get_user(addr, &p32->ptr))
835 goto err;
836 /* try to put that into a 64-bit layout */
837 if (put_user(compat_ptr(addr), &p->ptr))
838 goto err;
839 /* copy the remaining fields */
840 if (copy_in_user(&p->total_size, &p32->total_size, sizeof(__s32)))
841 goto err;
842 if (copy_in_user(&p->size, &p32->size, sizeof(__s32)))
843 goto err;
844 if (copy_in_user(&p->count, &p32->count, sizeof(__s32)))
845 goto err;
846 return (unsigned long)p;
847err:
848 *error = -EFAULT;
849 return 0;
850}
851
852/*
853 * ioctl handler for 32-bit user-space apps when the kernel
854 * module is compiled as 64-bit
855 */
856static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd,
857 unsigned long args)
858{
859 long ret;
860 unsigned long arg = args;
861
862 /* Check for properly constructed commands */
863 ret = check_ioctl_command(cmd);
864 if (ret < 0)
865 return ret;
866 if (cmd == ORANGEFS_DEV_MAP) {
867 /*
868 * convert the arguments to what we expect internally
869 * in kernel space
870 */
871 arg = translate_dev_map26(args, &ret);
872 if (ret < 0) {
873 gossip_err("Could not translate dev map\n");
874 return ret;
875 }
876 }
877 /* no other ioctl requires translation */
878 return dispatch_ioctl_command(cmd, arg);
879}
880
881#endif /* CONFIG_COMPAT is in .config */
882
883/* the assigned character device major number */
884static int orangefs_dev_major;
885
886/*
887 * Initialize orangefs device specific state:
888 * Must be called at module load time only
889 */
890int orangefs_dev_init(void)
891{
892 /* register orangefs-req device */
893 orangefs_dev_major = register_chrdev(0,
894 ORANGEFS_REQDEVICE_NAME,
895 &orangefs_devreq_file_operations);
896 if (orangefs_dev_major < 0) {
897 gossip_debug(GOSSIP_DEV_DEBUG,
898 "Failed to register /dev/%s (error %d)\n",
899 ORANGEFS_REQDEVICE_NAME, orangefs_dev_major);
900 return orangefs_dev_major;
901 }
902
903 gossip_debug(GOSSIP_DEV_DEBUG,
904 "*** /dev/%s character device registered ***\n",
905 ORANGEFS_REQDEVICE_NAME);
906 gossip_debug(GOSSIP_DEV_DEBUG, "'mknod /dev/%s c %d 0'.\n",
907 ORANGEFS_REQDEVICE_NAME, orangefs_dev_major);
908 return 0;
909}
910
911void orangefs_dev_cleanup(void)
912{
913 unregister_chrdev(orangefs_dev_major, ORANGEFS_REQDEVICE_NAME);
914 gossip_debug(GOSSIP_DEV_DEBUG,
915 "*** /dev/%s character device unregistered ***\n",
916 ORANGEFS_REQDEVICE_NAME);
917}
918
919static unsigned int orangefs_devreq_poll(struct file *file,
920 struct poll_table_struct *poll_table)
921{
922 int poll_revent_mask = 0;
923
924 poll_wait(file, &orangefs_request_list_waitq, poll_table);
925
926 if (!list_empty(&orangefs_request_list))
927 poll_revent_mask |= POLL_IN;
928 return poll_revent_mask;
929}
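The poll hook lets the client-core sleep until an op is queued rather than spinning on -EAGAIN from read(). A hypothetical event loop on the userspace side:

    #include <poll.h>
    #include <unistd.h>

    struct pollfd pfd = { .fd = devfd, .events = POLLIN };

    for (;;) {
            if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
                    /* one op per read; the size must be exactly
                     * MAX_DEV_REQ_UPSIZE or the driver returns -EINVAL */
                    read(devfd, buf, MAX_DEV_REQ_UPSIZE);
            }
    }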
930
931const struct file_operations orangefs_devreq_file_operations = {
932 .owner = THIS_MODULE,
933 .read = orangefs_devreq_read,
934 .write_iter = orangefs_devreq_write_iter,
935 .open = orangefs_devreq_open,
936 .release = orangefs_devreq_release,
937 .unlocked_ioctl = orangefs_devreq_ioctl,
938
939#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */
940 .compat_ioctl = orangefs_devreq_compat_ioctl,
941#endif
942 .poll = orangefs_devreq_poll
943};
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
new file mode 100644
index 000000000000..f30b6ecacdd1
--- /dev/null
+++ b/fs/orangefs/dir.c
@@ -0,0 +1,400 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#include "protocol.h"
8#include "orangefs-kernel.h"
9#include "orangefs-bufmap.h"
10
11/*
12 * decode routine used by kmod to deal with the blob sent from
13 * userspace for readdirs. The blob contains zero or more of these
14 * sub-blobs:
15 * __u32 - represents length of the character string that follows.
16 * string - between 1 and ORANGEFS_NAME_MAX bytes long.
17 * padding - (if needed) to cause the __u32 plus the string to be
18 * eight byte aligned.
19 * khandle - sizeof(khandle) bytes.
20 */
21static long decode_dirents(char *ptr, size_t size,
22 struct orangefs_readdir_response_s *readdir)
23{
24 int i;
25 struct orangefs_readdir_response_s *rd =
26 (struct orangefs_readdir_response_s *) ptr;
27 char *buf = ptr;
28 int khandle_size = sizeof(struct orangefs_khandle);
29 size_t offset = offsetof(struct orangefs_readdir_response_s,
30 dirent_array);
31 /* 8 reflects eight byte alignment */
32 int smallest_blob = khandle_size + 8;
33 __u32 len;
34 int aligned_len;
35 int sizeof_u32 = sizeof(__u32);
36 long ret;
37
38 gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size);
39
40 /* size == offset on empty dirs, size > offset on non-empty dirs... */
41 if (size < offset) {
42 gossip_err("%s: size:%zu: offset:%zu:\n",
43 __func__,
44 size,
45 offset);
46 ret = -EINVAL;
47 goto out;
48 }
49
50 if ((size == offset) && (rd->orangefs_dirent_outcount != 0)) {
51 gossip_err("%s: size:%zu: dirent_outcount:%d:\n",
52 __func__,
53 size,
54 rd->orangefs_dirent_outcount);
55 ret = -EINVAL;
56 goto out;
57 }
58
59 readdir->token = rd->token;
60 readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount;
61 readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount,
62 sizeof(*readdir->dirent_array),
63 GFP_KERNEL);
64 if (readdir->dirent_array == NULL) {
65 gossip_err("%s: kcalloc failed.\n", __func__);
66 ret = -ENOMEM;
67 goto out;
68 }
69
70 buf += offset;
71 size -= offset;
72
73 for (i = 0; i < readdir->orangefs_dirent_outcount; i++) {
74 if (size < smallest_blob) {
75 gossip_err("%s: size:%zu: smallest_blob:%d:\n",
76 __func__,
77 size,
78 smallest_blob);
79 ret = -EINVAL;
80 goto free;
81 }
82
83 len = *(__u32 *)buf;
84 if ((len < 1) || (len > ORANGEFS_NAME_MAX)) {
85 gossip_err("%s: len:%d:\n", __func__, len);
86 ret = -EINVAL;
87 goto free;
88 }
89
90 gossip_debug(GOSSIP_DIR_DEBUG,
91 "%s: size:%zu: len:%d:\n",
92 __func__,
93 size,
94 len);
95
96 readdir->dirent_array[i].d_name = buf + sizeof_u32;
97 readdir->dirent_array[i].d_length = len;
98
99 /*
100 * Calculate "aligned" length of this string and its
101 * associated __u32 descriptor.
102 */
103 aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7;
104 gossip_debug(GOSSIP_DIR_DEBUG,
105 "%s: aligned_len:%d:\n",
106 __func__,
107 aligned_len);
108
109 /*
110 * The end of the blob should coincide with the end
111 * of the last sub-blob.
112 */
113 if (size < aligned_len + khandle_size) {
114 gossip_err("%s: ran off the end of the blob.\n",
115 __func__);
116 ret = -EINVAL;
117 goto free;
118 }
119 size -= aligned_len + khandle_size;
120
121 buf += aligned_len;
122
123 readdir->dirent_array[i].khandle =
124 *(struct orangefs_khandle *) buf;
125 buf += khandle_size;
126 }
127 ret = buf - ptr;
128 gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret);
129 goto out;
130
131free:
132 kfree(readdir->dirent_array);
133 readdir->dirent_array = NULL;
134
135out:
136 return ret;
137}
138
139/*
140 * Read directory entries from an instance of an open directory.
141 */
142static int orangefs_readdir(struct file *file, struct dir_context *ctx)
143{
144 int ret = 0;
145 int buffer_index;
146 /*
147 * ptoken supports Orangefs' distributed directory logic, added
148 * in 2.9.2.
149 */
150 __u64 *ptoken = file->private_data;
151 __u64 pos = 0;
152 ino_t ino = 0;
153 struct dentry *dentry = file->f_path.dentry;
154 struct orangefs_kernel_op_s *new_op = NULL;
155 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
156 int buffer_full = 0;
157 struct orangefs_readdir_response_s readdir_response;
158 void *dents_buf;
159 int i = 0;
160 int len = 0;
161 ino_t current_ino = 0;
162 char *current_entry = NULL;
163 long bytes_decoded;
164
165 gossip_debug(GOSSIP_DIR_DEBUG,
166 "%s: ctx->pos:%lld, ptoken = %llu\n",
167 __func__,
168 lld(ctx->pos),
169 llu(*ptoken));
170
171 pos = (__u64) ctx->pos;
172
173 /* are we done? */
174 if (pos == ORANGEFS_READDIR_END) {
175 gossip_debug(GOSSIP_DIR_DEBUG,
176 "Skipping to termination path\n");
177 return 0;
178 }
179
180 gossip_debug(GOSSIP_DIR_DEBUG,
181 "orangefs_readdir called on %s (pos=%llu)\n",
182 dentry->d_name.name, llu(pos));
183
184 memset(&readdir_response, 0, sizeof(readdir_response));
185
186 new_op = op_alloc(ORANGEFS_VFS_OP_READDIR);
187 if (!new_op)
188 return -ENOMEM;
189
190 /*
191 * Only the indices are shared. No memory is actually shared, but the
192 * mechanism is used.
193 */
194 new_op->uses_shared_memory = 1;
195 new_op->upcall.req.readdir.refn = orangefs_inode->refn;
196 new_op->upcall.req.readdir.max_dirent_count =
197 ORANGEFS_MAX_DIRENT_COUNT_READDIR;
198
199 gossip_debug(GOSSIP_DIR_DEBUG,
200 "%s: upcall.req.readdir.refn.khandle: %pU\n",
201 __func__,
202 &new_op->upcall.req.readdir.refn.khandle);
203
204 new_op->upcall.req.readdir.token = *ptoken;
205
206get_new_buffer_index:
207 buffer_index = orangefs_readdir_index_get();
208 if (buffer_index < 0) {
209 ret = buffer_index;
210 gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n",
211 ret);
212 goto out_free_op;
213 }
214 new_op->upcall.req.readdir.buf_index = buffer_index;
215
216 ret = service_operation(new_op,
217 "orangefs_readdir",
218 get_interruptible_flag(dentry->d_inode));
219
220 gossip_debug(GOSSIP_DIR_DEBUG,
221 "Readdir downcall status is %d. ret:%d\n",
222 new_op->downcall.status,
223 ret);
224
225 orangefs_readdir_index_put(buffer_index);
226
227 if (ret == -EAGAIN && op_state_purged(new_op)) {
228 /* Client-core indices are invalid after it restarted. */
229 gossip_debug(GOSSIP_DIR_DEBUG,
230 "%s: Getting new buffer_index for retry of readdir..\n",
231 __func__);
232 goto get_new_buffer_index;
233 }
234
235 if (ret == -EIO && op_state_purged(new_op)) {
236 gossip_err("%s: Client is down. Aborting readdir call.\n",
237 __func__);
238 goto out_free_op;
239 }
240
241 if (ret < 0 || new_op->downcall.status != 0) {
242 gossip_debug(GOSSIP_DIR_DEBUG,
243 "Readdir request failed. Status:%d\n",
244 new_op->downcall.status);
245 if (ret >= 0)
246 ret = new_op->downcall.status;
247 goto out_free_op;
248 }
249
250 dents_buf = new_op->downcall.trailer_buf;
251 if (dents_buf == NULL) {
252 gossip_err("Invalid NULL buffer in readdir response\n");
253 ret = -ENOMEM;
254 goto out_free_op;
255 }
256
257 bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
258 &readdir_response);
259 if (bytes_decoded < 0) {
260 ret = bytes_decoded;
261 gossip_err("Could not decode readdir from buffer %d\n", ret);
262 goto out_vfree;
263 }
264
265 if (bytes_decoded != new_op->downcall.trailer_size) {
266 gossip_err("orangefs_readdir: # bytes decoded (%ld) "
267 "!= trailer size (%ld)\n",
268 bytes_decoded,
269 (long)new_op->downcall.trailer_size);
270 ret = -EINVAL;
271 goto out_destroy_handle;
272 }
273
274 /*
275 * orangefs doesn't actually store dot and dot-dot, but
276 * we need to have them represented.
277 */
278 if (pos == 0) {
279 ino = get_ino_from_khandle(dentry->d_inode);
280 gossip_debug(GOSSIP_DIR_DEBUG,
281 "%s: calling dir_emit of \".\" with pos = %llu\n",
282 __func__,
283 llu(pos));
284 ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
285 pos += 1;
286 }
287
288 if (pos == 1) {
289 ino = get_parent_ino_from_dentry(dentry);
290 gossip_debug(GOSSIP_DIR_DEBUG,
291 "%s: calling dir_emit of \"..\" with pos = %llu\n",
292 __func__,
293 llu(pos));
294 ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
295 pos += 1;
296 }
297
298 /*
299 * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around
300 * to prevent "finding" dot and dot-dot on any iteration
301 * other than the first.
302 */
303 if (ctx->pos == ORANGEFS_ITERATE_NEXT)
304 ctx->pos = 0;
305
306 gossip_debug(GOSSIP_DIR_DEBUG,
307 "%s: dirent_outcount:%d:\n",
308 __func__,
309 readdir_response.orangefs_dirent_outcount);
310 for (i = ctx->pos;
311 i < readdir_response.orangefs_dirent_outcount;
312 i++) {
313 len = readdir_response.dirent_array[i].d_length;
314 current_entry = readdir_response.dirent_array[i].d_name;
315 current_ino = orangefs_khandle_to_ino(
316 &readdir_response.dirent_array[i].khandle);
317
318 gossip_debug(GOSSIP_DIR_DEBUG,
319 "calling dir_emit for %s with len %d"
320 ", ctx->pos %ld\n",
321 current_entry,
322 len,
323 (unsigned long)ctx->pos);
324 /*
325 * type is unknown. We don't return object type
326 * in the dirent_array. This leaves getdents
327 * clueless about type.
328 */
329 ret =
330 dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
331 if (!ret)
332 break;
333 ctx->pos++;
334 gossip_debug(GOSSIP_DIR_DEBUG,
335 "%s: ctx->pos:%lld\n",
336 __func__,
337 lld(ctx->pos));
338
339 }
340
341 /*
342 * we ran all the way through the last batch, set up for
343 * getting another batch...
344 */
345 if (ret) {
346 *ptoken = readdir_response.token;
347 ctx->pos = ORANGEFS_ITERATE_NEXT;
348 }
349
350 /*
351 * Did we hit the end of the directory?
352 */
353 if (readdir_response.token == ORANGEFS_READDIR_END &&
354 !buffer_full) {
355 gossip_debug(GOSSIP_DIR_DEBUG,
356 "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
357 ctx->pos = ORANGEFS_READDIR_END;
358 }
359
360out_destroy_handle:
361 /* kfree(NULL) is safe */
362 kfree(readdir_response.dirent_array);
363out_vfree:
364 gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
365 vfree(dents_buf);
368out_free_op:
369 op_release(new_op);
370 gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
371 return ret;
372}
373
374static int orangefs_dir_open(struct inode *inode, struct file *file)
375{
376 __u64 *ptoken;
377
378 file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL);
379 if (!file->private_data)
380 return -ENOMEM;
381
382 ptoken = file->private_data;
383 *ptoken = ORANGEFS_READDIR_START;
384 return 0;
385}
386
387static int orangefs_dir_release(struct inode *inode, struct file *file)
388{
389 orangefs_flush_inode(inode);
390 kfree(file->private_data);
391 return 0;
392}
393
394/** ORANGEFS implementation of VFS directory operations */
395const struct file_operations orangefs_dir_operations = {
396 .read = generic_read_dir,
397 .iterate = orangefs_readdir,
398 .open = orangefs_dir_open,
399 .release = orangefs_dir_release,
400};
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
new file mode 100644
index 000000000000..66b99210f1f9
--- /dev/null
+++ b/fs/orangefs/downcall.h
@@ -0,0 +1,133 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Definitions of downcalls used in Linux kernel module.
9 */
10
11#ifndef __DOWNCALL_H
12#define __DOWNCALL_H
13
14/*
15 * Sanitized the device/client-core interaction
16 * for clean 32/64-bit usage.
17 */
18struct orangefs_io_response {
19 __s64 amt_complete;
20};
21
22struct orangefs_lookup_response {
23 struct orangefs_object_kref refn;
24};
25
26struct orangefs_create_response {
27 struct orangefs_object_kref refn;
28};
29
30struct orangefs_symlink_response {
31 struct orangefs_object_kref refn;
32};
33
34struct orangefs_getattr_response {
35 struct ORANGEFS_sys_attr_s attributes;
36 char link_target[ORANGEFS_NAME_MAX];
37};
38
39struct orangefs_mkdir_response {
40 struct orangefs_object_kref refn;
41};
42
43/*
44 * duplication of some system interface structures so that I don't have
45 * to allocate extra memory
46 */
47struct orangefs_dirent {
48 char *d_name;
49 int d_length;
50 struct orangefs_khandle khandle;
51};
52
53struct orangefs_statfs_response {
54 __s64 block_size;
55 __s64 blocks_total;
56 __s64 blocks_avail;
57 __s64 files_total;
58 __s64 files_avail;
59};
60
61struct orangefs_fs_mount_response {
62 __s32 fs_id;
63 __s32 id;
64 struct orangefs_khandle root_khandle;
65};
66
67/* the getxattr response is the attribute value */
68struct orangefs_getxattr_response {
69 __s32 val_sz;
70 __s32 __pad1;
71 char val[ORANGEFS_MAX_XATTR_VALUELEN];
72};
73
74/* the listxattr response is an array of attribute names */
75struct orangefs_listxattr_response {
76 __s32 returned_count;
77 __s32 __pad1;
78 __u64 token;
79 char key[ORANGEFS_MAX_XATTR_LISTLEN * ORANGEFS_MAX_XATTR_NAMELEN];
80 __s32 keylen;
81 __s32 __pad2;
82 __s32 lengths[ORANGEFS_MAX_XATTR_LISTLEN];
83};
84
85struct orangefs_param_response {
86 __s64 value;
87};
88
89#define PERF_COUNT_BUF_SIZE 4096
90struct orangefs_perf_count_response {
91 char buffer[PERF_COUNT_BUF_SIZE];
92};
93
94#define FS_KEY_BUF_SIZE 4096
95struct orangefs_fs_key_response {
96 __s32 fs_keylen;
97 __s32 __pad1;
98 char fs_key[FS_KEY_BUF_SIZE];
99};
100
101struct orangefs_downcall_s {
102 __s32 type;
103 __s32 status;
104 /* currently trailer is used only by readdir */
105 __s64 trailer_size;
106 char *trailer_buf;
107
108 union {
109 struct orangefs_io_response io;
110 struct orangefs_lookup_response lookup;
111 struct orangefs_create_response create;
112 struct orangefs_symlink_response sym;
113 struct orangefs_getattr_response getattr;
114 struct orangefs_mkdir_response mkdir;
115 struct orangefs_statfs_response statfs;
116 struct orangefs_fs_mount_response fs_mount;
117 struct orangefs_getxattr_response getxattr;
118 struct orangefs_listxattr_response listxattr;
119 struct orangefs_param_response param;
120 struct orangefs_perf_count_response perf_count;
121 struct orangefs_fs_key_response fs_key;
122 } resp;
123};
124
125struct orangefs_readdir_response_s {
126 __u64 token;
127 __u64 directory_version;
128 __u32 __pad2;
129 __u32 orangefs_dirent_outcount;
130 struct orangefs_dirent *dirent_array;
131};
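
/*
 * Note: decode_dirents() in dir.c fills dirent_array from the readdir
 * trailer blob; each d_name points into the trailer buffer itself
 * rather than into a separate allocation, so the trailer must stay
 * alive for as long as the array is in use.
 */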
132
133#endif /* __DOWNCALL_H */
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
new file mode 100644
index 000000000000..ae92795ed965
--- /dev/null
+++ b/fs/orangefs/file.c
@@ -0,0 +1,717 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS file operations.
9 */
10
11#include "protocol.h"
12#include "orangefs-kernel.h"
13#include "orangefs-bufmap.h"
14#include <linux/fs.h>
15#include <linux/pagemap.h>
16
17/*
18 * Copy to client-core's address space from the buffers specified
19 * by the iovec up to total_size bytes.
20 * NOTE: the iovec can contain either addresses (which may in
21 * turn be kernel-space or user-space addresses) or pointers
22 * to struct page's.
23 */
24static int precopy_buffers(int buffer_index,
25 struct iov_iter *iter,
26 size_t total_size)
27{
28 int ret = 0;
29 /*
30 * copy data from application/kernel by pulling it out
31 * of the iovec.
32 */
33
35 if (total_size) {
36 ret = orangefs_bufmap_copy_from_iovec(iter,
37 buffer_index,
38 total_size);
39 if (ret < 0)
40 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
41 __func__,
42 (long)ret);
43 }
44
45 if (ret < 0)
46 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
47 __func__,
48 (long)ret);
49 return ret;
50}
51
52/*
53 * Copy from client-core's address space to the buffers specified
54 * by the iovec up to total_size bytes.
55 * NOTE: the iovec can contain either addresses (which may in
56 * turn be kernel-space or user-space addresses) or pointers
57 * to struct page's.
58 */
59static int postcopy_buffers(int buffer_index,
60 struct iov_iter *iter,
61 size_t total_size)
62{
63 int ret = 0;
64 /*
65 * copy data to application/kernel by pushing it out to
66 * the iovec. NOTE; target buffers can be addresses or
67 * struct page pointers.
68 */
69 if (total_size) {
70 ret = orangefs_bufmap_copy_to_iovec(iter,
71 buffer_index,
72 total_size);
73 if (ret < 0)
74 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
75 __func__,
76 (long)ret);
77 }
78 return ret;
79}
80
81/*
82 * Post and wait for the I/O upcall to finish
83 */
84static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
85 loff_t *offset, struct iov_iter *iter,
86 size_t total_size, loff_t readahead_size)
87{
88 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
89 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
90 struct orangefs_kernel_op_s *new_op = NULL;
91 struct iov_iter saved = *iter;
92 int buffer_index = -1;
93 ssize_t ret;
94
95 new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
96 if (!new_op)
97 return -ENOMEM;
98
99 /* synchronous I/O */
100 new_op->upcall.req.io.readahead_size = readahead_size;
101 new_op->upcall.req.io.io_type = type;
102 new_op->upcall.req.io.refn = orangefs_inode->refn;
103
104populate_shared_memory:
105 /* get a shared buffer index */
106 buffer_index = orangefs_bufmap_get();
107 if (buffer_index < 0) {
108 ret = buffer_index;
109 gossip_debug(GOSSIP_FILE_DEBUG,
110 "%s: orangefs_bufmap_get failure (%zd)\n",
111 __func__, ret);
112 goto out;
113 }
114 gossip_debug(GOSSIP_FILE_DEBUG,
115 "%s(%pU): GET op %p -> buffer_index %d\n",
116 __func__,
117 handle,
118 new_op,
119 buffer_index);
120
121 new_op->uses_shared_memory = 1;
122 new_op->upcall.req.io.buf_index = buffer_index;
123 new_op->upcall.req.io.count = total_size;
124 new_op->upcall.req.io.offset = *offset;
125
126 gossip_debug(GOSSIP_FILE_DEBUG,
127 "%s(%pU): offset: %llu total_size: %zd\n",
128 __func__,
129 handle,
130 llu(*offset),
131 total_size);
132 /*
133 * Stage 1: copy the buffers into client-core's address space
134 * precopy_buffers only pertains to writes.
135 */
136 if (type == ORANGEFS_IO_WRITE) {
137 ret = precopy_buffers(buffer_index,
138 iter,
139 total_size);
140 if (ret < 0)
141 goto out;
142 }
143
144 gossip_debug(GOSSIP_FILE_DEBUG,
145 "%s(%pU): Calling post_io_request with tag (%llu)\n",
146 __func__,
147 handle,
148 llu(new_op->tag));
149
150 /* Stage 2: Service the I/O operation */
151 ret = service_operation(new_op,
152 type == ORANGEFS_IO_WRITE ?
153 "file_write" :
154 "file_read",
155 get_interruptible_flag(inode));
156
157 /*
158 * If service_operation() returns -EAGAIN #and# the operation was
159 * purged from orangefs_request_list or htable_ops_in_progress, then
160 * we know that the client was restarted, causing the shared memory
161 * area to be wiped clean. To restart a write operation in this
162 * case, we must re-copy the data from the user's iovec to a NEW
163 * shared memory location. To restart a read operation, we must get
164 * a new shared memory location.
165 */
166 if (ret == -EAGAIN && op_state_purged(new_op)) {
167 orangefs_bufmap_put(buffer_index);
168 buffer_index = -1;
169 if (type == ORANGEFS_IO_WRITE)
170 *iter = saved;
171 gossip_debug(GOSSIP_FILE_DEBUG,
172 "%s:going to repopulate_shared_memory.\n",
173 __func__);
174 goto populate_shared_memory;
175 }
176
177 if (ret < 0) {
178 if (ret == -EINTR) {
179 /*
180 * We can't return EINTR if any data was written,
181 * it's not POSIX. It is minimally acceptable
182 * to give a partial write, the way NFS does.
183 *
184 * It would be optimal to return all or nothing,
185 * but if a userspace write is bigger than
186 * an IO buffer, and the interrupt occurs
187 * between buffer writes, that would not be
188 * possible.
189 */
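			/*
			 * At this point the op is expected to have been
			 * given up, so op_state is the pre-interrupt
			 * state with OP_VFS_STATE_GIVEN_UP OR'd in;
			 * subtracting that bit back out recovers the
			 * state the op was in when the interrupt
			 * arrived.
			 */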
190 switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) {
191 /*
192 * If the op was waiting when the interrupt
193 * occurred, then the client-core did not
194 * trigger the write.
195 */
196 case OP_VFS_STATE_WAITING:
197 if (*offset == 0)
198 ret = -EINTR;
199 else
200 ret = 0;
201 break;
202 /*
203 * If the op was in progress when the interrupt
204 * occurred, then the client-core was able to
205 * trigger the write.
206 */
207 case OP_VFS_STATE_INPROGR:
208 ret = total_size;
209 break;
210 default:
211 gossip_err("%s: unexpected op state :%d:.\n",
212 __func__,
213 new_op->op_state);
214 ret = 0;
215 break;
216 }
217 gossip_debug(GOSSIP_FILE_DEBUG,
218 "%s: got EINTR, state:%d: %p\n",
219 __func__,
220 new_op->op_state,
221 new_op);
222 } else {
223 gossip_err("%s: error in %s handle %pU, returning %zd\n",
224 __func__,
225 type == ORANGEFS_IO_READ ?
226 "read from" : "write to",
227 handle, ret);
228 }
229 if (orangefs_cancel_op_in_progress(new_op))
230 return ret;
231
232 goto out;
233 }
234
235 /*
236 * Stage 3: Post copy buffers from client-core's address space
237 * postcopy_buffers only pertains to reads.
238 */
239 if (type == ORANGEFS_IO_READ) {
240 ret = postcopy_buffers(buffer_index,
241 iter,
242 new_op->downcall.resp.io.amt_complete);
243 if (ret < 0)
244 goto out;
245 }
246 gossip_debug(GOSSIP_FILE_DEBUG,
247 "%s(%pU): Amount %s, returned by the sys-io call:%d\n",
248 __func__,
249 handle,
250 type == ORANGEFS_IO_READ ? "read" : "written",
251 (int)new_op->downcall.resp.io.amt_complete);
252
253 ret = new_op->downcall.resp.io.amt_complete;
254
255out:
256 if (buffer_index >= 0) {
257 orangefs_bufmap_put(buffer_index);
258 gossip_debug(GOSSIP_FILE_DEBUG,
259 "%s(%pU): PUT buffer_index %d\n",
260 __func__, handle, buffer_index);
261 buffer_index = -1;
262 }
263 op_release(new_op);
264 return ret;
265}
266
267/*
268 * Common entry point for read/write/readv/writev
269 * This function will dispatch it to either the direct I/O
270 * or buffered I/O path depending on the mount options and/or
271 * augmented/extended metadata attached to the file.
272 * Note: File extended attributes override any mount options.
273 */
274static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
275 loff_t *offset, struct iov_iter *iter)
276{
277 struct inode *inode = file->f_mapping->host;
278 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
279 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
280 size_t count = iov_iter_count(iter);
281 ssize_t total_count = 0;
282 ssize_t ret = -EINVAL;
283
284 gossip_debug(GOSSIP_FILE_DEBUG,
285 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
286 __func__,
287 handle,
288 (int)count);
289
290 if (type == ORANGEFS_IO_WRITE) {
291 gossip_debug(GOSSIP_FILE_DEBUG,
292 "%s(%pU): proceeding with offset : %llu, "
293 "size %d\n",
294 __func__,
295 handle,
296 llu(*offset),
297 (int)count);
298 }
299
300 if (count == 0) {
301 ret = 0;
302 goto out;
303 }
304
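	/*
	 * Illustrative sketch (sizes are hypothetical): with a 4 MB
	 * shared buffer, a 10 MB request is serviced by the loop below
	 * as 4 MB + 4 MB + 2 MB transfers; a short transfer ends the
	 * loop early and the byte count accumulated so far is returned.
	 */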
305 while (iov_iter_count(iter)) {
306 size_t each_count = iov_iter_count(iter);
307 size_t amt_complete;
308
309 /* how much to transfer in this loop iteration */
310 if (each_count > orangefs_bufmap_size_query())
311 each_count = orangefs_bufmap_size_query();
312
313 gossip_debug(GOSSIP_FILE_DEBUG,
314 "%s(%pU): size of each_count(%d)\n",
315 __func__,
316 handle,
317 (int)each_count);
318 gossip_debug(GOSSIP_FILE_DEBUG,
319 "%s(%pU): BEFORE wait_for_io: offset is %d\n",
320 __func__,
321 handle,
322 (int)*offset);
323
324 ret = wait_for_direct_io(type, inode, offset, iter,
325 each_count, 0);
326 gossip_debug(GOSSIP_FILE_DEBUG,
327 "%s(%pU): return from wait_for_io:%d\n",
328 __func__,
329 handle,
330 (int)ret);
331
332 if (ret < 0)
333 goto out;
334
335 *offset += ret;
336 total_count += ret;
337 amt_complete = ret;
338
339 gossip_debug(GOSSIP_FILE_DEBUG,
340 "%s(%pU): AFTER wait_for_io: offset is %d\n",
341 __func__,
342 handle,
343 (int)*offset);
344
345 /*
346 * if we got a short I/O operation,
347 * fall out and return what we got so far
348 */
349 if (amt_complete < each_count)
350 break;
351 } /*end while */
352
353out:
354 if (total_count > 0)
355 ret = total_count;
356 if (ret > 0) {
357 if (type == ORANGEFS_IO_READ) {
358 file_accessed(file);
359 } else {
360 SetMtimeFlag(orangefs_inode);
361 inode->i_mtime = CURRENT_TIME;
362 mark_inode_dirty_sync(inode);
363 }
364 }
365
366 gossip_debug(GOSSIP_FILE_DEBUG,
367 "%s(%pU): Value(%d) returned.\n",
368 __func__,
369 handle,
370 (int)ret);
371
372 return ret;
373}
374
375/*
376 * Read data from a specified offset in a file (referenced by inode).
377 * Data may be placed either in a user or kernel buffer.
378 */
379ssize_t orangefs_inode_read(struct inode *inode,
380 struct iov_iter *iter,
381 loff_t *offset,
382 loff_t readahead_size)
383{
384 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
385 size_t count = iov_iter_count(iter);
386 size_t bufmap_size;
387 ssize_t ret = -EINVAL;
388
389 g_orangefs_stats.reads++;
390
391 bufmap_size = orangefs_bufmap_size_query();
392 if (count > bufmap_size) {
393 gossip_debug(GOSSIP_FILE_DEBUG,
394 "%s: count is too large (%zd/%zd)!\n",
395 __func__, count, bufmap_size);
396 return -EINVAL;
397 }
398
399 gossip_debug(GOSSIP_FILE_DEBUG,
400 "%s(%pU) %zd@%llu\n",
401 __func__,
402 &orangefs_inode->refn.khandle,
403 count,
404 llu(*offset));
405
406 ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter,
407 count, readahead_size);
408 if (ret > 0)
409 *offset += ret;
410
411 gossip_debug(GOSSIP_FILE_DEBUG,
412 "%s(%pU): Value(%zd) returned.\n",
413 __func__,
414 &orangefs_inode->refn.khandle,
415 ret);
416
417 return ret;
418}
419
420static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
421{
422 struct file *file = iocb->ki_filp;
423 loff_t pos = *(&iocb->ki_pos);
424 ssize_t rc = 0;
425
426 BUG_ON(iocb->private);
427
428 gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
429
430 g_orangefs_stats.reads++;
431
432 rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
433 iocb->ki_pos = pos;
434
435 return rc;
436}
437
438static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
439{
440 struct file *file = iocb->ki_filp;
441 loff_t pos;
442 ssize_t rc;
443
444 BUG_ON(iocb->private);
445
446 gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
447
448 mutex_lock(&file->f_mapping->host->i_mutex);
449
450 /* Make sure generic_write_checks sees an up to date inode size. */
451 if (file->f_flags & O_APPEND) {
452 rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
453 if (rc == -ESTALE)
454 rc = -EIO;
455 if (rc) {
456 gossip_err("%s: orangefs_inode_getattr failed, "
457 "rc:%zd:.\n", __func__, rc);
458 goto out;
459 }
460 }
461
462 if (file->f_pos > i_size_read(file->f_mapping->host))
463 orangefs_i_size_write(file->f_mapping->host, file->f_pos);
464
465 rc = generic_write_checks(iocb, iter);
466
467 if (rc <= 0) {
468 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
469 __func__, rc);
470 goto out;
471 }
472
473 /*
474 * if we are appending, generic_write_checks would have updated
475 * pos to the end of the file, so we will wait till now to set
476 * pos...
477 */
478 pos = *(&iocb->ki_pos);
479
480 rc = do_readv_writev(ORANGEFS_IO_WRITE,
481 file,
482 &pos,
483 iter);
484 if (rc < 0) {
485 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
486 __func__, rc);
487 goto out;
488 }
489
490 iocb->ki_pos = pos;
491 g_orangefs_stats.writes++;
492
493out:
494
495 mutex_unlock(&file->f_mapping->host->i_mutex);
496 return rc;
497}
498
499/*
500 * Perform a miscellaneous operation on a file.
501 */
502static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
503{
504 int ret = -ENOTTY;
505 __u64 val = 0;
506 unsigned long uval;
507
508 gossip_debug(GOSSIP_FILE_DEBUG,
509 "orangefs_ioctl: called with cmd %d\n",
510 cmd);
511
512 /*
513 * we understand some general ioctls on files, such as the immutable
514 * and append flags
515 */
516 if (cmd == FS_IOC_GETFLAGS) {
517 val = 0;
518 ret = orangefs_inode_getxattr(file_inode(file),
519 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
520 "user.pvfs2.meta_hint",
521 &val, sizeof(val));
522 if (ret < 0 && ret != -ENODATA)
523 return ret;
524 else if (ret == -ENODATA)
525 val = 0;
526 uval = val;
527 gossip_debug(GOSSIP_FILE_DEBUG,
528 "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
529 (unsigned long long)uval);
530 return put_user(uval, (int __user *)arg);
531 } else if (cmd == FS_IOC_SETFLAGS) {
532 ret = 0;
533 if (get_user(uval, (int __user *)arg))
534 return -EFAULT;
535 /*
536 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode
537 * is turned on for a file. The user is not allowed to turn
538 * on this bit, but the bit is present if the user first gets
539 * the flags and then updates the flags with some new
540 * settings. So, we ignore it in the following edit. bligon.
541 */
542 if ((uval & ~ORANGEFS_MIRROR_FL) &
543 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
544 gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
545 return -EINVAL;
546 }
547 val = uval;
548 gossip_debug(GOSSIP_FILE_DEBUG,
549 "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
550 (unsigned long long)val);
551 ret = orangefs_inode_setxattr(file_inode(file),
552 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
553 "user.pvfs2.meta_hint",
554 &val, sizeof(val), 0);
555 }
556
557 return ret;
558}
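
/*
 * Userspace sketch (hypothetical, assumes <sys/ioctl.h> and
 * <linux/fs.h>): how the flags handled above are typically read and
 * updated:
 *
 *	int flags;
 *
 *	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
 *		flags |= FS_NOATIME_FL;
 *		ioctl(fd, FS_IOC_SETFLAGS, &flags);
 *	}
 *
 * On orangefs these bits are backed by the "user.pvfs2.meta_hint"
 * extended attribute rather than by on-disk inode flags.
 */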
559
560/*
561 * Memory map a region of a file.
562 */
563static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
564{
565 gossip_debug(GOSSIP_FILE_DEBUG,
566 "orangefs_file_mmap: called on %s\n",
567 (file ?
568 (char *)file->f_path.dentry->d_name.name :
569 (char *)"Unknown"));
570
571 /* set the sequential readahead hint */
572 vma->vm_flags |= VM_SEQ_READ;
573 vma->vm_flags &= ~VM_RAND_READ;
574
575 /* Use readonly mmap since we cannot support writable maps. */
576 return generic_file_readonly_mmap(file, vma);
577}
578
579#define mapping_nrpages(idata) ((idata)->nrpages)
580
581/*
582 * Called to notify the module that there are no more references to
583 * this file (i.e. no processes have it open).
584 *
585 * \note Not called when each file is closed.
586 */
587static int orangefs_file_release(struct inode *inode, struct file *file)
588{
589 gossip_debug(GOSSIP_FILE_DEBUG,
590 "orangefs_file_release: called on %s\n",
591 file->f_path.dentry->d_name.name);
592
593 orangefs_flush_inode(inode);
594
595 /*
596 * remove all associated inode pages from the page cache and mmap
597 * readahead cache (if any); this forces an expensive refresh of
598 * data for the next caller of mmap (or 'get_block' accesses)
599 */
600 if (file->f_path.dentry->d_inode &&
601 file->f_path.dentry->d_inode->i_mapping &&
602 mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
603 truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
604 0);
605 return 0;
606}
607
608/*
609 * Push all data for a specific file onto permanent storage.
610 */
611static int orangefs_fsync(struct file *file,
612 loff_t start,
613 loff_t end,
614 int datasync)
615{
616 int ret = -EINVAL;
617 struct orangefs_inode_s *orangefs_inode =
618 ORANGEFS_I(file->f_path.dentry->d_inode);
619 struct orangefs_kernel_op_s *new_op = NULL;
620
621 /* required call */
622 filemap_write_and_wait_range(file->f_mapping, start, end);
623
624 new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
625 if (!new_op)
626 return -ENOMEM;
627 new_op->upcall.req.fsync.refn = orangefs_inode->refn;
628
629 ret = service_operation(new_op,
630 "orangefs_fsync",
631 get_interruptible_flag(file->f_path.dentry->d_inode));
632
633 gossip_debug(GOSSIP_FILE_DEBUG,
634 "orangefs_fsync got return value of %d\n",
635 ret);
636
637 op_release(new_op);
638
639 orangefs_flush_inode(file->f_path.dentry->d_inode);
640 return ret;
641}
642
643/*
644 * Change the file pointer position for an instance of an open file.
645 *
646 * \note If .llseek is overridden, we must acquire the lock as
647 * described in Documentation/filesystems/Locking.
648 *
649 * A future upgrade could support SEEK_DATA and SEEK_HOLE but would
650 * require substantial changes to the FS.
651 */
652static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
653{
654 int ret = -EINVAL;
655 struct inode *inode = file_inode(file);
656
657 if (origin == SEEK_END) {
658 /*
659 * revalidate the inode's file size.
660 * NOTE: We are only interested in file size here,
661 * so we set mask accordingly.
662 */
663 ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
664 if (ret == -ESTALE)
665 ret = -EIO;
666 if (ret) {
667 gossip_debug(GOSSIP_FILE_DEBUG,
668 "%s:%s:%d calling make bad inode\n",
669 __FILE__,
670 __func__,
671 __LINE__);
672 return ret;
673 }
674 }
675
676 gossip_debug(GOSSIP_FILE_DEBUG,
677 "orangefs_file_llseek: offset is %ld | origin is %d"
678 " | inode size is %lu\n",
679 (long)offset,
680 origin,
681 (unsigned long)i_size_read(inode));
682
683 return generic_file_llseek(file, offset, origin);
684}
685
686/*
687 * Support local locks (locks that only this kernel knows about)
688 * if Orangefs was mounted -o local_lock.
689 */
690static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
691{
692 int rc = -EINVAL;
693
694 if (ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
695 if (cmd == F_GETLK) {
696 rc = 0;
697 posix_test_lock(filp, fl);
698 } else {
699 rc = posix_lock_file(filp, fl, NULL);
700 }
701 }
702
703 return rc;
704}
705
706/** ORANGEFS implementation of VFS file operations */
707const struct file_operations orangefs_file_operations = {
708 .llseek = orangefs_file_llseek,
709 .read_iter = orangefs_file_read_iter,
710 .write_iter = orangefs_file_write_iter,
711 .lock = orangefs_lock,
712 .unlocked_ioctl = orangefs_ioctl,
713 .mmap = orangefs_file_mmap,
714 .open = generic_file_open,
715 .release = orangefs_file_release,
716 .fsync = orangefs_fsync,
717};
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
new file mode 100644
index 000000000000..2382e267b49e
--- /dev/null
+++ b/fs/orangefs/inode.c
@@ -0,0 +1,475 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS inode operations.
9 */
10
11#include "protocol.h"
12#include "orangefs-kernel.h"
13#include "orangefs-bufmap.h"
14
15static int read_one_page(struct page *page)
16{
17 int ret;
18 int max_block;
19 ssize_t bytes_read = 0;
20 struct inode *inode = page->mapping->host;
21 const __u32 blocksize = PAGE_CACHE_SIZE; /* inode->i_blksize */
22 const __u32 blockbits = PAGE_CACHE_SHIFT; /* inode->i_blkbits */
23 struct iov_iter to;
24 struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
25
26 iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
27
28 gossip_debug(GOSSIP_INODE_DEBUG,
29 "orangefs_readpage called with page %p\n",
30 page);
31
32 max_block = ((inode->i_size / blocksize) + 1);
33
34 if (page->index < max_block) {
35 loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
36
37 bytes_read = orangefs_inode_read(inode,
38 &to,
39 &blockptr_offset,
40 inode->i_size);
41 }
42 /* this will only zero remaining unread portions of the page data */
43 iov_iter_zero(~0U, &to);
44 /* takes care of potential aliasing */
45 flush_dcache_page(page);
46 if (bytes_read < 0) {
47 ret = bytes_read;
48 SetPageError(page);
49 } else {
50 SetPageUptodate(page);
51 if (PageError(page))
52 ClearPageError(page);
53 ret = 0;
54 }
55 /* unlock the page after the ->readpage() routine completes */
56 unlock_page(page);
57 return ret;
58}
59
60static int orangefs_readpage(struct file *file, struct page *page)
61{
62 return read_one_page(page);
63}
64
65static int orangefs_readpages(struct file *file,
66 struct address_space *mapping,
67 struct list_head *pages,
68 unsigned nr_pages)
69{
70 int page_idx;
71 int ret;
72
73 gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpages called\n");
74
75 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
76 struct page *page;
77
78 page = list_entry(pages->prev, struct page, lru);
79 list_del(&page->lru);
80 if (!add_to_page_cache(page,
81 mapping,
82 page->index,
83 GFP_KERNEL)) {
84 ret = read_one_page(page);
85 gossip_debug(GOSSIP_INODE_DEBUG,
86 "failure adding page to cache, read_one_page returned: %d\n",
87 ret);
88 } else {
89 page_cache_release(page);
90 }
91 }
92 BUG_ON(!list_empty(pages));
93 return 0;
94}
95
96static void orangefs_invalidatepage(struct page *page,
97 unsigned int offset,
98 unsigned int length)
99{
100 gossip_debug(GOSSIP_INODE_DEBUG,
101 "orangefs_invalidatepage called on page %p "
102 "(offset is %u)\n",
103 page,
104 offset);
105
106 ClearPageUptodate(page);
107 ClearPageMappedToDisk(page);
110}
111
112static int orangefs_releasepage(struct page *page, gfp_t foo)
113{
114 gossip_debug(GOSSIP_INODE_DEBUG,
115 "orangefs_releasepage called on page %p\n",
116 page);
117 return 0;
118}
119
120/*
121 * Having a direct_IO entry point in the address_space_operations
122 * struct causes the kernel to allow us to use O_DIRECT on
123 * open. Nothing will ever call this thing, but in the future we
124 * will need to be able to use O_DIRECT on open in order to support
125 * AIO. Modeled after NFS, they do this too.
126 */
127/*
128 * static ssize_t orangefs_direct_IO(int rw,
129 * struct kiocb *iocb,
130 * struct iov_iter *iter,
131 * loff_t offset)
132 *{
133 * gossip_debug(GOSSIP_INODE_DEBUG,
134 * "orangefs_direct_IO: %s\n",
135 * iocb->ki_filp->f_path.dentry->d_name.name);
136 *
137 * return -EINVAL;
138 *}
139 */
140
141struct backing_dev_info orangefs_backing_dev_info = {
142 .name = "orangefs",
143 .ra_pages = 0,
144 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
145};
146
147/** ORANGEFS2 implementation of address space operations */
148const struct address_space_operations orangefs_address_operations = {
149 .readpage = orangefs_readpage,
150 .readpages = orangefs_readpages,
151 .invalidatepage = orangefs_invalidatepage,
152 .releasepage = orangefs_releasepage,
153/* .direct_IO = orangefs_direct_IO */
154};
155
156static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
157{
158 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
159 struct orangefs_kernel_op_s *new_op;
160 loff_t orig_size;
161 int ret = -EINVAL;
162
163 gossip_debug(GOSSIP_INODE_DEBUG,
164 "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
165 __func__,
166 get_khandle_from_ino(inode),
167 &orangefs_inode->refn.khandle,
168 orangefs_inode->refn.fs_id,
169 iattr->ia_size);
170
171 /* Ensure that we have an up-to-date size, so we know if it changed. */
172 ret = orangefs_inode_getattr(inode, 0, 1);
173 if (ret == -ESTALE)
174 ret = -EIO;
175 if (ret) {
176 gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
177 __func__, ret);
178 return ret;
179 }
180 orig_size = i_size_read(inode);
181
182 truncate_setsize(inode, iattr->ia_size);
183
184 new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE);
185 if (!new_op)
186 return -ENOMEM;
187
188 new_op->upcall.req.truncate.refn = orangefs_inode->refn;
189 new_op->upcall.req.truncate.size = (__s64) iattr->ia_size;
190
191 ret = service_operation(new_op, __func__,
192 get_interruptible_flag(inode));
193
194 /*
195 * the truncate has no downcall members to retrieve, but
196 * the status value tells us if it went through ok or not
197 */
198 gossip_debug(GOSSIP_INODE_DEBUG,
199 "orangefs: orangefs_truncate got return value of %d\n",
200 ret);
201
202 op_release(new_op);
203
204 if (ret != 0)
205 return ret;
206
207 /*
208 * Only change the c/mtime if we are changing the size or we are
209 * explicitly asked to change it. This handles the semantic difference
210 * between truncate() and ftruncate() as implemented in the VFS.
211 *
212 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
213 * special case where we need to update the times despite not having
214 * these flags set. For all other operations the VFS set these flags
215 * explicitly if it wants a timestamp update.
216 */
217 if (orig_size != i_size_read(inode) &&
218 !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
219 iattr->ia_ctime = iattr->ia_mtime =
220 current_fs_time(inode->i_sb);
221 iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
222 }
223
224 return ret;
225}
226
227/*
228 * Change attributes of an object referenced by dentry.
229 */
230int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
231{
232 int ret = -EINVAL;
233 struct inode *inode = dentry->d_inode;
234
235 gossip_debug(GOSSIP_INODE_DEBUG,
236 "orangefs_setattr: called on %s\n",
237 dentry->d_name.name);
238
239 ret = inode_change_ok(inode, iattr);
240 if (ret)
241 goto out;
242
243 if ((iattr->ia_valid & ATTR_SIZE) &&
244 iattr->ia_size != i_size_read(inode)) {
245 ret = orangefs_setattr_size(inode, iattr);
246 if (ret)
247 goto out;
248 }
249
250 setattr_copy(inode, iattr);
251 mark_inode_dirty(inode);
252
253 ret = orangefs_inode_setattr(inode, iattr);
254 gossip_debug(GOSSIP_INODE_DEBUG,
255 "orangefs_setattr: inode_setattr returned %d\n",
256 ret);
257
258 if (!ret && (iattr->ia_valid & ATTR_MODE))
259 /* change mode on a file that has ACLs */
260 ret = posix_acl_chmod(inode, inode->i_mode);
261
262out:
263 gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", ret);
264 return ret;
265}
266
267/*
268 * Obtain attributes of an object given a dentry
269 */
270int orangefs_getattr(struct vfsmount *mnt,
271 struct dentry *dentry,
272 struct kstat *kstat)
273{
274 int ret = -ENOENT;
275 struct inode *inode = dentry->d_inode;
276 struct orangefs_inode_s *orangefs_inode = NULL;
277
278 gossip_debug(GOSSIP_INODE_DEBUG,
279 "orangefs_getattr: called on %s\n",
280 dentry->d_name.name);
281
282 ret = orangefs_inode_getattr(inode, 0, 1);
283 if (ret == 0) {
284 generic_fillattr(inode, kstat);
285
286 /* override block size reported to stat */
287 orangefs_inode = ORANGEFS_I(inode);
288 kstat->blksize = orangefs_inode->blksize;
289 }
290 return ret;
291}
292
293int orangefs_permission(struct inode *inode, int mask)
294{
295 int ret;
296
297 if (mask & MAY_NOT_BLOCK)
298 return -ECHILD;
299
300 gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
301
302 /* Make sure the permission (and other common attrs) are up to date. */
303 ret = orangefs_inode_getattr(inode, 0, 0);
304 if (ret < 0)
305 return ret;
306
307 return generic_permission(inode, mask);
308}
309
310/* ORANGEFS2 implementation of VFS inode operations for files */
311struct inode_operations orangefs_file_inode_operations = {
312 .get_acl = orangefs_get_acl,
313 .set_acl = orangefs_set_acl,
314 .setattr = orangefs_setattr,
315 .getattr = orangefs_getattr,
316 .setxattr = generic_setxattr,
317 .getxattr = generic_getxattr,
318 .listxattr = orangefs_listxattr,
319 .removexattr = generic_removexattr,
320 .permission = orangefs_permission,
321};
322
323static int orangefs_init_iops(struct inode *inode)
324{
325 inode->i_mapping->a_ops = &orangefs_address_operations;
326
327 switch (inode->i_mode & S_IFMT) {
328 case S_IFREG:
329 inode->i_op = &orangefs_file_inode_operations;
330 inode->i_fop = &orangefs_file_operations;
331 inode->i_blkbits = PAGE_CACHE_SHIFT;
332 break;
333 case S_IFLNK:
334 inode->i_op = &orangefs_symlink_inode_operations;
335 break;
336 case S_IFDIR:
337 inode->i_op = &orangefs_dir_inode_operations;
338 inode->i_fop = &orangefs_dir_operations;
339 break;
340 default:
341 gossip_debug(GOSSIP_INODE_DEBUG,
342 "%s: unsupported mode\n",
343 __func__);
344 return -EINVAL;
345 }
346
347 return 0;
348}
349
350/*
351 * Given an ORANGEFS object identifier (fsid, handle), convert it into an ino_t type
352 * that will be used as a hash-index from where the handle will
353 * be searched for in the VFS hash table of inodes.
354 */
355static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref)
356{
357 if (!ref)
358 return 0;
359 return orangefs_khandle_to_ino(&(ref->khandle));
360}
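
/*
 * Since ino_t is narrower than a 16-byte khandle, distinct handles can
 * hash to the same ino_t; iget5_locked() resolves such collisions by
 * calling orangefs_test_inode() below, which compares the full khandle
 * and fs_id.
 */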
361
362/*
363 * Called to set up an inode from iget5_locked.
364 */
365static int orangefs_set_inode(struct inode *inode, void *data)
366{
367 struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
368 ORANGEFS_I(inode)->refn.fs_id = ref->fs_id;
369 ORANGEFS_I(inode)->refn.khandle = ref->khandle;
370 return 0;
371}
372
373/*
374 * Called to determine if handles match.
375 */
376static int orangefs_test_inode(struct inode *inode, void *data)
377{
378 struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
379 struct orangefs_inode_s *orangefs_inode = NULL;
380
381 orangefs_inode = ORANGEFS_I(inode);
382 return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), &(ref->khandle))
383 && orangefs_inode->refn.fs_id == ref->fs_id);
384}
385
386/*
387 * Front-end to lookup the inode-cache maintained by the VFS using the ORANGEFS
388 * file handle.
389 *
390 * @sb: the file system super block instance.
391 * @ref: The ORANGEFS object for which we are trying to locate an inode structure.
392 */
393struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref *ref)
394{
395 struct inode *inode = NULL;
396 unsigned long hash;
397 int error;
398
399 hash = orangefs_handle_hash(ref);
400 inode = iget5_locked(sb, hash, orangefs_test_inode, orangefs_set_inode, ref);
401 if (!inode || !(inode->i_state & I_NEW))
402 return inode;
403
404 error = orangefs_inode_getattr(inode, 1, 0);
405 if (error) {
406 iget_failed(inode);
407 return ERR_PTR(error);
408 }
409
410 inode->i_ino = hash; /* needed for stat etc */
411 orangefs_init_iops(inode);
412 unlock_new_inode(inode);
413
414 gossip_debug(GOSSIP_INODE_DEBUG,
415 "iget handle %pU, fsid %d hash %ld i_ino %lu\n",
416 &ref->khandle,
417 ref->fs_id,
418 hash,
419 inode->i_ino);
420
421 return inode;
422}
423
424/*
425 * Allocate an inode for a newly created file and insert it into the inode hash.
426 */
427struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
428 int mode, dev_t dev, struct orangefs_object_kref *ref)
429{
430 unsigned long hash = orangefs_handle_hash(ref);
431 struct inode *inode;
432 int error;
433
434 gossip_debug(GOSSIP_INODE_DEBUG,
435 "%s:(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n",
436 __func__,
437 sb,
438 MAJOR(dev),
439 MINOR(dev),
440 mode);
441
442 inode = new_inode(sb);
443 if (!inode)
444 return NULL;
445
446 orangefs_set_inode(inode, ref);
447 inode->i_ino = hash; /* needed for stat etc */
448
449 error = orangefs_inode_getattr(inode, 1, 0);
450 if (error)
451 goto out_iput;
452
453 orangefs_init_iops(inode);
454
455 inode->i_mode = mode;
456 inode->i_uid = current_fsuid();
457 inode->i_gid = current_fsgid();
458 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
459 inode->i_size = PAGE_CACHE_SIZE;
460 inode->i_rdev = dev;
461
462 error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
463 if (error < 0)
464 goto out_iput;
465
466 gossip_debug(GOSSIP_INODE_DEBUG,
467 "Initializing ACL's for inode %pU\n",
468 get_khandle_from_ino(inode));
469 orangefs_init_acl(inode, dir);
470 return inode;
471
472out_iput:
473 iput(inode);
474 return ERR_PTR(error);
475}
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
new file mode 100644
index 000000000000..5a60c508af4e
--- /dev/null
+++ b/fs/orangefs/namei.c
@@ -0,0 +1,462 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS namei operations.
9 */
10
11#include "protocol.h"
12#include "orangefs-kernel.h"
13
14/*
15 * Get a newly allocated inode to go with a negative dentry.
16 */
17static int orangefs_create(struct inode *dir,
18 struct dentry *dentry,
19 umode_t mode,
20 bool exclusive)
21{
22 struct orangefs_inode_s *parent = ORANGEFS_I(dir);
23 struct orangefs_kernel_op_s *new_op;
24 struct inode *inode;
25 int ret;
26
27 gossip_debug(GOSSIP_NAME_DEBUG, "%s: %s\n",
28 __func__,
29 dentry->d_name.name);
30
31 new_op = op_alloc(ORANGEFS_VFS_OP_CREATE);
32 if (!new_op)
33 return -ENOMEM;
34
35 new_op->upcall.req.create.parent_refn = parent->refn;
36
37 fill_default_sys_attrs(new_op->upcall.req.create.attributes,
38 ORANGEFS_TYPE_METAFILE, mode);
39
40 strncpy(new_op->upcall.req.create.d_name,
41 dentry->d_name.name, ORANGEFS_NAME_MAX);
42
43 ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
44
45 gossip_debug(GOSSIP_NAME_DEBUG,
46 "%s: %s: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n",
47 __func__,
48 dentry->d_name.name,
49 &new_op->downcall.resp.create.refn.khandle,
50 new_op->downcall.resp.create.refn.fs_id,
51 new_op,
52 ret);
53
54 if (ret < 0)
55 goto out;
56
57 inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0,
58 &new_op->downcall.resp.create.refn);
59 if (IS_ERR(inode)) {
60 gossip_err("%s: Failed to allocate inode for file :%s:\n",
61 __func__,
62 dentry->d_name.name);
63 ret = PTR_ERR(inode);
64 goto out;
65 }
66
67 gossip_debug(GOSSIP_NAME_DEBUG,
68 "%s: Assigned inode :%pU: for file :%s:\n",
69 __func__,
70 get_khandle_from_ino(inode),
71 dentry->d_name.name);
72
73 d_instantiate(dentry, inode);
74 unlock_new_inode(inode);
75
76 gossip_debug(GOSSIP_NAME_DEBUG,
77 "%s: dentry instantiated for %s\n",
78 __func__,
79 dentry->d_name.name);
80
81 SetMtimeFlag(parent);
82 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
83 mark_inode_dirty_sync(dir);
84 ret = 0;
85out:
86 op_release(new_op);
87 gossip_debug(GOSSIP_NAME_DEBUG,
88 "%s: %s: returning %d\n",
89 __func__,
90 dentry->d_name.name,
91 ret);
92 return ret;
93}
94
95/*
96 * Attempt to resolve an object name (dentry->d_name), parent handle, and
97 * fsid into a handle for the object.
98 */
99static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
100 unsigned int flags)
101{
102 struct orangefs_inode_s *parent = ORANGEFS_I(dir);
103 struct orangefs_kernel_op_s *new_op;
104 struct inode *inode;
105 struct dentry *res;
106 int ret = -EINVAL;
107
108 /*
109 * in theory we could skip a lookup here (if the intent is to
110 * create) in order to avoid a potentially failed lookup, but
111 * leaving it in can skip a valid lookup and try to create a file
112 * that already exists (e.g. the vfs already handles checking for
113 * -EEXIST on O_EXCL opens, which is broken if we skip this lookup
114 * in the create path)
115 */
116 gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n",
117 __func__, dentry->d_name.name);
118
119 if (dentry->d_name.len > (ORANGEFS_NAME_MAX - 1))
120 return ERR_PTR(-ENAMETOOLONG);
121
122 new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
123 if (!new_op)
124 return ERR_PTR(-ENOMEM);
125
126 new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
127
128 gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d using parent %pU\n",
129 __FILE__,
130 __func__,
131 __LINE__,
132 &parent->refn.khandle);
133 new_op->upcall.req.lookup.parent_refn = parent->refn;
134
135 strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name,
136 ORANGEFS_NAME_MAX);
137
138 gossip_debug(GOSSIP_NAME_DEBUG,
139 "%s: doing lookup on %s under %pU,%d\n",
140 __func__,
141 new_op->upcall.req.lookup.d_name,
142 &new_op->upcall.req.lookup.parent_refn.khandle,
143 new_op->upcall.req.lookup.parent_refn.fs_id);
144
145 ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
146
147 gossip_debug(GOSSIP_NAME_DEBUG,
148 "Lookup Got %pU, fsid %d (ret=%d)\n",
149 &new_op->downcall.resp.lookup.refn.khandle,
150 new_op->downcall.resp.lookup.refn.fs_id,
151 ret);
152
153 if (ret < 0) {
154 if (ret == -ENOENT) {
155 /*
156 * if no inode was found, add a negative dentry to
157 * dcache anyway; if we don't, we don't hold expected
158 * lookup semantics and we most noticeably break
159 * during directory renames.
160 *
161 * however, if the operation failed or exited, do not
162 * add the dentry (e.g. in the case that a touch is
163 * issued on a file that already exists that was
164 * interrupted during this lookup -- no need to add
165 * another negative dentry for an existing file)
166 */
167
168 gossip_debug(GOSSIP_NAME_DEBUG,
169 "orangefs_lookup: Adding *negative* dentry "
170 "%p for %s\n",
171 dentry,
172 dentry->d_name.name);
173
174 d_add(dentry, NULL);
175 res = NULL;
176 goto out;
177 }
178
179 /* must be a non-recoverable error */
180 res = ERR_PTR(ret);
181 goto out;
182 }
183
184 inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
185 if (IS_ERR(inode)) {
186 gossip_debug(GOSSIP_NAME_DEBUG,
187 "error %ld from iget\n", PTR_ERR(inode));
188 res = ERR_CAST(inode);
189 goto out;
190 }
191
192 gossip_debug(GOSSIP_NAME_DEBUG,
193 "%s:%s:%d "
194 "Found good inode [%lu] with count [%d]\n",
195 __FILE__,
196 __func__,
197 __LINE__,
198 inode->i_ino,
199 (int)atomic_read(&inode->i_count));
200
201 /* update dentry/inode pair into dcache */
202 res = d_splice_alias(inode, dentry);
203
204 gossip_debug(GOSSIP_NAME_DEBUG,
205 "Lookup success (inode ct = %d)\n",
206 (int)atomic_read(&inode->i_count));
207out:
208 op_release(new_op);
209 return res;
210}
211
212/* return 0 on success; non-zero otherwise */
213static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
214{
215 struct inode *inode = dentry->d_inode;
216 struct orangefs_inode_s *parent = ORANGEFS_I(dir);
217 struct orangefs_kernel_op_s *new_op;
218 int ret;
219
220 gossip_debug(GOSSIP_NAME_DEBUG,
221 "%s: called on %s\n"
222 " (inode %pU): Parent is %pU | fs_id %d\n",
223 __func__,
224 dentry->d_name.name,
225 get_khandle_from_ino(inode),
226 &parent->refn.khandle,
227 parent->refn.fs_id);
228
229 new_op = op_alloc(ORANGEFS_VFS_OP_REMOVE);
230 if (!new_op)
231 return -ENOMEM;
232
233 new_op->upcall.req.remove.parent_refn = parent->refn;
234 strncpy(new_op->upcall.req.remove.d_name, dentry->d_name.name,
235 ORANGEFS_NAME_MAX);
236
237 ret = service_operation(new_op, "orangefs_unlink",
238 get_interruptible_flag(inode));
239
240 gossip_debug(GOSSIP_NAME_DEBUG,
241 "%s: service_operation returned:%d:\n",
242 __func__,
243 ret);
244
245 op_release(new_op);
246
247 if (!ret) {
248 drop_nlink(inode);
249
250 SetMtimeFlag(parent);
251 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
252 mark_inode_dirty_sync(dir);
253 }
254 return ret;
255}
256
257static int orangefs_symlink(struct inode *dir,
258 struct dentry *dentry,
259 const char *symname)
260{
261 struct orangefs_inode_s *parent = ORANGEFS_I(dir);
262 struct orangefs_kernel_op_s *new_op;
263 struct inode *inode;
264 int mode = 0755;
265 int ret;
266
267 gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
268
269 if (!symname)
270 return -EINVAL;
271
272 if (strlen(symname)+1 > ORANGEFS_NAME_MAX)
273 return -ENAMETOOLONG;
274
275 new_op = op_alloc(ORANGEFS_VFS_OP_SYMLINK);
276 if (!new_op)
277 return -ENOMEM;
278
279 new_op->upcall.req.sym.parent_refn = parent->refn;
280
281 fill_default_sys_attrs(new_op->upcall.req.sym.attributes,
282 ORANGEFS_TYPE_SYMLINK,
283 mode);
284
285 strncpy(new_op->upcall.req.sym.entry_name,
286 dentry->d_name.name,
287 ORANGEFS_NAME_MAX);
288 strncpy(new_op->upcall.req.sym.target, symname, ORANGEFS_NAME_MAX);
289
290 ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
291
292 gossip_debug(GOSSIP_NAME_DEBUG,
293 "Symlink Got ORANGEFS handle %pU on fsid %d (ret=%d)\n",
294 &new_op->downcall.resp.sym.refn.khandle,
295 new_op->downcall.resp.sym.refn.fs_id, ret);
296
297 if (ret < 0) {
298 gossip_debug(GOSSIP_NAME_DEBUG,
299 "%s: failed with error code %d\n",
300 __func__, ret);
301 goto out;
302 }
303
304 inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0,
305 &new_op->downcall.resp.sym.refn);
306 if (IS_ERR(inode)) {
307 gossip_err("*** Failed to allocate orangefs symlink inode\n");
309 ret = PTR_ERR(inode);
310 goto out;
311 }
312
313 gossip_debug(GOSSIP_NAME_DEBUG,
314 "Assigned symlink inode new number of %pU\n",
315 get_khandle_from_ino(inode));
316
317 d_instantiate(dentry, inode);
318 unlock_new_inode(inode);
319
320 gossip_debug(GOSSIP_NAME_DEBUG,
321 "Inode (Symlink) %pU -> %s\n",
322 get_khandle_from_ino(inode),
323 dentry->d_name.name);
324
325 SetMtimeFlag(parent);
326 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
327 mark_inode_dirty_sync(dir);
328 ret = 0;
329out:
330 op_release(new_op);
331 return ret;
332}
333
334static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
335{
336 struct orangefs_inode_s *parent = ORANGEFS_I(dir);
337 struct orangefs_kernel_op_s *new_op;
338 struct inode *inode;
339 int ret;
340
341 new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR);
342 if (!new_op)
343 return -ENOMEM;
344
345 new_op->upcall.req.mkdir.parent_refn = parent->refn;
346
347 fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes,
348 ORANGEFS_TYPE_DIRECTORY, mode);
349
350 strncpy(new_op->upcall.req.mkdir.d_name,
351 dentry->d_name.name, ORANGEFS_NAME_MAX);
352
353 ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
354
355 gossip_debug(GOSSIP_NAME_DEBUG,
356 "Mkdir Got ORANGEFS handle %pU on fsid %d\n",
357 &new_op->downcall.resp.mkdir.refn.khandle,
358 new_op->downcall.resp.mkdir.refn.fs_id);
359
360 if (ret < 0) {
361 gossip_debug(GOSSIP_NAME_DEBUG,
362 "%s: failed with error code %d\n",
363 __func__, ret);
364 goto out;
365 }
366
367 inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0,
368 &new_op->downcall.resp.mkdir.refn);
369 if (IS_ERR(inode)) {
370 gossip_err("*** Failed to allocate orangefs dir inode\n");
371 ret = PTR_ERR(inode);
372 goto out;
373 }
374
375 gossip_debug(GOSSIP_NAME_DEBUG,
376 "Assigned dir inode new number of %pU\n",
377 get_khandle_from_ino(inode));
378
379 d_instantiate(dentry, inode);
380 unlock_new_inode(inode);
381
382 gossip_debug(GOSSIP_NAME_DEBUG,
383 "Inode (Directory) %pU -> %s\n",
384 get_khandle_from_ino(inode),
385 dentry->d_name.name);
386
387 /*
388 * NOTE: we have no good way to keep nlink consistent for directories
389 * across clients; keep constant at 1.
390 */
391 SetMtimeFlag(parent);
392 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
393 mark_inode_dirty_sync(dir);
394out:
395 op_release(new_op);
396 return ret;
397}
398
399static int orangefs_rename(struct inode *old_dir,
400 struct dentry *old_dentry,
401 struct inode *new_dir,
402 struct dentry *new_dentry)
403{
404 struct orangefs_kernel_op_s *new_op;
405 int ret;
406
407 gossip_debug(GOSSIP_NAME_DEBUG,
408 "orangefs_rename: called (%s/%s => %s/%s) ct=%d\n",
409 old_dentry->d_parent->d_name.name,
410 old_dentry->d_name.name,
411 new_dentry->d_parent->d_name.name,
412 new_dentry->d_name.name,
413 d_count(new_dentry));
414
415 new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
416 if (!new_op)
417 return -ENOMEM;
418
419 new_op->upcall.req.rename.old_parent_refn = ORANGEFS_I(old_dir)->refn;
420 new_op->upcall.req.rename.new_parent_refn = ORANGEFS_I(new_dir)->refn;
421
422 strncpy(new_op->upcall.req.rename.d_old_name,
423 old_dentry->d_name.name,
424 ORANGEFS_NAME_MAX);
425 strncpy(new_op->upcall.req.rename.d_new_name,
426 new_dentry->d_name.name,
427 ORANGEFS_NAME_MAX);
428
429 ret = service_operation(new_op,
430 "orangefs_rename",
431 get_interruptible_flag(old_dentry->d_inode));
432
433 gossip_debug(GOSSIP_NAME_DEBUG,
434 "orangefs_rename: got downcall status %d\n",
435 ret);
436
437 if (new_dentry->d_inode)
438 new_dentry->d_inode->i_ctime = CURRENT_TIME;
439
440 op_release(new_op);
441 return ret;
442}
443
444/* ORANGEFS implementation of VFS inode operations for directories */
445struct inode_operations orangefs_dir_inode_operations = {
446 .lookup = orangefs_lookup,
447 .get_acl = orangefs_get_acl,
448 .set_acl = orangefs_set_acl,
449 .create = orangefs_create,
450 .unlink = orangefs_unlink,
451 .symlink = orangefs_symlink,
452 .mkdir = orangefs_mkdir,
453 .rmdir = orangefs_unlink,
454 .rename = orangefs_rename,
455 .setattr = orangefs_setattr,
456 .getattr = orangefs_getattr,
457 .setxattr = generic_setxattr,
458 .getxattr = generic_getxattr,
459 .removexattr = generic_removexattr,
460 .listxattr = orangefs_listxattr,
461 .permission = orangefs_permission,
462};
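/*
 * Note: .rmdir deliberately reuses orangefs_unlink; presumably the
 * OP_REMOVE upcall covers both files and directories on the server
 * side, so no separate rmdir handler is needed.
 */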
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
new file mode 100644
index 000000000000..1f8acc9f9a88
--- /dev/null
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -0,0 +1,556 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6#include "protocol.h"
7#include "orangefs-kernel.h"
8#include "orangefs-bufmap.h"
9
10struct slot_map {
11 int c;
12 wait_queue_head_t q;
13 int count;
14 unsigned long *map;
15};
16
17static struct slot_map rw_map = {
18 .c = -1,
19 .q = __WAIT_QUEUE_HEAD_INITIALIZER(rw_map.q)
20};
21static struct slot_map readdir_map = {
22 .c = -1,
23 .q = __WAIT_QUEUE_HEAD_INITIALIZER(readdir_map.q)
24};
25
26
27static void install(struct slot_map *m, int count, unsigned long *map)
28{
29 spin_lock(&m->q.lock);
30 m->c = m->count = count;
31 m->map = map;
32 wake_up_all_locked(&m->q);
33 spin_unlock(&m->q.lock);
34}
35
36static void mark_killed(struct slot_map *m)
37{
38 spin_lock(&m->q.lock);
39 m->c -= m->count + 1;
40 spin_unlock(&m->q.lock);
41}
42
43static void run_down(struct slot_map *m)
44{
45 DEFINE_WAIT(wait);
46 spin_lock(&m->q.lock);
47 if (m->c != -1) {
48 for (;;) {
49 if (likely(list_empty(&wait.task_list)))
50 __add_wait_queue_tail(&m->q, &wait);
51 set_current_state(TASK_UNINTERRUPTIBLE);
52
53 if (m->c == -1)
54 break;
55
56 spin_unlock(&m->q.lock);
57 schedule();
58 spin_lock(&m->q.lock);
59 }
60 __remove_wait_queue(&m->q, &wait);
61 __set_current_state(TASK_RUNNING);
62 }
63 m->map = NULL;
64 spin_unlock(&m->q.lock);
65}
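/*
 * Note on the counter convention used above: while a map is installed,
 * c is the number of free slots; c == -1 means no map is installed.
 * mark_killed() biases c by -(count + 1), so each put() below
 * increments it back and c reaches -1 again exactly when the last
 * outstanding slot is returned -- the condition run_down() waits for.
 */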
66
67static void put(struct slot_map *m, int slot)
68{
69 int v;
70 spin_lock(&m->q.lock);
71 __clear_bit(slot, m->map);
72 v = ++m->c;
73 if (unlikely(v == 1)) /* no free slots -> one free slot */
74 wake_up_locked(&m->q);
75 else if (unlikely(v == -1)) /* finished dying */
76 wake_up_all_locked(&m->q);
77 spin_unlock(&m->q.lock);
78}
79
80static int wait_for_free(struct slot_map *m)
81{
82 long left = slot_timeout_secs * HZ;
83 DEFINE_WAIT(wait);
84
85 do {
86 long n = left, t;
87 if (likely(list_empty(&wait.task_list)))
88 __add_wait_queue_tail_exclusive(&m->q, &wait);
89 set_current_state(TASK_INTERRUPTIBLE);
90
91 if (m->c > 0)
92 break;
93
94 if (m->c < 0) {
95 /* we are waiting for the map to be installed; */
96 /* it had better be there soon, or we give up */
97 if (n > ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ)
98 n = ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ;
99 }
100 spin_unlock(&m->q.lock);
101 t = schedule_timeout(n);
102 spin_lock(&m->q.lock);
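			/*
			 * If the shortened wait for a map install
			 * (n != left) timed out and there is still no
			 * map, give up; otherwise charge only the time
			 * actually slept against the overall budget.
			 */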
103 if (unlikely(!t) && n != left && m->c < 0)
104 left = t;
105 else
106 left = t + (left - n);
107 if (unlikely(signal_pending(current)))
108 left = -EINTR;
109 } while (left > 0);
110
111 if (!list_empty(&wait.task_list))
112 list_del(&wait.task_list);
113 else if (left <= 0 && waitqueue_active(&m->q))
114 __wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL);
115 __set_current_state(TASK_RUNNING);
116
117 if (likely(left > 0))
118 return 0;
119
120 return left < 0 ? -EINTR : -ETIMEDOUT;
121}
122
123static int get(struct slot_map *m)
124{
125 int res = 0;
126 spin_lock(&m->q.lock);
127 if (unlikely(m->c <= 0))
128 res = wait_for_free(m);
129 if (likely(!res)) {
130 m->c--;
131 res = find_first_zero_bit(m->map, m->count);
132 __set_bit(res, m->map);
133 }
134 spin_unlock(&m->q.lock);
135 return res;
136}
137
138/* used to describe mapped buffers */
139struct orangefs_bufmap_desc {
140 void *uaddr; /* user space address pointer */
141 struct page **page_array; /* array of mapped pages */
142 int array_count; /* size of above arrays */
143 struct list_head list_link;
144};
145
146static struct orangefs_bufmap {
147 int desc_size;
148 int desc_shift;
149 int desc_count;
150 int total_size;
151 int page_count;
152
153 struct page **page_array;
154 struct orangefs_bufmap_desc *desc_array;
155
156 /* array to track usage of buffer descriptors */
157 unsigned long *buffer_index_array;
158
159 /* array to track usage of buffer descriptors for readdir */
160#define N DIV_ROUND_UP(ORANGEFS_READDIR_DEFAULT_DESC_COUNT, BITS_PER_LONG)
161 unsigned long readdir_index_array[N];
162#undef N
163} *__orangefs_bufmap;
164
165static DEFINE_SPINLOCK(orangefs_bufmap_lock);
166
167static void
168orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap)
169{
170 int i;
171
172 for (i = 0; i < bufmap->page_count; i++)
173 page_cache_release(bufmap->page_array[i]);
174}
175
176static void
177orangefs_bufmap_free(struct orangefs_bufmap *bufmap)
178{
179 kfree(bufmap->page_array);
180 kfree(bufmap->desc_array);
181 kfree(bufmap->buffer_index_array);
182 kfree(bufmap);
183}
184
185/*
186 * XXX: Can the size and shift change while the caller gives up the
187 * XXX: lock between calling this and doing something useful?
188 */
189
190int orangefs_bufmap_size_query(void)
191{
192 struct orangefs_bufmap *bufmap;
193 int size = 0;
194 spin_lock(&orangefs_bufmap_lock);
195 bufmap = __orangefs_bufmap;
196 if (bufmap)
197 size = bufmap->desc_size;
198 spin_unlock(&orangefs_bufmap_lock);
199 return size;
200}
201
202int orangefs_bufmap_shift_query(void)
203{
204 struct orangefs_bufmap *bufmap;
205 int shift = 0;
206 spin_lock(&orangefs_bufmap_lock);
207 bufmap = __orangefs_bufmap;
208 if (bufmap)
209 shift = bufmap->desc_shift;
210 spin_unlock(&orangefs_bufmap_lock);
211 return shift;
212}
213
214static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
215static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);
216
217/*
218 * orangefs_get_bufmap_init
219 *
220 * If the bufmap has been installed, the shared memory system,
221 * including the buffer_index_array, is available. Otherwise, it is not.
222 *
223 * returns 1 if the bufmap is installed, 0 otherwise
224 */
225int orangefs_get_bufmap_init(void)
226{
227 return __orangefs_bufmap ? 1 : 0;
228}
229
230
231static struct orangefs_bufmap *
232orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
233{
234 struct orangefs_bufmap *bufmap;
235
236 bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL);
237 if (!bufmap)
238 goto out;
239
240 bufmap->total_size = user_desc->total_size;
241 bufmap->desc_count = user_desc->count;
242 bufmap->desc_size = user_desc->size;
243 bufmap->desc_shift = ilog2(bufmap->desc_size);
244
245 bufmap->buffer_index_array =
246 kcalloc(BITS_TO_LONGS(bufmap->desc_count), sizeof(long), GFP_KERNEL);
247 if (!bufmap->buffer_index_array) {
248 gossip_err("orangefs: could not allocate %d buffer indices\n",
249 bufmap->desc_count);
250 goto out_free_bufmap;
251 }
252
253 bufmap->desc_array =
254 kcalloc(bufmap->desc_count, sizeof(struct orangefs_bufmap_desc),
255 GFP_KERNEL);
256 if (!bufmap->desc_array) {
257 gossip_err("orangefs: could not allocate %d descriptors\n",
258 bufmap->desc_count);
259 goto out_free_index_array;
260 }
261
262 bufmap->page_count = bufmap->total_size / PAGE_SIZE;
263
264 /* allocate storage to track our page mappings */
265 bufmap->page_array =
266 kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL);
267 if (!bufmap->page_array)
268 goto out_free_desc_array;
269
270 return bufmap;
271
272out_free_desc_array:
273 kfree(bufmap->desc_array);
274out_free_index_array:
275 kfree(bufmap->buffer_index_array);
276out_free_bufmap:
277 kfree(bufmap);
278out:
279 return NULL;
280}
281
282static int
283orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
284 struct ORANGEFS_dev_map_desc *user_desc)
285{
286 int pages_per_desc = bufmap->desc_size / PAGE_SIZE;
287 int offset = 0, ret, i;
288
289 /* map the pages */
290 ret = get_user_pages_fast((unsigned long)user_desc->ptr,
291 bufmap->page_count, 1, bufmap->page_array);
292
293 if (ret < 0)
294 return ret;
295
296 if (ret != bufmap->page_count) {
297 gossip_err("orangefs error: asked for %d pages, only got %d.\n",
298 bufmap->page_count, ret);
299
300 for (i = 0; i < ret; i++) {
301 SetPageError(bufmap->page_array[i]);
302 page_cache_release(bufmap->page_array[i]);
303 }
304 return -ENOMEM;
305 }
306
307 /*
308 * ideally we want to get kernel space pointers for each page, but
309 * we can't kmap that many pages at once if highmem is being used.
310 * so instead, we just kmap/kunmap the page address each time the
311 * kaddr is needed.
312 */
313 for (i = 0; i < bufmap->page_count; i++)
314 flush_dcache_page(bufmap->page_array[i]);
315
316 /* build a list of available descriptors */
317 for (offset = 0, i = 0; i < bufmap->desc_count; i++) {
318 bufmap->desc_array[i].page_array = &bufmap->page_array[offset];
319 bufmap->desc_array[i].array_count = pages_per_desc;
320 bufmap->desc_array[i].uaddr =
321 (user_desc->ptr + (i * pages_per_desc * PAGE_SIZE));
322 offset += pages_per_desc;
323 }
324
325 return 0;
326}
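/*
 * Worked example of the slicing above (illustrative only): with 4 KB
 * pages, desc_size = 64 KB and desc_count = 16, pages_per_desc is 16,
 * descriptor i covers page_array[16 * i .. 16 * i + 15], and its uaddr
 * is user_desc->ptr + i * 64 KB.
 */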
327
328/*
329 * orangefs_bufmap_initialize()
330 *
331 * initializes the mapped buffer interface
332 *
333 * returns 0 on success, -errno on failure
334 */
335int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc)
336{
337 struct orangefs_bufmap *bufmap;
338 int ret = -EINVAL;
339
340 gossip_debug(GOSSIP_BUFMAP_DEBUG,
341 "orangefs_bufmap_initialize: called (ptr ("
342 "%p) sz (%d) cnt(%d).\n",
343 user_desc->ptr,
344 user_desc->size,
345 user_desc->count);
346
347 /*
348 * sanity check alignment and size of buffer that caller wants to
349 * work with
350 */
351 if (PAGE_ALIGN((unsigned long)user_desc->ptr) !=
352 (unsigned long)user_desc->ptr) {
353 gossip_err("orangefs error: memory alignment (front). %p\n",
354 user_desc->ptr);
355 goto out;
356 }
357
358 if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size))
359 != (unsigned long)(user_desc->ptr + user_desc->total_size)) {
360 gossip_err("orangefs error: memory alignment (back).(%p + %d)\n",
361 user_desc->ptr,
362 user_desc->total_size);
363 goto out;
364 }
365
366 if (user_desc->total_size != (user_desc->size * user_desc->count)) {
367 gossip_err("orangefs error: user provided an oddly sized buffer: (%d, %d, %d)\n",
368 user_desc->total_size,
369 user_desc->size,
370 user_desc->count);
371 goto out;
372 }
373
374 if ((user_desc->size % PAGE_SIZE) != 0) {
375 gossip_err("orangefs error: bufmap size not page size divisible (%d).\n",
376 user_desc->size);
377 goto out;
378 }
379
380 ret = -ENOMEM;
381 bufmap = orangefs_bufmap_alloc(user_desc);
382 if (!bufmap)
383 goto out;
384
385 ret = orangefs_bufmap_map(bufmap, user_desc);
386 if (ret)
387 goto out_free_bufmap;
388
389
390 spin_lock(&orangefs_bufmap_lock);
391 if (__orangefs_bufmap) {
392 spin_unlock(&orangefs_bufmap_lock);
393 gossip_err("orangefs: error: bufmap already initialized.\n");
394 ret = -EINVAL;
395 goto out_unmap_bufmap;
396 }
397 __orangefs_bufmap = bufmap;
398 install(&rw_map,
399 bufmap->desc_count,
400 bufmap->buffer_index_array);
401 install(&readdir_map,
402 ORANGEFS_READDIR_DEFAULT_DESC_COUNT,
403 bufmap->readdir_index_array);
404 spin_unlock(&orangefs_bufmap_lock);
405
406 gossip_debug(GOSSIP_BUFMAP_DEBUG,
407 "orangefs_bufmap_initialize: exiting normally\n");
408 return 0;
409
410out_unmap_bufmap:
411 orangefs_bufmap_unmap(bufmap);
412out_free_bufmap:
413 orangefs_bufmap_free(bufmap);
414out:
415 return ret;
416}
417
418/*
419 * orangefs_bufmap_finalize()
420 *
421 * shuts down the mapped buffer interface and releases any resources
422 * associated with it
423 *
424 * no return value
425 */
426void orangefs_bufmap_finalize(void)
427{
428 struct orangefs_bufmap *bufmap = __orangefs_bufmap;
429 if (!bufmap)
430 return;
431 gossip_debug(GOSSIP_BUFMAP_DEBUG, "orangefs_bufmap_finalize: called\n");
432 mark_killed(&rw_map);
433 mark_killed(&readdir_map);
434 gossip_debug(GOSSIP_BUFMAP_DEBUG,
435 "orangefs_bufmap_finalize: exiting normally\n");
436}
437
438void orangefs_bufmap_run_down(void)
439{
440 struct orangefs_bufmap *bufmap = __orangefs_bufmap;
441 if (!bufmap)
442 return;
443 run_down(&rw_map);
444 run_down(&readdir_map);
445 spin_lock(&orangefs_bufmap_lock);
446 __orangefs_bufmap = NULL;
447 spin_unlock(&orangefs_bufmap_lock);
448 orangefs_bufmap_unmap(bufmap);
449 orangefs_bufmap_free(bufmap);
450}
451
452/*
453 * orangefs_bufmap_get()
454 *
455 * gets a free mapped buffer descriptor, will sleep until one becomes
456 * available if necessary
457 *
458 * returns slot on success, -errno on failure
459 */
460int orangefs_bufmap_get(void)
461{
462 return get(&rw_map);
463}
464
465/*
466 * orangefs_bufmap_put()
467 *
468 * returns a mapped buffer descriptor to the collection
469 *
470 * no return value
471 */
472void orangefs_bufmap_put(int buffer_index)
473{
474 put(&rw_map, buffer_index);
475}
476
477/*
478 * orangefs_readdir_index_get()
479 *
480 * gets a free descriptor, will sleep until one becomes
481 * available if necessary.
482 * Although the readdir buffers are not mapped into kernel space
483 * we could do that at a later point of time. Regardless, these
484 * indices are used by the client-core.
485 *
486 * returns slot on success, -errno on failure
487 */
488int orangefs_readdir_index_get(void)
489{
490 return get(&readdir_map);
491}
492
493void orangefs_readdir_index_put(int buffer_index)
494{
495 put(&readdir_map, buffer_index);
496}
497
498/*
499 * we've been handed an iovec, we need to copy it to
500 * the shared memory descriptor at "buffer_index".
501 */
502int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
503 int buffer_index,
504 size_t size)
505{
506 struct orangefs_bufmap_desc *to;
507 int i;
508
509 gossip_debug(GOSSIP_BUFMAP_DEBUG,
510 "%s: buffer_index:%d: size:%zu:\n",
511 __func__, buffer_index, size);
512
513 to = &__orangefs_bufmap->desc_array[buffer_index];
514 for (i = 0; size; i++) {
515 struct page *page = to->page_array[i];
516 size_t n = size;
517 if (n > PAGE_SIZE)
518 n = PAGE_SIZE;
519 n = copy_page_from_iter(page, 0, n, iter);
520 if (!n)
521 return -EFAULT;
522 size -= n;
523 }
524 return 0;
525
526}
527
528/*
529 * we've been handed an iovec, we need to fill it from
530 * the shared memory descriptor at "buffer_index".
531 */
532int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
533 int buffer_index,
534 size_t size)
535{
536 struct orangefs_bufmap_desc *from;
537 int i;
538
539 from = &__orangefs_bufmap->desc_array[buffer_index];
540 gossip_debug(GOSSIP_BUFMAP_DEBUG,
541 "%s: buffer_index:%d: size:%zu:\n",
542 __func__, buffer_index, size);
543
544
545 for (i = 0; size; i++) {
546 struct page *page = from->page_array[i];
547 size_t n = size;
548 if (n > PAGE_SIZE)
549 n = PAGE_SIZE;
550 n = copy_page_to_iter(page, 0, n, iter);
551 if (!n)
552 return -EFAULT;
553 size -= n;
554 }
555 return 0;
556}
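/*
 * Illustrative sketch, not part of this patch: how a write path might
 * combine the helpers above. The upcall to the client-core is elided
 * and error handling is minimal; example_stage_write() is a
 * hypothetical name.
 */
static int example_stage_write(struct iov_iter *iter, size_t size)
{
	int ret;
	int slot = orangefs_bufmap_get();	/* may sleep for a free slot */

	if (slot < 0)
		return slot;			/* -EINTR or -ETIMEDOUT */
	ret = orangefs_bufmap_copy_from_iovec(iter, slot, size);
	/* ... issue the ORANGEFS_VFS_OP_FILE_IO upcall using "slot" ... */
	orangefs_bufmap_put(slot);
	return ret;
}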
diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h
new file mode 100644
index 000000000000..71f64f4057b5
--- /dev/null
+++ b/fs/orangefs/orangefs-bufmap.h
@@ -0,0 +1,36 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#ifndef __ORANGEFS_BUFMAP_H
8#define __ORANGEFS_BUFMAP_H
9
10int orangefs_bufmap_size_query(void);
11
12int orangefs_bufmap_shift_query(void);
13
14int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc);
15
16void orangefs_bufmap_finalize(void);
17
18void orangefs_bufmap_run_down(void);
19
20int orangefs_bufmap_get(void);
21
22void orangefs_bufmap_put(int buffer_index);
23
24int orangefs_readdir_index_get(void);
25
26void orangefs_readdir_index_put(int buffer_index);
27
28int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
29 int buffer_index,
30 size_t size);
31
32int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
33 int buffer_index,
34 size_t size);
35
36#endif /* __ORANGEFS_BUFMAP_H */
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
new file mode 100644
index 000000000000..900a2e38e11b
--- /dev/null
+++ b/fs/orangefs/orangefs-cache.c
@@ -0,0 +1,161 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#include "protocol.h"
8#include "orangefs-kernel.h"
9
10/* tags assigned to kernel upcall operations */
11static __u64 next_tag_value;
12static DEFINE_SPINLOCK(next_tag_value_lock);
13
14/* the orangefs memory caches */
15
16/* a cache for orangefs upcall/downcall operations */
17static struct kmem_cache *op_cache;
18
19int op_cache_initialize(void)
20{
21 op_cache = kmem_cache_create("orangefs_op_cache",
22 sizeof(struct orangefs_kernel_op_s),
23 0,
24 ORANGEFS_CACHE_CREATE_FLAGS,
25 NULL);
26
27 if (!op_cache) {
28 gossip_err("Cannot create orangefs_op_cache\n");
29 return -ENOMEM;
30 }
31
32 /* initialize our atomic tag counter */
33 spin_lock(&next_tag_value_lock);
34 next_tag_value = 100;
35 spin_unlock(&next_tag_value_lock);
36 return 0;
37}
38
39int op_cache_finalize(void)
40{
41 kmem_cache_destroy(op_cache);
42 return 0;
43}
44
45char *get_opname_string(struct orangefs_kernel_op_s *new_op)
46{
47 if (new_op) {
48 __s32 type = new_op->upcall.type;
49
50 if (type == ORANGEFS_VFS_OP_FILE_IO)
51 return "OP_FILE_IO";
52 else if (type == ORANGEFS_VFS_OP_LOOKUP)
53 return "OP_LOOKUP";
54 else if (type == ORANGEFS_VFS_OP_CREATE)
55 return "OP_CREATE";
56 else if (type == ORANGEFS_VFS_OP_GETATTR)
57 return "OP_GETATTR";
58 else if (type == ORANGEFS_VFS_OP_REMOVE)
59 return "OP_REMOVE";
60 else if (type == ORANGEFS_VFS_OP_MKDIR)
61 return "OP_MKDIR";
62 else if (type == ORANGEFS_VFS_OP_READDIR)
63 return "OP_READDIR";
64 else if (type == ORANGEFS_VFS_OP_READDIRPLUS)
65 return "OP_READDIRPLUS";
66 else if (type == ORANGEFS_VFS_OP_SETATTR)
67 return "OP_SETATTR";
68 else if (type == ORANGEFS_VFS_OP_SYMLINK)
69 return "OP_SYMLINK";
70 else if (type == ORANGEFS_VFS_OP_RENAME)
71 return "OP_RENAME";
72 else if (type == ORANGEFS_VFS_OP_STATFS)
73 return "OP_STATFS";
74 else if (type == ORANGEFS_VFS_OP_TRUNCATE)
75 return "OP_TRUNCATE";
76 else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH)
77 return "OP_MMAP_RA_FLUSH";
78 else if (type == ORANGEFS_VFS_OP_FS_MOUNT)
79 return "OP_FS_MOUNT";
80 else if (type == ORANGEFS_VFS_OP_FS_UMOUNT)
81 return "OP_FS_UMOUNT";
82 else if (type == ORANGEFS_VFS_OP_GETXATTR)
83 return "OP_GETXATTR";
84 else if (type == ORANGEFS_VFS_OP_SETXATTR)
85 return "OP_SETXATTR";
86 else if (type == ORANGEFS_VFS_OP_LISTXATTR)
87 return "OP_LISTXATTR";
88 else if (type == ORANGEFS_VFS_OP_REMOVEXATTR)
89 return "OP_REMOVEXATTR";
90 else if (type == ORANGEFS_VFS_OP_PARAM)
91 return "OP_PARAM";
92 else if (type == ORANGEFS_VFS_OP_PERF_COUNT)
93 return "OP_PERF_COUNT";
94 else if (type == ORANGEFS_VFS_OP_CANCEL)
95 return "OP_CANCEL";
96 else if (type == ORANGEFS_VFS_OP_FSYNC)
97 return "OP_FSYNC";
98 else if (type == ORANGEFS_VFS_OP_FSKEY)
99 return "OP_FSKEY";
100 }
101 return "OP_UNKNOWN?";
102}
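/*
 * Illustrative alternative, not part of this patch: the else-if chain
 * above could be table-driven. opname_table/opname_lookup are
 * hypothetical names; only a few entries are shown.
 */
static const struct {
	__s32 type;
	const char *name;
} opname_table[] = {
	{ ORANGEFS_VFS_OP_FILE_IO, "OP_FILE_IO" },
	{ ORANGEFS_VFS_OP_LOOKUP, "OP_LOOKUP" },
	{ ORANGEFS_VFS_OP_CREATE, "OP_CREATE" },
	/* ... remaining ops elided ... */
};

static const char *opname_lookup(__s32 type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(opname_table); i++)
		if (opname_table[i].type == type)
			return opname_table[i].name;
	return "OP_UNKNOWN?";
}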
103
104void orangefs_new_tag(struct orangefs_kernel_op_s *op)
105{
106 spin_lock(&next_tag_value_lock);
107 op->tag = next_tag_value++;
108 if (next_tag_value == 0)
109 next_tag_value = 100;
110 spin_unlock(&next_tag_value_lock);
111}
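/*
 * Note: tags start at 100 (see op_cache_initialize()) and wrap back to
 * 100 on __u64 overflow, so values 0..99 are never handed out.
 */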
112
113struct orangefs_kernel_op_s *op_alloc(__s32 type)
114{
115 struct orangefs_kernel_op_s *new_op = NULL;
116
117 new_op = kmem_cache_zalloc(op_cache, GFP_KERNEL);
118 if (new_op) {
119 INIT_LIST_HEAD(&new_op->list);
120 spin_lock_init(&new_op->lock);
121 init_completion(&new_op->waitq);
122
123 new_op->upcall.type = ORANGEFS_VFS_OP_INVALID;
124 new_op->downcall.type = ORANGEFS_VFS_OP_INVALID;
125 new_op->downcall.status = -1;
126
127 new_op->op_state = OP_VFS_STATE_UNKNOWN;
128
129 /* initialize the op specific tag and upcall credentials */
130 orangefs_new_tag(new_op);
131 new_op->upcall.type = type;
132 new_op->attempts = 0;
133 gossip_debug(GOSSIP_CACHE_DEBUG,
134 "Alloced OP (%p: %llu %s)\n",
135 new_op,
136 llu(new_op->tag),
137 get_opname_string(new_op));
138
139 new_op->upcall.uid = from_kuid(current_user_ns(),
140 current_fsuid());
141
142 new_op->upcall.gid = from_kgid(current_user_ns(),
143 current_fsgid());
144 } else {
145 gossip_err("op_alloc: kmem_cache_zalloc failed!\n");
146 }
147 return new_op;
148}
149
150void op_release(struct orangefs_kernel_op_s *orangefs_op)
151{
152 if (orangefs_op) {
153 gossip_debug(GOSSIP_CACHE_DEBUG,
154 "Releasing OP (%p: %llu)\n",
155 orangefs_op,
156 llu(orangefs_op->tag));
157 kmem_cache_free(op_cache, orangefs_op);
158 } else {
159 gossip_err("NULL pointer in op_release\n");
160 }
161}
diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h
new file mode 100644
index 000000000000..387db17cde2b
--- /dev/null
+++ b/fs/orangefs/orangefs-debug.h
@@ -0,0 +1,92 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/* This file just defines debugging masks to be used with the gossip
8 * logging utility. All debugging masks for ORANGEFS are kept here to make
9 * sure we don't have collisions.
10 */
11
12#ifndef __ORANGEFS_DEBUG_H
13#define __ORANGEFS_DEBUG_H
14
15#ifdef __KERNEL__
16#include <linux/types.h>
17#else
18#include <stdint.h>
19#endif
20
21#define GOSSIP_NO_DEBUG (__u64)0
22
23#define GOSSIP_SUPER_DEBUG ((__u64)1 << 0)
24#define GOSSIP_INODE_DEBUG ((__u64)1 << 1)
25#define GOSSIP_FILE_DEBUG ((__u64)1 << 2)
26#define GOSSIP_DIR_DEBUG ((__u64)1 << 3)
27#define GOSSIP_UTILS_DEBUG ((__u64)1 << 4)
28#define GOSSIP_WAIT_DEBUG ((__u64)1 << 5)
29#define GOSSIP_ACL_DEBUG ((__u64)1 << 6)
30#define GOSSIP_DCACHE_DEBUG ((__u64)1 << 7)
31#define GOSSIP_DEV_DEBUG ((__u64)1 << 8)
32#define GOSSIP_NAME_DEBUG ((__u64)1 << 9)
33#define GOSSIP_BUFMAP_DEBUG ((__u64)1 << 10)
34#define GOSSIP_CACHE_DEBUG ((__u64)1 << 11)
35#define GOSSIP_DEBUGFS_DEBUG ((__u64)1 << 12)
36#define GOSSIP_XATTR_DEBUG ((__u64)1 << 13)
37#define GOSSIP_INIT_DEBUG ((__u64)1 << 14)
38#define GOSSIP_SYSFS_DEBUG ((__u64)1 << 15)
39
40#define GOSSIP_MAX_NR 16
41#define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1)
42
43/* function prototypes */
44__u64 ORANGEFS_kmod_eventlog_to_mask(const char *event_logging);
45__u64 ORANGEFS_debug_eventlog_to_mask(const char *event_logging);
46char *ORANGEFS_debug_mask_to_eventlog(__u64 mask);
47char *ORANGEFS_kmod_mask_to_eventlog(__u64 mask);
48
49/* a private internal type */
50struct __keyword_mask_s {
51 const char *keyword;
52 __u64 mask_val;
53};
54
55/*
56 * Map all kmod keywords to kmod debug masks here. Keep this
57 * structure "packed":
58 *
59 * "all" is always last...
60 *
61 * keyword mask_val index
62 * foo 1 0
63 * bar 2 1
64 * baz 4 2
65 * qux 8 3
66 * . . .
67 */
68static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
69 {"super", GOSSIP_SUPER_DEBUG},
70 {"inode", GOSSIP_INODE_DEBUG},
71 {"file", GOSSIP_FILE_DEBUG},
72 {"dir", GOSSIP_DIR_DEBUG},
73 {"utils", GOSSIP_UTILS_DEBUG},
74 {"wait", GOSSIP_WAIT_DEBUG},
75 {"acl", GOSSIP_ACL_DEBUG},
76 {"dcache", GOSSIP_DCACHE_DEBUG},
77 {"dev", GOSSIP_DEV_DEBUG},
78 {"name", GOSSIP_NAME_DEBUG},
79 {"bufmap", GOSSIP_BUFMAP_DEBUG},
80 {"cache", GOSSIP_CACHE_DEBUG},
81 {"debugfs", GOSSIP_DEBUGFS_DEBUG},
82 {"xattr", GOSSIP_XATTR_DEBUG},
83 {"init", GOSSIP_INIT_DEBUG},
84 {"sysfs", GOSSIP_SYSFS_DEBUG},
85 {"none", GOSSIP_NO_DEBUG},
86 {"all", GOSSIP_MAX_DEBUG}
87};
88
89static const int num_kmod_keyword_mask_map = (int)
90 (sizeof(s_kmod_keyword_mask_map) / sizeof(struct __keyword_mask_s));
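/*
 * Illustrative example, not part of this patch: the helpers declared
 * above map keyword lists to masks, e.g.
 *
 *	__u64 mask = ORANGEFS_kmod_eventlog_to_mask("file,inode");
 *
 * should yield (GOSSIP_FILE_DEBUG | GOSSIP_INODE_DEBUG), assuming the
 * keywords come from s_kmod_keyword_mask_map.
 */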
91
92#endif /* __ORANGEFS_DEBUG_H */
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
new file mode 100644
index 000000000000..19670b8b4053
--- /dev/null
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -0,0 +1,455 @@
1/*
2 * What: /sys/kernel/debug/orangefs/debug-help
3 * Date: June 2015
4 * Contact: Mike Marshall <hubcap@omnibond.com>
5 * Description:
6 * List of client and kernel debug keywords.
7 *
8 *
9 * What: /sys/kernel/debug/orangefs/client-debug
10 * Date: June 2015
11 * Contact: Mike Marshall <hubcap@omnibond.com>
12 * Description:
13 * Debug setting for "the client", the userspace
14 * helper for the kernel module.
15 *
16 *
17 * What: /sys/kernel/debug/orangefs/kernel-debug
18 * Date: June 2015
19 * Contact: Mike Marshall <hubcap@omnibond.com>
20 * Description:
21 * Debug setting for the orangefs kernel module.
22 *
23 * Any of the keywords, or comma-separated lists
24 * of keywords, from debug-help can be catted to
25 * client-debug or kernel-debug.
26 *
27 * "none", "all" and "verbose" are special keywords
28 * for client-debug. Setting client-debug to "all"
29 * is kind of like trying to drink water from a
30 * fire hose, "verbose" triggers most of the same
31 * output except for the constant flow of output
32 * from the main wait loop.
33 *
34 * "none" and "all" are similar settings for kernel-debug
35 * no need for a "verbose".
36 */
37#include <linux/debugfs.h>
38#include <linux/slab.h>
39
40#include <linux/uaccess.h>
41
42#include "orangefs-debugfs.h"
43#include "protocol.h"
44#include "orangefs-kernel.h"
45
46static int orangefs_debug_disabled = 1;
47
48static int orangefs_debug_help_open(struct inode *, struct file *);
49
50const struct file_operations debug_help_fops = {
51 .open = orangefs_debug_help_open,
52 .read = seq_read,
53 .release = seq_release,
54 .llseek = seq_lseek,
55};
56
57static void *help_start(struct seq_file *, loff_t *);
58static void *help_next(struct seq_file *, void *, loff_t *);
59static void help_stop(struct seq_file *, void *);
60static int help_show(struct seq_file *, void *);
61
62static const struct seq_operations help_debug_ops = {
63 .start = help_start,
64 .next = help_next,
65 .stop = help_stop,
66 .show = help_show,
67};
68
69/*
70 * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
71 * ORANGEFS_CLIENT_DEBUG_FILE.
72 */
73static DEFINE_MUTEX(orangefs_debug_lock);
74
75int orangefs_debug_open(struct inode *, struct file *);
76
77static ssize_t orangefs_debug_read(struct file *,
78 char __user *,
79 size_t,
80 loff_t *);
81
82static ssize_t orangefs_debug_write(struct file *,
83 const char __user *,
84 size_t,
85 loff_t *);
86
87static const struct file_operations kernel_debug_fops = {
88 .open = orangefs_debug_open,
89 .read = orangefs_debug_read,
90 .write = orangefs_debug_write,
91 .llseek = generic_file_llseek,
92};
93
94/*
95 * initialize kmod debug operations, create orangefs debugfs dir and
96 * ORANGEFS_KMOD_DEBUG_HELP_FILE.
97 */
98int orangefs_debugfs_init(void)
99{
100
101 int rc = -ENOMEM;
102
103 debug_dir = debugfs_create_dir("orangefs", NULL);
104 if (!debug_dir) {
105 pr_info("%s: debugfs_create_dir failed.\n", __func__);
106 goto out;
107 }
108
109 help_file_dentry = debugfs_create_file(ORANGEFS_KMOD_DEBUG_HELP_FILE,
110 0444,
111 debug_dir,
112 debug_help_string,
113 &debug_help_fops);
114 if (!help_file_dentry) {
115 pr_info("%s: debugfs_create_file failed.\n", __func__);
116 goto out;
117 }
118
119 orangefs_debug_disabled = 0;
120 rc = 0;
121
122out:
123
124 return rc;
125}
126
127void orangefs_debugfs_cleanup(void)
128{
129 if (debug_dir)
130 debugfs_remove_recursive(debug_dir);
131}
132
133/* open ORANGEFS_KMOD_DEBUG_HELP_FILE */
134static int orangefs_debug_help_open(struct inode *inode, struct file *file)
135{
136 int rc = -ENODEV;
137 int ret;
138
139 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
140 "orangefs_debug_help_open: start\n");
141
142 if (orangefs_debug_disabled)
143 goto out;
144
145 ret = seq_open(file, &help_debug_ops);
146 if (ret)
147 goto out;
148
149 ((struct seq_file *)(file->private_data))->private = inode->i_private;
150
151 rc = 0;
152
153out:
154 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
155 "orangefs_debug_help_open: rc:%d:\n",
156 rc);
157 return rc;
158}
159
160/*
161 * I think start always gets called again after stop. Start
162 * needs to return NULL when it is done. The whole "payload"
163 * in this case is a single (long) string, so by the second
164 * time we get to start (pos = 1), we're done.
165 */
166static void *help_start(struct seq_file *m, loff_t *pos)
167{
168 void *payload = NULL;
169
170 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_start: start\n");
171
172 if (*pos == 0)
173 payload = m->private;
174
175 return payload;
176}
177
178static void *help_next(struct seq_file *m, void *v, loff_t *pos)
179{
180 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_next: start\n");
181
182 return NULL;
183}
184
185static void help_stop(struct seq_file *m, void *p)
186{
187 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_stop: start\n");
188}
189
190static int help_show(struct seq_file *m, void *v)
191{
192 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_show: start\n");
193
194 seq_puts(m, v);
195
196 return 0;
197}
198
199/*
200 * initialize the kernel-debug file.
201 */
202int orangefs_kernel_debug_init(void)
203{
204 int rc = -ENOMEM;
205 struct dentry *ret;
206 char *k_buffer = NULL;
207
208 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
209
210 k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
211 if (!k_buffer)
212 goto out;
213
214 if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
215 strcpy(k_buffer, kernel_debug_string);
216 strcat(k_buffer, "\n");
217 } else {
218 strcpy(k_buffer, "none\n");
219 pr_info("%s: overflow 1!\n", __func__);
220 }
221
222 ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
223 0444,
224 debug_dir,
225 k_buffer,
226 &kernel_debug_fops);
227 if (!ret) {
228 pr_info("%s: failed to create %s.\n",
229 __func__,
230 ORANGEFS_KMOD_DEBUG_FILE);
231 goto out;
232 }
233
234 rc = 0;
235
236out:
237
238 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
239 return rc;
240}
241
242/*
243 * initialize the client-debug file.
244 */
245int orangefs_client_debug_init(void)
246{
247
248 int rc = -ENOMEM;
249 char *c_buffer = NULL;
250
251 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
252
253 c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
254 if (!c_buffer)
255 goto out;
256
257 if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
258 strcpy(c_buffer, client_debug_string);
259 strcat(c_buffer, "\n");
260 } else {
261 strcpy(c_buffer, "none\n");
262 pr_info("%s: overflow! 2\n", __func__);
263 }
264
265 client_debug_dentry = debugfs_create_file(ORANGEFS_CLIENT_DEBUG_FILE,
266 0444,
267 debug_dir,
268 c_buffer,
269 &kernel_debug_fops);
270 if (!client_debug_dentry) {
271 pr_info("%s: failed to create updated %s.\n",
272 __func__,
273 ORANGEFS_CLIENT_DEBUG_FILE);
274 goto out;
275 }
276
277 rc = 0;
278
279out:
280
281 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
282 return rc;
283}
284
285/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
286int orangefs_debug_open(struct inode *inode, struct file *file)
287{
288 int rc = -ENODEV;
289
290 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
291 "%s: orangefs_debug_disabled: %d\n",
292 __func__,
293 orangefs_debug_disabled);
294
295 if (orangefs_debug_disabled)
296 goto out;
297
298 rc = 0;
299 mutex_lock(&orangefs_debug_lock);
300 file->private_data = inode->i_private;
301 mutex_unlock(&orangefs_debug_lock);
302
303out:
304 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
305 "orangefs_debug_open: rc: %d\n",
306 rc);
307 return rc;
308}
309
310static ssize_t orangefs_debug_read(struct file *file,
311 char __user *ubuf,
312 size_t count,
313 loff_t *ppos)
314{
315 char *buf;
316 int sprintf_ret;
317 ssize_t read_ret = -ENOMEM;
318
319 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "orangefs_debug_read: start\n");
320
321 buf = kmalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
322 if (!buf)
323 goto out;
324
325 mutex_lock(&orangefs_debug_lock);
326 sprintf_ret = sprintf(buf, "%s", (char *)file->private_data);
327 mutex_unlock(&orangefs_debug_lock);
328
329 read_ret = simple_read_from_buffer(ubuf, count, ppos, buf, sprintf_ret);
330
331 kfree(buf);
332
333out:
334 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
335 "orangefs_debug_read: ret: %zu\n",
336 read_ret);
337
338 return read_ret;
339}
340
341static ssize_t orangefs_debug_write(struct file *file,
342 const char __user *ubuf,
343 size_t count,
344 loff_t *ppos)
345{
346 char *buf;
347 int rc = -EFAULT;
348 size_t silly = 0;
349 char *debug_string;
350 struct orangefs_kernel_op_s *new_op = NULL;
351 struct client_debug_mask c_mask = { NULL, 0, 0 };
352
353 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
354 "orangefs_debug_write: %s\n",
355 file->f_path.dentry->d_name.name);
356
357 /*
358 * Thwart users who try to jam a ridiculous number
359 * of bytes into the debug file...
360 */
361 if (count >= ORANGEFS_MAX_DEBUG_STRING_LEN) {
362 silly = count;
363 count = ORANGEFS_MAX_DEBUG_STRING_LEN; /* keep room for a NUL in buf */
364 }
365
366 buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
367 if (!buf)
368 goto out;
369
370 if (copy_from_user(buf, ubuf, count - 1)) {
371 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
372 "%s: copy_from_user failed!\n",
373 __func__);
374 goto out;
375 }
376
377 /*
378 * Map the keyword string from userspace into a valid debug mask.
379 * The mapping process involves mapping the human-inputted string
380 * into a valid mask, and then rebuilding the string from the
381 * verified valid mask.
382 *
383 * A service operation is required to set a new client-side
384 * debug mask.
385 */
386 if (!strcmp(file->f_path.dentry->d_name.name,
387 ORANGEFS_KMOD_DEBUG_FILE)) {
388 debug_string_to_mask(buf, &gossip_debug_mask, 0);
389 debug_mask_to_string(&gossip_debug_mask, 0);
390 debug_string = kernel_debug_string;
391 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
392 "New kernel debug string is %s\n",
393 kernel_debug_string);
394 } else {
395 /* Can't reset client debug mask if client is not running. */
396 if (is_daemon_in_service()) {
397 pr_info("%s: Client not running :%d:\n",
398 __func__,
399 is_daemon_in_service());
400 goto out;
401 }
402
403 debug_string_to_mask(buf, &c_mask, 1);
404 debug_mask_to_string(&c_mask, 1);
405 debug_string = client_debug_string;
406
407 new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
408 if (!new_op) {
409 pr_info("%s: op_alloc failed!\n", __func__);
410 goto out;
411 }
412
413 new_op->upcall.req.param.op =
414 ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES;
415 new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
416 memset(new_op->upcall.req.param.s_value,
417 0,
418 ORANGEFS_MAX_DEBUG_STRING_LEN);
419 sprintf(new_op->upcall.req.param.s_value,
420 "%llx %llx\n",
421 c_mask.mask1,
422 c_mask.mask2);
423
424 /* service_operation returns 0 on success... */
425 rc = service_operation(new_op,
426 "orangefs_param",
427 ORANGEFS_OP_INTERRUPTIBLE);
428
429 if (rc)
430 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
431 "%s: service_operation failed! rc:%d:\n",
432 __func__,
433 rc);
434
435 op_release(new_op);
436 }
437
438 mutex_lock(&orangefs_debug_lock);
439 memset(file->f_inode->i_private, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
440 sprintf((char *)file->f_inode->i_private, "%s\n", debug_string);
441 mutex_unlock(&orangefs_debug_lock);
442
443 *ppos += count;
444 if (silly)
445 rc = silly;
446 else
447 rc = count;
448
449out:
450 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
451 "orangefs_debug_write: rc: %d\n",
452 rc);
453 kfree(buf);
454 return rc;
455}
diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h
new file mode 100644
index 000000000000..e4828c0e3ef9
--- /dev/null
+++ b/fs/orangefs/orangefs-debugfs.h
@@ -0,0 +1,3 @@
1int orangefs_debugfs_init(void);
2int orangefs_kernel_debug_init(void);
3void orangefs_debugfs_cleanup(void);
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
new file mode 100644
index 000000000000..9eac9d9a3f3a
--- /dev/null
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -0,0 +1,62 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#ifndef _ORANGEFS_DEV_PROTO_H
8#define _ORANGEFS_DEV_PROTO_H
9
10/*
11 * types and constants shared between user space and kernel space for
12 * device interaction using a common protocol
13 */
14
15/*
16 * valid orangefs kernel operation types
17 */
18#define ORANGEFS_VFS_OP_INVALID 0xFF000000
19#define ORANGEFS_VFS_OP_FILE_IO 0xFF000001
20#define ORANGEFS_VFS_OP_LOOKUP 0xFF000002
21#define ORANGEFS_VFS_OP_CREATE 0xFF000003
22#define ORANGEFS_VFS_OP_GETATTR 0xFF000004
23#define ORANGEFS_VFS_OP_REMOVE 0xFF000005
24#define ORANGEFS_VFS_OP_MKDIR 0xFF000006
25#define ORANGEFS_VFS_OP_READDIR 0xFF000007
26#define ORANGEFS_VFS_OP_SETATTR 0xFF000008
27#define ORANGEFS_VFS_OP_SYMLINK 0xFF000009
28#define ORANGEFS_VFS_OP_RENAME 0xFF00000A
29#define ORANGEFS_VFS_OP_STATFS 0xFF00000B
30#define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C
31#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH 0xFF00000D
32#define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E
33#define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F
34#define ORANGEFS_VFS_OP_GETXATTR 0xFF000010
35#define ORANGEFS_VFS_OP_SETXATTR 0xFF000011
36#define ORANGEFS_VFS_OP_LISTXATTR 0xFF000012
37#define ORANGEFS_VFS_OP_REMOVEXATTR 0xFF000013
38#define ORANGEFS_VFS_OP_PARAM 0xFF000014
39#define ORANGEFS_VFS_OP_PERF_COUNT 0xFF000015
40#define ORANGEFS_VFS_OP_CANCEL 0xFF00EE00
41#define ORANGEFS_VFS_OP_FSYNC 0xFF00EE01
42#define ORANGEFS_VFS_OP_FSKEY 0xFF00EE02
43#define ORANGEFS_VFS_OP_READDIRPLUS 0xFF00EE03
44
45/*
46 * Misc constants. Please retain them as multiples of 8!
47 * Otherwise 32-64 bit interactions will be messed up :)
48 */
49#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000400
50#define ORANGEFS_MAX_DEBUG_ARRAY_LEN 0x00000800
51
52/*
53 * The maximum number of directory entries in a single request is 96.
54 * XXX: Why can this not be higher? The client-side code can handle up to 512.
55 * XXX: What happens if we expect more than the client can return?
56 */
57#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96
58
59#include "upcall.h"
60#include "downcall.h"
61
62#endif
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
new file mode 100644
index 000000000000..a9925e296ceb
--- /dev/null
+++ b/fs/orangefs/orangefs-kernel.h
@@ -0,0 +1,623 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * The ORANGEFS Linux kernel support allows ORANGEFS volumes to be mounted and
9 * accessed through the Linux VFS (i.e. using standard I/O system calls).
10 * This support is only needed on clients that wish to mount the file system.
11 *
12 */
13
14/*
15 * Declarations and macros for the ORANGEFS Linux kernel support.
16 */
17
18#ifndef __ORANGEFSKERNEL_H
19#define __ORANGEFSKERNEL_H
20
21#include <linux/kernel.h>
22#include <linux/moduleparam.h>
23#include <linux/statfs.h>
24#include <linux/backing-dev.h>
25#include <linux/device.h>
26#include <linux/mpage.h>
27#include <linux/namei.h>
28#include <linux/errno.h>
29#include <linux/init.h>
30#include <linux/module.h>
31#include <linux/slab.h>
32#include <linux/types.h>
33#include <linux/fs.h>
34#include <linux/vmalloc.h>
35
36#include <linux/aio.h>
37#include <linux/posix_acl.h>
38#include <linux/posix_acl_xattr.h>
39#include <linux/compat.h>
40#include <linux/mount.h>
41#include <linux/uaccess.h>
42#include <linux/atomic.h>
43#include <linux/uio.h>
44#include <linux/sched.h>
45#include <linux/mm.h>
46#include <linux/wait.h>
47#include <linux/dcache.h>
48#include <linux/pagemap.h>
49#include <linux/poll.h>
50#include <linux/rwsem.h>
51#include <linux/xattr.h>
52#include <linux/exportfs.h>
53
54#include <asm/unaligned.h>
55
56#include "orangefs-dev-proto.h"
57
58#ifdef ORANGEFS_KERNEL_DEBUG
59#define ORANGEFS_DEFAULT_OP_TIMEOUT_SECS 10
60#else
61#define ORANGEFS_DEFAULT_OP_TIMEOUT_SECS 20
62#endif
63
64#define ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS 30
65
66#define ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS 900 /* 15 minutes */
67
68#define ORANGEFS_REQDEVICE_NAME "pvfs2-req"
69
70#define ORANGEFS_DEVREQ_MAGIC 0x20030529
71#define ORANGEFS_LINK_MAX 0x000000FF
72#define ORANGEFS_PURGE_RETRY_COUNT 0x00000005
73#define ORANGEFS_MAX_NUM_OPTIONS 0x00000004
74#define ORANGEFS_MAX_MOUNT_OPT_LEN 0x00000080
75#define ORANGEFS_MAX_FSKEY_LEN 64
76
77#define MAX_DEV_REQ_UPSIZE (2 * sizeof(__s32) + \
78sizeof(__u64) + sizeof(struct orangefs_upcall_s))
79#define MAX_DEV_REQ_DOWNSIZE (2 * sizeof(__s32) + \
80sizeof(__u64) + sizeof(struct orangefs_downcall_s))
81
82/*
83 * valid orangefs kernel operation states
84 *
85 * unknown - op was just initialized
86 * waiting - op is on request_list (upward bound)
87 * inprogr - op is in progress (waiting for downcall)
88 * serviced - op has matching downcall; ok
89 * purged - op has to start a timer since client-core
90 * exited uncleanly before servicing op
91 * given up - submitter has given up waiting for it
92 */
93enum orangefs_vfs_op_states {
94 OP_VFS_STATE_UNKNOWN = 0,
95 OP_VFS_STATE_WAITING = 1,
96 OP_VFS_STATE_INPROGR = 2,
97 OP_VFS_STATE_SERVICED = 4,
98 OP_VFS_STATE_PURGED = 8,
99 OP_VFS_STATE_GIVEN_UP = 16,
100};
101
102/*
103 * An array of client_debug_mask will be built to hold debug keyword/mask
104 * values fetched from userspace.
105 */
106struct client_debug_mask {
107 char *keyword;
108 __u64 mask1;
109 __u64 mask2;
110};
111
112/*
113 * orangefs kernel memory related flags
114 */
115
116#if ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB))
117#define ORANGEFS_CACHE_CREATE_FLAGS SLAB_RED_ZONE
118#else
119#define ORANGEFS_CACHE_CREATE_FLAGS 0
120#endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */
121
122/* orangefs xattr and acl related defines */
123#define ORANGEFS_XATTR_INDEX_POSIX_ACL_ACCESS 1
124#define ORANGEFS_XATTR_INDEX_POSIX_ACL_DEFAULT 2
125#define ORANGEFS_XATTR_INDEX_TRUSTED 3
126#define ORANGEFS_XATTR_INDEX_DEFAULT 4
127
128#define ORANGEFS_XATTR_NAME_ACL_ACCESS XATTR_NAME_POSIX_ACL_ACCESS
129#define ORANGEFS_XATTR_NAME_ACL_DEFAULT XATTR_NAME_POSIX_ACL_DEFAULT
130#define ORANGEFS_XATTR_NAME_TRUSTED_PREFIX "trusted."
131#define ORANGEFS_XATTR_NAME_DEFAULT_PREFIX ""
132
133/* these functions are defined in orangefs-utils.c */
134int orangefs_prepare_cdm_array(char *debug_array_string);
135int orangefs_prepare_debugfs_help_string(int);
136
137/* defined in orangefs-debugfs.c */
138int orangefs_client_debug_init(void);
139
140void debug_string_to_mask(char *, void *, int);
141void do_c_mask(int, char *, struct client_debug_mask **);
142void do_k_mask(int, char *, __u64 **);
143
144void debug_mask_to_string(void *, int);
145void do_k_string(void *, int);
146void do_c_string(void *, int);
147int check_amalgam_keyword(void *, int);
148int keyword_is_amalgam(char *);
149
150/* these variables are defined in orangefs-mod.c */
151extern char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
152extern char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
153extern char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
154extern unsigned int kernel_mask_set_mod_init;
155
156extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
157extern const struct xattr_handler *orangefs_xattr_handlers[];
158
159extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type);
160extern int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
161
162/*
163 * Redefine xtvec structure so that we could move helper functions out of
164 * the define
165 */
166struct xtvec {
167 __kernel_off_t xtv_off; /* must be off_t */
168 __kernel_size_t xtv_len; /* must be size_t */
169};
170
171/*
172 * orangefs data structures
173 */
174struct orangefs_kernel_op_s {
175 enum orangefs_vfs_op_states op_state;
176 __u64 tag;
177
178 /*
179 * Set uses_shared_memory to non zero if this operation uses
180 * shared memory. If true, then a retry on the op must also
181 * get a new shared memory buffer and re-populate it.
182 * Cancels don't care - it only matters for service_operation()
184 * retry logic, and cancels don't go through it anymore. It
184 * safely stays non-zero when we use it as slot_to_free.
185 */
186 union {
187 int uses_shared_memory;
188 int slot_to_free;
189 };
190
191 struct orangefs_upcall_s upcall;
192 struct orangefs_downcall_s downcall;
193
194 struct completion waitq;
195 spinlock_t lock;
196
197 int attempts;
198
199 struct list_head list;
200};
201
202#define set_op_state_waiting(op) ((op)->op_state = OP_VFS_STATE_WAITING)
203#define set_op_state_inprogress(op) ((op)->op_state = OP_VFS_STATE_INPROGR)
204#define set_op_state_given_up(op) ((op)->op_state = OP_VFS_STATE_GIVEN_UP)
205static inline void set_op_state_serviced(struct orangefs_kernel_op_s *op)
206{
207 op->op_state = OP_VFS_STATE_SERVICED;
208 complete(&op->waitq);
209}
210
211#define op_state_waiting(op) ((op)->op_state & OP_VFS_STATE_WAITING)
212#define op_state_in_progress(op) ((op)->op_state & OP_VFS_STATE_INPROGR)
213#define op_state_serviced(op) ((op)->op_state & OP_VFS_STATE_SERVICED)
214#define op_state_purged(op) ((op)->op_state & OP_VFS_STATE_PURGED)
215#define op_state_given_up(op) ((op)->op_state & OP_VFS_STATE_GIVEN_UP)
216#define op_is_cancel(op) ((op)->upcall.type == ORANGEFS_VFS_OP_CANCEL)
217
218void op_release(struct orangefs_kernel_op_s *op);
219
220extern void orangefs_bufmap_put(int);
221static inline void put_cancel(struct orangefs_kernel_op_s *op)
222{
223 orangefs_bufmap_put(op->slot_to_free);
224 op_release(op);
225}
226
227static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
228{
229 spin_lock(&op->lock);
230 if (unlikely(op_is_cancel(op))) {
231 list_del_init(&op->list);
232 spin_unlock(&op->lock);
233 put_cancel(op);
234 } else {
235 op->op_state |= OP_VFS_STATE_PURGED;
236 complete(&op->waitq);
237 spin_unlock(&op->lock);
238 }
239}
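/*
 * Note: the op_state_*() macros test with "&" rather than "=="
 * because OP_VFS_STATE_PURGED is OR-ed into whatever state the op
 * already had, so e.g. a waiting op that gets purged still satisfies
 * op_state_waiting() as well as op_state_purged().
 */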
240
241/* per inode private orangefs info */
242struct orangefs_inode_s {
243 struct orangefs_object_kref refn;
244 char link_target[ORANGEFS_NAME_MAX];
245 __s64 blksize;
246 /*
247 * Reading/Writing Extended attributes need to acquire the appropriate
248 * reader/writer semaphore on the orangefs_inode_s structure.
249 */
250 struct rw_semaphore xattr_sem;
251
252 struct inode vfs_inode;
253 sector_t last_failed_block_index_read;
254
255 /*
256 * State of in-memory attributes not yet flushed to disk associated
257 * with this object
258 */
259 unsigned long pinode_flags;
260};
261
262#define P_ATIME_FLAG 0
263#define P_MTIME_FLAG 1
264#define P_CTIME_FLAG 2
265#define P_MODE_FLAG 3
266
267#define ClearAtimeFlag(pinode) clear_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
268#define SetAtimeFlag(pinode) set_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
269#define AtimeFlag(pinode) test_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
270
271#define ClearMtimeFlag(pinode) clear_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
272#define SetMtimeFlag(pinode) set_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
273#define MtimeFlag(pinode) test_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
274
275#define ClearCtimeFlag(pinode) clear_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
276#define SetCtimeFlag(pinode) set_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
277#define CtimeFlag(pinode) test_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
278
279#define ClearModeFlag(pinode) clear_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
280#define SetModeFlag(pinode) set_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
281#define ModeFlag(pinode) test_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
282
283/* per superblock private orangefs info */
284struct orangefs_sb_info_s {
285 struct orangefs_khandle root_khandle;
286 __s32 fs_id;
287 int id;
288 int flags;
289#define ORANGEFS_OPT_INTR 0x01
290#define ORANGEFS_OPT_LOCAL_LOCK 0x02
291 char devname[ORANGEFS_MAX_SERVER_ADDR_LEN];
292 struct super_block *sb;
293 int mount_pending;
294 struct list_head list;
295};
296
297/*
298 * structure that holds the state of any async I/O operation issued
299 * through the VFS. Needed especially to handle cancellation requests
300 * or even completion notification so that the VFS client-side daemon
301 * can free up its vfs_request slots.
302 */
303struct orangefs_kiocb_s {
304 /* the pointer to the task that initiated the AIO */
305 struct task_struct *tsk;
306
307 /* pointer to the kiocb that kicked this operation */
308 struct kiocb *kiocb;
309
310 /* buffer index that was used for the I/O */
311 struct orangefs_bufmap *bufmap;
312 int buffer_index;
313
314 /* orangefs kernel operation type */
315 struct orangefs_kernel_op_s *op;
316
317 /* The user space buffers from/to which I/O is being staged */
318 struct iovec *iov;
319
320 /* number of elements in the iovector */
321 unsigned long nr_segs;
322
323 /* set to indicate the type of the operation */
324 int rw;
325
326 /* file offset */
327 loff_t offset;
328
329 /* and the count in bytes */
330 size_t bytes_to_be_copied;
331
332 ssize_t bytes_copied;
333 int needs_cleanup;
334};
335
336struct orangefs_stats {
337 unsigned long cache_hits;
338 unsigned long cache_misses;
339 unsigned long reads;
340 unsigned long writes;
341};
342
343extern struct orangefs_stats g_orangefs_stats;
344
345/*
346 * NOTE: See Documentation/filesystems/porting for information
347 * on implementing FOO_I and properly accessing fs private data
348 */
349static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode)
350{
351 return container_of(inode, struct orangefs_inode_s, vfs_inode);
352}
353
354static inline struct orangefs_sb_info_s *ORANGEFS_SB(struct super_block *sb)
355{
356 return (struct orangefs_sb_info_s *) sb->s_fs_info;
357}
358
359/* ino_t is "unsigned long": 8 bytes / 64 bits on 64-bit architectures. */
360static inline ino_t orangefs_khandle_to_ino(struct orangefs_khandle *khandle)
361{
362 union {
363 unsigned char u[8];
364 __u64 ino;
365 } ihandle;
366
367 ihandle.u[0] = khandle->u[0] ^ khandle->u[4];
368 ihandle.u[1] = khandle->u[1] ^ khandle->u[5];
369 ihandle.u[2] = khandle->u[2] ^ khandle->u[6];
370 ihandle.u[3] = khandle->u[3] ^ khandle->u[7];
371 ihandle.u[4] = khandle->u[12] ^ khandle->u[8];
372 ihandle.u[5] = khandle->u[13] ^ khandle->u[9];
373 ihandle.u[6] = khandle->u[14] ^ khandle->u[10];
374 ihandle.u[7] = khandle->u[15] ^ khandle->u[11];
375
376 return ihandle.ino;
377}
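/*
 * I.e. the 16-byte khandle is folded to 8 bytes by XOR: bytes 0..3
 * with 4..7 and bytes 12..15 with 8..11. For example, a khandle of
 * all zeros maps to inode number 0, and flipping any single khandle
 * byte flips the corresponding ino byte.
 */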
378
379static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode)
380{
381 return &(ORANGEFS_I(inode)->refn.khandle);
382}
383
384static inline __s32 get_fsid_from_ino(struct inode *inode)
385{
386 return ORANGEFS_I(inode)->refn.fs_id;
387}
388
389static inline ino_t get_ino_from_khandle(struct inode *inode)
390{
391 struct orangefs_khandle *khandle;
392 ino_t ino;
393
394 khandle = get_khandle_from_ino(inode);
395 ino = orangefs_khandle_to_ino(khandle);
396 return ino;
397}
398
399static inline ino_t get_parent_ino_from_dentry(struct dentry *dentry)
400{
401 return get_ino_from_khandle(dentry->d_parent->d_inode);
402}
403
404static inline int is_root_handle(struct inode *inode)
405{
406 gossip_debug(GOSSIP_DCACHE_DEBUG,
407 "%s: root handle: %pU, this handle: %pU:\n",
408 __func__,
409 &ORANGEFS_SB(inode->i_sb)->root_khandle,
410 get_khandle_from_ino(inode));
411
412 if (ORANGEFS_khandle_cmp(&(ORANGEFS_SB(inode->i_sb)->root_khandle),
413 get_khandle_from_ino(inode)))
414 return 0;
415 else
416 return 1;
417}
418
419static inline int match_handle(struct orangefs_khandle resp_handle,
420 struct inode *inode)
421{
422 gossip_debug(GOSSIP_DCACHE_DEBUG,
423 "%s: one handle: %pU, another handle:%pU:\n",
424 __func__,
425 &resp_handle,
426 get_khandle_from_ino(inode));
427
428 if (ORANGEFS_khandle_cmp(&resp_handle, get_khandle_from_ino(inode)))
429 return 0;
430 else
431 return 1;
432}
433
434/*
435 * defined in orangefs-cache.c
436 */
437int op_cache_initialize(void);
438int op_cache_finalize(void);
439struct orangefs_kernel_op_s *op_alloc(__s32 type);
440void orangefs_new_tag(struct orangefs_kernel_op_s *op);
441char *get_opname_string(struct orangefs_kernel_op_s *new_op);
442
443int orangefs_inode_cache_initialize(void);
444int orangefs_inode_cache_finalize(void);
445
446/*
447 * defined in orangefs-mod.c
448 */
449void purge_inprogress_ops(void);
450
451/*
452 * defined in waitqueue.c
453 */
454void purge_waiting_ops(void);
455
456/*
457 * defined in super.c
458 */
459struct dentry *orangefs_mount(struct file_system_type *fst,
460 int flags,
461 const char *devname,
462 void *data);
463
464void orangefs_kill_sb(struct super_block *sb);
465int orangefs_remount(struct orangefs_sb_info_s *);
466
467int fsid_key_table_initialize(void);
468void fsid_key_table_finalize(void);
469
470/*
471 * defined in inode.c
472 */
473__u32 convert_to_orangefs_mask(unsigned long lite_mask);
474struct inode *orangefs_new_inode(struct super_block *sb,
475 struct inode *dir,
476 int mode,
477 dev_t dev,
478 struct orangefs_object_kref *ref);
479
480int orangefs_setattr(struct dentry *dentry, struct iattr *iattr);
481
482int orangefs_getattr(struct vfsmount *mnt,
483 struct dentry *dentry,
484 struct kstat *kstat);
485
486int orangefs_permission(struct inode *inode, int mask);
487
488/*
489 * defined in xattr.c
490 */
491int orangefs_setxattr(struct dentry *dentry,
492 const char *name,
493 const void *value,
494 size_t size,
495 int flags);
496
497ssize_t orangefs_getxattr(struct dentry *dentry,
498 const char *name,
499 void *buffer,
500 size_t size);
501
502ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size);
503
504/*
505 * defined in namei.c
506 */
507struct inode *orangefs_iget(struct super_block *sb,
508 struct orangefs_object_kref *ref);
509
510ssize_t orangefs_inode_read(struct inode *inode,
511 struct iov_iter *iter,
512 loff_t *offset,
513 loff_t readahead_size);
514
515/*
516 * defined in devorangefs-req.c
517 */
518int orangefs_dev_init(void);
519void orangefs_dev_cleanup(void);
520int is_daemon_in_service(void);
521bool __is_daemon_in_service(void);
522
523/*
524 * defined in orangefs-utils.c
525 */
526__s32 fsid_of_op(struct orangefs_kernel_op_s *op);
527
528int orangefs_flush_inode(struct inode *inode);
529
530ssize_t orangefs_inode_getxattr(struct inode *inode,
531 const char *prefix,
532 const char *name,
533 void *buffer,
534 size_t size);
535
536int orangefs_inode_setxattr(struct inode *inode,
537 const char *prefix,
538 const char *name,
539 const void *value,
540 size_t size,
541 int flags);
542
543int orangefs_inode_getattr(struct inode *inode, int new, int size);
544
545int orangefs_inode_check_changed(struct inode *inode);
546
547int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr);
548
549void orangefs_make_bad_inode(struct inode *inode);
550
551int orangefs_unmount_sb(struct super_block *sb);
552
553bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);
554
555int orangefs_normalize_to_errno(__s32 error_code);
556
557extern struct mutex devreq_mutex;
558extern struct mutex request_mutex;
559extern int debug;
560extern int op_timeout_secs;
561extern int slot_timeout_secs;
562extern struct list_head orangefs_superblocks;
563extern spinlock_t orangefs_superblocks_lock;
564extern struct list_head orangefs_request_list;
565extern spinlock_t orangefs_request_list_lock;
566extern wait_queue_head_t orangefs_request_list_waitq;
567extern struct list_head *htable_ops_in_progress;
568extern spinlock_t htable_ops_in_progress_lock;
569extern int hash_table_size;
570
571extern const struct address_space_operations orangefs_address_operations;
572extern struct backing_dev_info orangefs_backing_dev_info;
573extern struct inode_operations orangefs_file_inode_operations;
574extern const struct file_operations orangefs_file_operations;
575extern struct inode_operations orangefs_symlink_inode_operations;
576extern struct inode_operations orangefs_dir_inode_operations;
577extern const struct file_operations orangefs_dir_operations;
578extern const struct dentry_operations orangefs_dentry_operations;
579extern const struct file_operations orangefs_devreq_file_operations;
580
581extern wait_queue_head_t orangefs_bufmap_init_waitq;
582
583/*
584 * misc convenience macros
585 */
586
587#define ORANGEFS_OP_INTERRUPTIBLE 1 /* service_operation() is interruptible */
588#define ORANGEFS_OP_PRIORITY 2 /* service_operation() is high priority */
589#define ORANGEFS_OP_CANCELLATION 4 /* this is a cancellation */
590#define ORANGEFS_OP_NO_MUTEX 8 /* don't acquire request_mutex */
591#define ORANGEFS_OP_ASYNC 16 /* Queue it, but don't wait */
592
593int service_operation(struct orangefs_kernel_op_s *op,
594 const char *op_name,
595 int flags);
596
597#define get_interruptible_flag(inode) \
598 ((ORANGEFS_SB(inode->i_sb)->flags & ORANGEFS_OPT_INTR) ? \
599 ORANGEFS_OP_INTERRUPTIBLE : 0)
600
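/*
 * Usage sketch for the macro above (hedged; this mirrors how callers
 * in this patch set appear to combine it with service_operation(),
 * but the actual call sites are not part of this hunk):
 *
 *	rc = service_operation(new_op, __func__,
 *			       get_interruptible_flag(inode));
 */
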
601#define fill_default_sys_attrs(sys_attr, type, mode) \
602do { \
603 sys_attr.owner = from_kuid(current_user_ns(), current_fsuid()); \
604 sys_attr.group = from_kgid(current_user_ns(), current_fsgid()); \
605 sys_attr.perms = ORANGEFS_util_translate_mode(mode); \
606 sys_attr.mtime = 0; \
607 sys_attr.atime = 0; \
608 sys_attr.ctime = 0; \
609 sys_attr.mask = ORANGEFS_ATTR_SYS_ALL_SETABLE; \
610} while (0)
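
/*
 * A minimal usage sketch for fill_default_sys_attrs(). The field and
 * constant names (upcall.req.create.attributes, ORANGEFS_TYPE_METAFILE)
 * are assumptions about call sites that are not shown in this hunk:
 *
 *	struct orangefs_kernel_op_s *new_op = op_alloc(ORANGEFS_VFS_OP_CREATE);
 *
 *	if (new_op)
 *		fill_default_sys_attrs(new_op->upcall.req.create.attributes,
 *				       ORANGEFS_TYPE_METAFILE, mode);
 */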
611
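/*
 * On 32-bit SMP kernels a 64-bit i_size cannot be updated atomically,
 * so callers of i_size_write() must serialize against each other;
 * i_mutex provides that serialization here. On 64-bit (or non-SMP)
 * builds the plain store is sufficient.
 */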
612static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size)
613{
614#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
615 mutex_lock(&inode->i_mutex);
616#endif
617 i_size_write(inode, i_size);
618#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
619 mutex_unlock(&inode->i_mutex);
620#endif
621}
622
623#endif /* __ORANGEFSKERNEL_H */
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
new file mode 100644
index 000000000000..6f072a8c0de1
--- /dev/null
+++ b/fs/orangefs/orangefs-mod.c
@@ -0,0 +1,293 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * Changes by Acxiom Corporation to add proc file handler for pvfs2 client
5 * parameters, Copyright Acxiom Corporation, 2005.
6 *
7 * See COPYING in top-level directory.
8 */
9
10#include "protocol.h"
11#include "orangefs-kernel.h"
12#include "orangefs-debugfs.h"
13#include "orangefs-sysfs.h"
14
15/* ORANGEFS_VERSION is a ./configure define */
16#ifndef ORANGEFS_VERSION
17#define ORANGEFS_VERSION "upstream"
18#endif
19
20/*
21 * global variables declared here
22 */
23
24/* array of client debug keyword/mask values */
25struct client_debug_mask *cdm_array;
26int cdm_element_count;
27
28char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none";
29char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
30char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
31
32char *debug_help_string;
33int help_string_initialized;
34struct dentry *help_file_dentry;
35struct dentry *client_debug_dentry;
36struct dentry *debug_dir;
37int client_verbose_index;
38int client_all_index;
39struct orangefs_stats g_orangefs_stats;
40
41/* the size of the hash table for ops in progress */
42int hash_table_size = 509;
43
44static ulong module_parm_debug_mask;
45__u64 gossip_debug_mask;
46struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
47unsigned int kernel_mask_set_mod_init; /* implicitly false */
48int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
49int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
50
51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("ORANGEFS Development Team");
53MODULE_DESCRIPTION("The Linux Kernel VFS interface to ORANGEFS");
54MODULE_PARM_DESC(module_parm_debug_mask, "debugging level (see orangefs-debug.h for values)");
55MODULE_PARM_DESC(op_timeout_secs, "Operation timeout in seconds");
56MODULE_PARM_DESC(slot_timeout_secs, "Slot timeout in seconds");
57MODULE_PARM_DESC(hash_table_size,
58 "size of hash table for operations in progress");
59
60static struct file_system_type orangefs_fs_type = {
61 .name = "pvfs2",
62 .mount = orangefs_mount,
63 .kill_sb = orangefs_kill_sb,
64 .owner = THIS_MODULE,
65};
66
67module_param(hash_table_size, int, 0);
68module_param(module_parm_debug_mask, ulong, 0644);
69module_param(op_timeout_secs, int, 0);
70module_param(slot_timeout_secs, int, 0);
71
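/*
 * Example module load (a sketch; the parameter names come from the
 * module_param() calls above, the values are hypothetical):
 *
 *	# modprobe orangefs module_parm_debug_mask=0x1 op_timeout_secs=30
 */
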
72/* synchronizes the request device file */
73DEFINE_MUTEX(devreq_mutex);
74
75/*
76 * Blocks non-priority requests from being queued for servicing. This
77 * could be used for protecting the request list data structure, but
 78 * for now it is only used to stall the addition of ops to the
 79 * request list.
80 */
81DEFINE_MUTEX(request_mutex);
82
83/* hash table for storing operations waiting for matching downcall */
84struct list_head *htable_ops_in_progress;
85DEFINE_SPINLOCK(htable_ops_in_progress_lock);
86
87/* list for queueing upcall operations */
88LIST_HEAD(orangefs_request_list);
89
90/* used to protect the above orangefs_request_list */
91DEFINE_SPINLOCK(orangefs_request_list_lock);
92
93/* used for incoming request notification */
94DECLARE_WAIT_QUEUE_HEAD(orangefs_request_list_waitq);
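
/*
 * A sketch of the producer side that ties the list, lock, and wait
 * queue above together (the real enqueue path lives elsewhere in this
 * patch set and is not shown in this hunk):
 *
 *	spin_lock(&orangefs_request_list_lock);
 *	list_add_tail(&op->list, &orangefs_request_list);
 *	spin_unlock(&orangefs_request_list_lock);
 *	wake_up_interruptible(&orangefs_request_list_waitq);
 */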
95
96static int __init orangefs_init(void)
97{
98 int ret = -1;
99 __u32 i = 0;
100
101 /* convert input debug mask to a 64-bit unsigned integer */
102 gossip_debug_mask = (unsigned long long) module_parm_debug_mask;
103
104 /*
105 * set the kernel's gossip debug string; invalid mask values will
106 * be ignored.
107 */
108 debug_mask_to_string(&gossip_debug_mask, 0);
109
110 /* remove any invalid values from the mask */
111 debug_string_to_mask(kernel_debug_string, &gossip_debug_mask, 0);
112
113 /*
114 * if the mask has a non-zero value, then indicate that the mask
115 * was set when the kernel module was loaded. The orangefs dev ioctl
116 * command will look at this boolean to determine if the kernel's
117 * debug mask should be overwritten when the client-core is started.
118 */
119 if (gossip_debug_mask != 0)
120 kernel_mask_set_mod_init = true;
121
122 pr_info("%s: called with debug mask: :%s: :%llx:\n",
123 __func__,
124 kernel_debug_string,
125 (unsigned long long)gossip_debug_mask);
126
127 ret = bdi_init(&orangefs_backing_dev_info);
128
129 if (ret)
130 return ret;
131
132 if (op_timeout_secs < 0)
133 op_timeout_secs = 0;
134
135 if (slot_timeout_secs < 0)
136 slot_timeout_secs = 0;
137
 138 /* initialize global bookkeeping data structures */
139 ret = op_cache_initialize();
140 if (ret < 0)
141 goto err;
142
143 ret = orangefs_inode_cache_initialize();
144 if (ret < 0)
145 goto cleanup_op;
146
147 htable_ops_in_progress =
148 kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL);
149 if (!htable_ops_in_progress) {
 150 gossip_err("Failed to initialize op hashtable\n");
151 ret = -ENOMEM;
152 goto cleanup_inode;
153 }
154
 155 /* initialize a doubly linked list at each hash table index */
156 for (i = 0; i < hash_table_size; i++)
157 INIT_LIST_HEAD(&htable_ops_in_progress[i]);
158
159 ret = fsid_key_table_initialize();
160 if (ret < 0)
161 goto cleanup_progress_table;
162
163 /*
164 * Build the contents of /sys/kernel/debug/orangefs/debug-help
165 * from the keywords in the kernel keyword/mask array.
166 *
167 * The keywords in the client keyword/mask array are
168 * unknown at boot time.
169 *
170 * orangefs_prepare_debugfs_help_string will be used again
171 * later to rebuild the debug-help file after the client starts
 172 * and passes along the needed info. The argument indicates
 173 * whether orangefs_prepare_debugfs_help_string is being called
 174 * for the first time or for a later rebuild.
175 */
176 ret = orangefs_prepare_debugfs_help_string(1);
177 if (ret)
178 goto cleanup_key_table;
179
180 ret = orangefs_debugfs_init();
181 if (ret)
182 goto debugfs_init_failed;
183
184 ret = orangefs_kernel_debug_init();
185 if (ret)
186 goto kernel_debug_init_failed;
187
188 ret = orangefs_sysfs_init();
189 if (ret)
190 goto sysfs_init_failed;
191
 192 /* Initialize the orangefs device subsystem. */
193 ret = orangefs_dev_init();
194 if (ret < 0) {
195 gossip_err("%s: could not initialize device subsystem %d!\n",
196 __func__,
197 ret);
198 goto cleanup_device;
199 }
200
201 ret = register_filesystem(&orangefs_fs_type);
202 if (ret == 0) {
203 pr_info("orangefs: module version %s loaded\n", ORANGEFS_VERSION);
204 ret = 0;
205 goto out;
206 }
207
208 orangefs_sysfs_exit();
209
210cleanup_device:
211 orangefs_dev_cleanup();
212
213sysfs_init_failed:
214
215kernel_debug_init_failed:
216
217debugfs_init_failed:
218 orangefs_debugfs_cleanup();
219
220cleanup_key_table:
221 fsid_key_table_finalize();
222
223cleanup_progress_table:
224 kfree(htable_ops_in_progress);
225
226cleanup_inode:
227 orangefs_inode_cache_finalize();
228
229cleanup_op:
230 op_cache_finalize();
231
232err:
233 bdi_destroy(&orangefs_backing_dev_info);
234
235out:
236 return ret;
237}
238
239static void __exit orangefs_exit(void)
240{
241 int i = 0;
242 gossip_debug(GOSSIP_INIT_DEBUG, "orangefs: orangefs_exit called\n");
243
244 unregister_filesystem(&orangefs_fs_type);
245 orangefs_debugfs_cleanup();
246 orangefs_sysfs_exit();
247 fsid_key_table_finalize();
248 orangefs_dev_cleanup();
249 BUG_ON(!list_empty(&orangefs_request_list));
250 for (i = 0; i < hash_table_size; i++)
251 BUG_ON(!list_empty(&htable_ops_in_progress[i]));
252
253 orangefs_inode_cache_finalize();
254 op_cache_finalize();
255
256 kfree(htable_ops_in_progress);
257
258 bdi_destroy(&orangefs_backing_dev_info);
259
260 pr_info("orangefs: module version %s unloaded\n", ORANGEFS_VERSION);
261}
262
263/*
264 * Walk the hash table of in-progress operations and mark each
265 * operation found there as purged.
266 */
267void purge_inprogress_ops(void)
268{
269 int i;
270
271 for (i = 0; i < hash_table_size; i++) {
272 struct orangefs_kernel_op_s *op;
273 struct orangefs_kernel_op_s *next;
274
275 spin_lock(&htable_ops_in_progress_lock);
276 list_for_each_entry_safe(op,
277 next,
278 &htable_ops_in_progress[i],
279 list) {
280 set_op_state_purged(op);
281 gossip_debug(GOSSIP_DEV_DEBUG,
282 "%s: op:%s: op_state:%d: process:%s:\n",
283 __func__,
284 get_opname_string(op),
285 op->op_state,
286 current->comm);
287 }
288 spin_unlock(&htable_ops_in_progress_lock);
289 }
290}
291
292module_init(orangefs_init);
293module_exit(orangefs_exit);
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
new file mode 100644
index 000000000000..5c03113e3ad2
--- /dev/null
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -0,0 +1,1772 @@
1/*
2 * Documentation/ABI/stable/orangefs-sysfs:
3 *
4 * What: /sys/fs/orangefs/perf_counter_reset
 5 * Date: Jun 2015
6 * Contact: Mike Marshall <hubcap@omnibond.com>
7 * Description:
8 * echo a 0 or a 1 into perf_counter_reset to
9 * reset all the counters in
10 * /sys/fs/orangefs/perf_counters
11 * except ones with PINT_PERF_PRESERVE set.
12 *
13 *
14 * What: /sys/fs/orangefs/perf_counters/...
15 * Date: Jun 2015
16 * Contact: Mike Marshall <hubcap@omnibond.com>
17 * Description:
18 * Counters and settings for various caches.
19 * Read only.
20 *
21 *
22 * What: /sys/fs/orangefs/perf_time_interval_secs
23 * Date: Jun 2015
24 * Contact: Mike Marshall <hubcap@omnibond.com>
25 * Description:
26 * Length of perf counter intervals in
27 * seconds.
28 *
29 *
30 * What: /sys/fs/orangefs/perf_history_size
31 * Date: Jun 2015
32 * Contact: Mike Marshall <hubcap@omnibond.com>
33 * Description:
34 * The perf_counters cache statistics have N, or
35 * perf_history_size, samples. The default is
36 * one.
37 *
 38 * Every perf_time_interval_secs, the (first)
39 * samples are reset.
40 *
41 * If N is greater than one, the "current" set
42 * of samples is reset, and the samples from the
43 * other N-1 intervals remain available.
44 *
45 *
46 * What: /sys/fs/orangefs/op_timeout_secs
47 * Date: Jun 2015
48 * Contact: Mike Marshall <hubcap@omnibond.com>
49 * Description:
50 * Service operation timeout in seconds.
51 *
52 *
53 * What: /sys/fs/orangefs/slot_timeout_secs
54 * Date: Jun 2015
55 * Contact: Mike Marshall <hubcap@omnibond.com>
56 * Description:
57 * "Slot" timeout in seconds. A "slot"
58 * is an indexed buffer in the shared
59 * memory segment used for communication
60 * between the kernel module and userspace.
 61 * Slots are requested and waited for;
62 * the wait times out after slot_timeout_secs.
63 *
64 *
65 * What: /sys/fs/orangefs/acache/...
66 * Date: Jun 2015
67 * Contact: Mike Marshall <hubcap@omnibond.com>
68 * Description:
69 * Attribute cache configurable settings.
70 *
71 *
72 * What: /sys/fs/orangefs/ncache/...
73 * Date: Jun 2015
74 * Contact: Mike Marshall <hubcap@omnibond.com>
75 * Description:
76 * Name cache configurable settings.
77 *
78 *
79 * What: /sys/fs/orangefs/capcache/...
80 * Date: Jun 2015
81 * Contact: Mike Marshall <hubcap@omnibond.com>
82 * Description:
83 * Capability cache configurable settings.
84 *
85 *
86 * What: /sys/fs/orangefs/ccache/...
87 * Date: Jun 2015
88 * Contact: Mike Marshall <hubcap@omnibond.com>
89 * Description:
90 * Credential cache configurable settings.
91 *
92 */
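
/*
 * Example interaction with the files documented above (a sketch that
 * assumes the module is loaded and the client-core is running):
 *
 *	# cat /sys/fs/orangefs/op_timeout_secs
 *	# echo 30 > /sys/fs/orangefs/op_timeout_secs
 *	# echo 1 > /sys/fs/orangefs/perf_counter_reset
 */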
93
94#include <linux/fs.h>
95#include <linux/kobject.h>
96#include <linux/string.h>
97#include <linux/sysfs.h>
98#include <linux/module.h>
99#include <linux/init.h>
100
101#include "protocol.h"
102#include "orangefs-kernel.h"
103#include "orangefs-sysfs.h"
104
105#define ORANGEFS_KOBJ_ID "orangefs"
106#define ACACHE_KOBJ_ID "acache"
107#define CAPCACHE_KOBJ_ID "capcache"
108#define CCACHE_KOBJ_ID "ccache"
109#define NCACHE_KOBJ_ID "ncache"
110#define PC_KOBJ_ID "pc"
111#define STATS_KOBJ_ID "stats"
112
113struct orangefs_obj {
114 struct kobject kobj;
115 int op_timeout_secs;
116 int perf_counter_reset;
117 int perf_history_size;
118 int perf_time_interval_secs;
119 int slot_timeout_secs;
120};
121
122struct acache_orangefs_obj {
123 struct kobject kobj;
124 int hard_limit;
125 int reclaim_percentage;
126 int soft_limit;
127 int timeout_msecs;
128};
129
130struct capcache_orangefs_obj {
131 struct kobject kobj;
132 int hard_limit;
133 int reclaim_percentage;
134 int soft_limit;
135 int timeout_secs;
136};
137
138struct ccache_orangefs_obj {
139 struct kobject kobj;
140 int hard_limit;
141 int reclaim_percentage;
142 int soft_limit;
143 int timeout_secs;
144};
145
146struct ncache_orangefs_obj {
147 struct kobject kobj;
148 int hard_limit;
149 int reclaim_percentage;
150 int soft_limit;
151 int timeout_msecs;
152};
153
154struct pc_orangefs_obj {
155 struct kobject kobj;
156 char *acache;
157 char *capcache;
158 char *ncache;
159};
160
161struct stats_orangefs_obj {
162 struct kobject kobj;
163 int reads;
164 int writes;
165};
166
167struct orangefs_attribute {
168 struct attribute attr;
169 ssize_t (*show)(struct orangefs_obj *orangefs_obj,
170 struct orangefs_attribute *attr,
171 char *buf);
172 ssize_t (*store)(struct orangefs_obj *orangefs_obj,
173 struct orangefs_attribute *attr,
174 const char *buf,
175 size_t count);
176};
177
178struct acache_orangefs_attribute {
179 struct attribute attr;
180 ssize_t (*show)(struct acache_orangefs_obj *acache_orangefs_obj,
181 struct acache_orangefs_attribute *attr,
182 char *buf);
183 ssize_t (*store)(struct acache_orangefs_obj *acache_orangefs_obj,
184 struct acache_orangefs_attribute *attr,
185 const char *buf,
186 size_t count);
187};
188
189struct capcache_orangefs_attribute {
190 struct attribute attr;
191 ssize_t (*show)(struct capcache_orangefs_obj *capcache_orangefs_obj,
192 struct capcache_orangefs_attribute *attr,
193 char *buf);
194 ssize_t (*store)(struct capcache_orangefs_obj *capcache_orangefs_obj,
195 struct capcache_orangefs_attribute *attr,
196 const char *buf,
197 size_t count);
198};
199
200struct ccache_orangefs_attribute {
201 struct attribute attr;
202 ssize_t (*show)(struct ccache_orangefs_obj *ccache_orangefs_obj,
203 struct ccache_orangefs_attribute *attr,
204 char *buf);
205 ssize_t (*store)(struct ccache_orangefs_obj *ccache_orangefs_obj,
206 struct ccache_orangefs_attribute *attr,
207 const char *buf,
208 size_t count);
209};
210
211struct ncache_orangefs_attribute {
212 struct attribute attr;
213 ssize_t (*show)(struct ncache_orangefs_obj *ncache_orangefs_obj,
214 struct ncache_orangefs_attribute *attr,
215 char *buf);
216 ssize_t (*store)(struct ncache_orangefs_obj *ncache_orangefs_obj,
217 struct ncache_orangefs_attribute *attr,
218 const char *buf,
219 size_t count);
220};
221
222struct pc_orangefs_attribute {
223 struct attribute attr;
224 ssize_t (*show)(struct pc_orangefs_obj *pc_orangefs_obj,
225 struct pc_orangefs_attribute *attr,
226 char *buf);
227 ssize_t (*store)(struct pc_orangefs_obj *pc_orangefs_obj,
228 struct pc_orangefs_attribute *attr,
229 const char *buf,
230 size_t count);
231};
232
233struct stats_orangefs_attribute {
234 struct attribute attr;
235 ssize_t (*show)(struct stats_orangefs_obj *stats_orangefs_obj,
236 struct stats_orangefs_attribute *attr,
237 char *buf);
238 ssize_t (*store)(struct stats_orangefs_obj *stats_orangefs_obj,
239 struct stats_orangefs_attribute *attr,
240 const char *buf,
241 size_t count);
242};
243
244static ssize_t orangefs_attr_show(struct kobject *kobj,
245 struct attribute *attr,
246 char *buf)
247{
248 struct orangefs_attribute *attribute;
249 struct orangefs_obj *orangefs_obj;
250 int rc;
251
252 attribute = container_of(attr, struct orangefs_attribute, attr);
253 orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
254
255 if (!attribute->show) {
256 rc = -EIO;
257 goto out;
258 }
259
260 rc = attribute->show(orangefs_obj, attribute, buf);
261
262out:
263 return rc;
264}
265
266static ssize_t orangefs_attr_store(struct kobject *kobj,
267 struct attribute *attr,
268 const char *buf,
269 size_t len)
270{
271 struct orangefs_attribute *attribute;
272 struct orangefs_obj *orangefs_obj;
273 int rc;
274
275 gossip_debug(GOSSIP_SYSFS_DEBUG,
276 "orangefs_attr_store: start\n");
277
278 attribute = container_of(attr, struct orangefs_attribute, attr);
279 orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
280
281 if (!attribute->store) {
282 rc = -EIO;
283 goto out;
284 }
285
286 rc = attribute->store(orangefs_obj, attribute, buf, len);
287
288out:
289 return rc;
290}
291
292static const struct sysfs_ops orangefs_sysfs_ops = {
293 .show = orangefs_attr_show,
294 .store = orangefs_attr_store,
295};
296
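/*
 * Every kobject type below repeats the dispatch pattern shown above:
 * sysfs hands us the raw kobject and attribute, container_of() recovers
 * the wrapping orangefs-specific object and attribute structs, and the
 * call is forwarded to the attribute's own show/store method.
 */
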
297static ssize_t acache_orangefs_attr_show(struct kobject *kobj,
298 struct attribute *attr,
299 char *buf)
300{
301 struct acache_orangefs_attribute *attribute;
302 struct acache_orangefs_obj *acache_orangefs_obj;
303 int rc;
304
305 attribute = container_of(attr, struct acache_orangefs_attribute, attr);
306 acache_orangefs_obj =
307 container_of(kobj, struct acache_orangefs_obj, kobj);
308
309 if (!attribute->show) {
310 rc = -EIO;
311 goto out;
312 }
313
314 rc = attribute->show(acache_orangefs_obj, attribute, buf);
315
316out:
317 return rc;
318}
319
320static ssize_t acache_orangefs_attr_store(struct kobject *kobj,
321 struct attribute *attr,
322 const char *buf,
323 size_t len)
324{
325 struct acache_orangefs_attribute *attribute;
326 struct acache_orangefs_obj *acache_orangefs_obj;
327 int rc;
328
329 gossip_debug(GOSSIP_SYSFS_DEBUG,
330 "acache_orangefs_attr_store: start\n");
331
332 attribute = container_of(attr, struct acache_orangefs_attribute, attr);
333 acache_orangefs_obj =
334 container_of(kobj, struct acache_orangefs_obj, kobj);
335
336 if (!attribute->store) {
337 rc = -EIO;
338 goto out;
339 }
340
341 rc = attribute->store(acache_orangefs_obj, attribute, buf, len);
342
343out:
344 return rc;
345}
346
347static const struct sysfs_ops acache_orangefs_sysfs_ops = {
348 .show = acache_orangefs_attr_show,
349 .store = acache_orangefs_attr_store,
350};
351
352static ssize_t capcache_orangefs_attr_show(struct kobject *kobj,
353 struct attribute *attr,
354 char *buf)
355{
356 struct capcache_orangefs_attribute *attribute;
357 struct capcache_orangefs_obj *capcache_orangefs_obj;
358 int rc;
359
360 attribute =
361 container_of(attr, struct capcache_orangefs_attribute, attr);
362 capcache_orangefs_obj =
363 container_of(kobj, struct capcache_orangefs_obj, kobj);
364
365 if (!attribute->show) {
366 rc = -EIO;
367 goto out;
368 }
369
370 rc = attribute->show(capcache_orangefs_obj, attribute, buf);
371
372out:
373 return rc;
374}
375
376static ssize_t capcache_orangefs_attr_store(struct kobject *kobj,
377 struct attribute *attr,
378 const char *buf,
379 size_t len)
380{
381 struct capcache_orangefs_attribute *attribute;
382 struct capcache_orangefs_obj *capcache_orangefs_obj;
383 int rc;
384
385 gossip_debug(GOSSIP_SYSFS_DEBUG,
386 "capcache_orangefs_attr_store: start\n");
387
388 attribute =
389 container_of(attr, struct capcache_orangefs_attribute, attr);
390 capcache_orangefs_obj =
391 container_of(kobj, struct capcache_orangefs_obj, kobj);
392
393 if (!attribute->store) {
394 rc = -EIO;
395 goto out;
396 }
397
398 rc = attribute->store(capcache_orangefs_obj, attribute, buf, len);
399
400out:
401 return rc;
402}
403
404static const struct sysfs_ops capcache_orangefs_sysfs_ops = {
405 .show = capcache_orangefs_attr_show,
406 .store = capcache_orangefs_attr_store,
407};
408
409static ssize_t ccache_orangefs_attr_show(struct kobject *kobj,
410 struct attribute *attr,
411 char *buf)
412{
413 struct ccache_orangefs_attribute *attribute;
414 struct ccache_orangefs_obj *ccache_orangefs_obj;
415 int rc;
416
417 attribute =
418 container_of(attr, struct ccache_orangefs_attribute, attr);
419 ccache_orangefs_obj =
420 container_of(kobj, struct ccache_orangefs_obj, kobj);
421
422 if (!attribute->show) {
423 rc = -EIO;
424 goto out;
425 }
426
427 rc = attribute->show(ccache_orangefs_obj, attribute, buf);
428
429out:
430 return rc;
431}
432
433static ssize_t ccache_orangefs_attr_store(struct kobject *kobj,
434 struct attribute *attr,
435 const char *buf,
436 size_t len)
437{
438 struct ccache_orangefs_attribute *attribute;
439 struct ccache_orangefs_obj *ccache_orangefs_obj;
440 int rc;
441
442 gossip_debug(GOSSIP_SYSFS_DEBUG,
443 "ccache_orangefs_attr_store: start\n");
444
445 attribute =
446 container_of(attr, struct ccache_orangefs_attribute, attr);
447 ccache_orangefs_obj =
448 container_of(kobj, struct ccache_orangefs_obj, kobj);
449
450 if (!attribute->store) {
451 rc = -EIO;
452 goto out;
453 }
454
455 rc = attribute->store(ccache_orangefs_obj, attribute, buf, len);
456
457out:
458 return rc;
459}
460
461static const struct sysfs_ops ccache_orangefs_sysfs_ops = {
462 .show = ccache_orangefs_attr_show,
463 .store = ccache_orangefs_attr_store,
464};
465
466static ssize_t ncache_orangefs_attr_show(struct kobject *kobj,
467 struct attribute *attr,
468 char *buf)
469{
470 struct ncache_orangefs_attribute *attribute;
471 struct ncache_orangefs_obj *ncache_orangefs_obj;
472 int rc;
473
474 attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
475 ncache_orangefs_obj =
476 container_of(kobj, struct ncache_orangefs_obj, kobj);
477
478 if (!attribute->show) {
479 rc = -EIO;
480 goto out;
481 }
482
483 rc = attribute->show(ncache_orangefs_obj, attribute, buf);
484
485out:
486 return rc;
487}
488
489static ssize_t ncache_orangefs_attr_store(struct kobject *kobj,
490 struct attribute *attr,
491 const char *buf,
492 size_t len)
493{
494 struct ncache_orangefs_attribute *attribute;
495 struct ncache_orangefs_obj *ncache_orangefs_obj;
496 int rc;
497
498 gossip_debug(GOSSIP_SYSFS_DEBUG,
499 "ncache_orangefs_attr_store: start\n");
500
501 attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
502 ncache_orangefs_obj =
503 container_of(kobj, struct ncache_orangefs_obj, kobj);
504
505 if (!attribute->store) {
506 rc = -EIO;
507 goto out;
508 }
509
510 rc = attribute->store(ncache_orangefs_obj, attribute, buf, len);
511
512out:
513 return rc;
514}
515
516static const struct sysfs_ops ncache_orangefs_sysfs_ops = {
517 .show = ncache_orangefs_attr_show,
518 .store = ncache_orangefs_attr_store,
519};
520
521static ssize_t pc_orangefs_attr_show(struct kobject *kobj,
522 struct attribute *attr,
523 char *buf)
524{
525 struct pc_orangefs_attribute *attribute;
526 struct pc_orangefs_obj *pc_orangefs_obj;
527 int rc;
528
529 attribute = container_of(attr, struct pc_orangefs_attribute, attr);
530 pc_orangefs_obj =
531 container_of(kobj, struct pc_orangefs_obj, kobj);
532
533 if (!attribute->show) {
534 rc = -EIO;
535 goto out;
536 }
537
538 rc = attribute->show(pc_orangefs_obj, attribute, buf);
539
540out:
541 return rc;
542}
543
544static const struct sysfs_ops pc_orangefs_sysfs_ops = {
545 .show = pc_orangefs_attr_show,
546};
547
548static ssize_t stats_orangefs_attr_show(struct kobject *kobj,
549 struct attribute *attr,
550 char *buf)
551{
552 struct stats_orangefs_attribute *attribute;
553 struct stats_orangefs_obj *stats_orangefs_obj;
554 int rc;
555
556 attribute = container_of(attr, struct stats_orangefs_attribute, attr);
557 stats_orangefs_obj =
558 container_of(kobj, struct stats_orangefs_obj, kobj);
559
560 if (!attribute->show) {
561 rc = -EIO;
562 goto out;
563 }
564
565 rc = attribute->show(stats_orangefs_obj, attribute, buf);
566
567out:
568 return rc;
569}
570
571static const struct sysfs_ops stats_orangefs_sysfs_ops = {
572 .show = stats_orangefs_attr_show,
573};
574
575static void orangefs_release(struct kobject *kobj)
576{
577 struct orangefs_obj *orangefs_obj;
578
579 orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
580 kfree(orangefs_obj);
581}
582
583static void acache_orangefs_release(struct kobject *kobj)
584{
585 struct acache_orangefs_obj *acache_orangefs_obj;
586
587 acache_orangefs_obj =
588 container_of(kobj, struct acache_orangefs_obj, kobj);
589 kfree(acache_orangefs_obj);
590}
591
592static void capcache_orangefs_release(struct kobject *kobj)
593{
594 struct capcache_orangefs_obj *capcache_orangefs_obj;
595
596 capcache_orangefs_obj =
597 container_of(kobj, struct capcache_orangefs_obj, kobj);
598 kfree(capcache_orangefs_obj);
599}
600
601static void ccache_orangefs_release(struct kobject *kobj)
602{
603 struct ccache_orangefs_obj *ccache_orangefs_obj;
604
605 ccache_orangefs_obj =
606 container_of(kobj, struct ccache_orangefs_obj, kobj);
607 kfree(ccache_orangefs_obj);
608}
609
610static void ncache_orangefs_release(struct kobject *kobj)
611{
612 struct ncache_orangefs_obj *ncache_orangefs_obj;
613
614 ncache_orangefs_obj =
615 container_of(kobj, struct ncache_orangefs_obj, kobj);
616 kfree(ncache_orangefs_obj);
617}
618
619static void pc_orangefs_release(struct kobject *kobj)
620{
621 struct pc_orangefs_obj *pc_orangefs_obj;
622
623 pc_orangefs_obj =
624 container_of(kobj, struct pc_orangefs_obj, kobj);
625 kfree(pc_orangefs_obj);
626}
627
628static void stats_orangefs_release(struct kobject *kobj)
629{
630 struct stats_orangefs_obj *stats_orangefs_obj;
631
632 stats_orangefs_obj =
633 container_of(kobj, struct stats_orangefs_obj, kobj);
634 kfree(stats_orangefs_obj);
635}
636
637static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
638{
639 int rc = -EIO;
640 struct orangefs_attribute *orangefs_attr;
641 struct stats_orangefs_attribute *stats_orangefs_attr;
642
643 gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj_id);
644
645 if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
646 orangefs_attr = (struct orangefs_attribute *)attr;
647
648 if (!strcmp(orangefs_attr->attr.name, "op_timeout_secs")) {
649 rc = scnprintf(buf,
650 PAGE_SIZE,
651 "%d\n",
652 op_timeout_secs);
653 goto out;
654 } else if (!strcmp(orangefs_attr->attr.name,
655 "slot_timeout_secs")) {
656 rc = scnprintf(buf,
657 PAGE_SIZE,
658 "%d\n",
659 slot_timeout_secs);
660 goto out;
661 } else {
662 goto out;
663 }
664
665 } else if (!strcmp(kobj_id, STATS_KOBJ_ID)) {
666 stats_orangefs_attr = (struct stats_orangefs_attribute *)attr;
667
668 if (!strcmp(stats_orangefs_attr->attr.name, "reads")) {
669 rc = scnprintf(buf,
670 PAGE_SIZE,
671 "%lu\n",
672 g_orangefs_stats.reads);
673 goto out;
674 } else if (!strcmp(stats_orangefs_attr->attr.name, "writes")) {
675 rc = scnprintf(buf,
676 PAGE_SIZE,
677 "%lu\n",
678 g_orangefs_stats.writes);
679 goto out;
680 } else {
681 goto out;
682 }
683 }
684
685out:
686
687 return rc;
688}
689
690static ssize_t int_orangefs_show(struct orangefs_obj *orangefs_obj,
691 struct orangefs_attribute *attr,
692 char *buf)
693{
694 int rc;
695
696 gossip_debug(GOSSIP_SYSFS_DEBUG,
697 "int_orangefs_show:start attr->attr.name:%s:\n",
698 attr->attr.name);
699
700 rc = sysfs_int_show(ORANGEFS_KOBJ_ID, buf, (void *) attr);
701
702 return rc;
703}
704
705static ssize_t int_stats_show(struct stats_orangefs_obj *stats_orangefs_obj,
706 struct stats_orangefs_attribute *attr,
707 char *buf)
708{
709 int rc;
710
711 gossip_debug(GOSSIP_SYSFS_DEBUG,
712 "int_stats_show:start attr->attr.name:%s:\n",
713 attr->attr.name);
714
715 rc = sysfs_int_show(STATS_KOBJ_ID, buf, (void *) attr);
716
717 return rc;
718}
719
720static ssize_t int_store(struct orangefs_obj *orangefs_obj,
721 struct orangefs_attribute *attr,
722 const char *buf,
723 size_t count)
724{
725 int rc = 0;
726
727 gossip_debug(GOSSIP_SYSFS_DEBUG,
728 "int_store: start attr->attr.name:%s: buf:%s:\n",
729 attr->attr.name, buf);
730
731 if (!strcmp(attr->attr.name, "op_timeout_secs")) {
732 rc = kstrtoint(buf, 0, &op_timeout_secs);
733 goto out;
734 } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
735 rc = kstrtoint(buf, 0, &slot_timeout_secs);
736 goto out;
737 } else {
738 goto out;
739 }
740
741out:
742 if (rc)
743 rc = -EINVAL;
744 else
745 rc = count;
746
747 return rc;
748}
749
750/*
751 * obtain attribute values from userspace with a service operation.
752 */
753static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
754{
755 struct orangefs_kernel_op_s *new_op = NULL;
756 int rc = 0;
757 char *ser_op_type = NULL;
758 struct orangefs_attribute *orangefs_attr;
759 struct acache_orangefs_attribute *acache_attr;
760 struct capcache_orangefs_attribute *capcache_attr;
761 struct ccache_orangefs_attribute *ccache_attr;
762 struct ncache_orangefs_attribute *ncache_attr;
763 struct pc_orangefs_attribute *pc_attr;
764 __u32 op_alloc_type;
765
766 gossip_debug(GOSSIP_SYSFS_DEBUG,
767 "sysfs_service_op_show: id:%s:\n",
768 kobj_id);
769
770 if (strcmp(kobj_id, PC_KOBJ_ID))
771 op_alloc_type = ORANGEFS_VFS_OP_PARAM;
772 else
773 op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT;
774
775 new_op = op_alloc(op_alloc_type);
776 if (!new_op)
777 return -ENOMEM;
778
779 /* Can't do a service_operation if the client is not running... */
780 rc = is_daemon_in_service();
781 if (rc) {
782 pr_info("%s: Client not running :%d:\n",
783 __func__,
784 is_daemon_in_service());
785 goto out;
786 }
787
788 if (strcmp(kobj_id, PC_KOBJ_ID))
789 new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET;
790
791 if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
792 orangefs_attr = (struct orangefs_attribute *)attr;
793
794 if (!strcmp(orangefs_attr->attr.name, "perf_history_size"))
795 new_op->upcall.req.param.op =
796 ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
797 else if (!strcmp(orangefs_attr->attr.name,
798 "perf_time_interval_secs"))
799 new_op->upcall.req.param.op =
800 ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
801 else if (!strcmp(orangefs_attr->attr.name,
802 "perf_counter_reset"))
803 new_op->upcall.req.param.op =
804 ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
805
806 } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
807 acache_attr = (struct acache_orangefs_attribute *)attr;
808
809 if (!strcmp(acache_attr->attr.name, "timeout_msecs"))
810 new_op->upcall.req.param.op =
811 ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
812
813 if (!strcmp(acache_attr->attr.name, "hard_limit"))
814 new_op->upcall.req.param.op =
815 ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
816
817 if (!strcmp(acache_attr->attr.name, "soft_limit"))
818 new_op->upcall.req.param.op =
819 ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
820
821 if (!strcmp(acache_attr->attr.name, "reclaim_percentage"))
822 new_op->upcall.req.param.op =
823 ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
824
825 } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
826 capcache_attr = (struct capcache_orangefs_attribute *)attr;
827
828 if (!strcmp(capcache_attr->attr.name, "timeout_secs"))
829 new_op->upcall.req.param.op =
830 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
831
832 if (!strcmp(capcache_attr->attr.name, "hard_limit"))
833 new_op->upcall.req.param.op =
834 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
835
836 if (!strcmp(capcache_attr->attr.name, "soft_limit"))
837 new_op->upcall.req.param.op =
838 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
839
840 if (!strcmp(capcache_attr->attr.name, "reclaim_percentage"))
841 new_op->upcall.req.param.op =
842 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
843
844 } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
845 ccache_attr = (struct ccache_orangefs_attribute *)attr;
846
847 if (!strcmp(ccache_attr->attr.name, "timeout_secs"))
848 new_op->upcall.req.param.op =
849 ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
850
851 if (!strcmp(ccache_attr->attr.name, "hard_limit"))
852 new_op->upcall.req.param.op =
853 ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
854
855 if (!strcmp(ccache_attr->attr.name, "soft_limit"))
856 new_op->upcall.req.param.op =
857 ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
858
859 if (!strcmp(ccache_attr->attr.name, "reclaim_percentage"))
860 new_op->upcall.req.param.op =
861 ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
862
863 } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
864 ncache_attr = (struct ncache_orangefs_attribute *)attr;
865
866 if (!strcmp(ncache_attr->attr.name, "timeout_msecs"))
867 new_op->upcall.req.param.op =
868 ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
869
870 if (!strcmp(ncache_attr->attr.name, "hard_limit"))
871 new_op->upcall.req.param.op =
872 ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
873
874 if (!strcmp(ncache_attr->attr.name, "soft_limit"))
875 new_op->upcall.req.param.op =
876 ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
877
878 if (!strcmp(ncache_attr->attr.name, "reclaim_percentage"))
879 new_op->upcall.req.param.op =
880 ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
881
882 } else if (!strcmp(kobj_id, PC_KOBJ_ID)) {
883 pc_attr = (struct pc_orangefs_attribute *)attr;
884
885 if (!strcmp(pc_attr->attr.name, ACACHE_KOBJ_ID))
886 new_op->upcall.req.perf_count.type =
887 ORANGEFS_PERF_COUNT_REQUEST_ACACHE;
888
889 if (!strcmp(pc_attr->attr.name, CAPCACHE_KOBJ_ID))
890 new_op->upcall.req.perf_count.type =
891 ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE;
892
893 if (!strcmp(pc_attr->attr.name, NCACHE_KOBJ_ID))
894 new_op->upcall.req.perf_count.type =
895 ORANGEFS_PERF_COUNT_REQUEST_NCACHE;
896
897 } else {
898 gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n",
899 kobj_id);
900 rc = -EINVAL;
901 goto out;
902 }
903
904
905 if (strcmp(kobj_id, PC_KOBJ_ID))
906 ser_op_type = "orangefs_param";
907 else
908 ser_op_type = "orangefs_perf_count";
909
910 /*
911 * The service_operation will return an errno return code on
912 * error, and zero on success.
913 */
914 rc = service_operation(new_op, ser_op_type, ORANGEFS_OP_INTERRUPTIBLE);
915
916out:
917 if (!rc) {
918 if (strcmp(kobj_id, PC_KOBJ_ID)) {
919 rc = scnprintf(buf,
920 PAGE_SIZE,
921 "%d\n",
922 (int)new_op->downcall.resp.param.value);
923 } else {
924 rc = scnprintf(
925 buf,
926 PAGE_SIZE,
927 "%s",
928 new_op->downcall.resp.perf_count.buffer);
929 }
930 }
931
932 op_release(new_op);
933
934 return rc;
935
936}
937
938static ssize_t service_orangefs_show(struct orangefs_obj *orangefs_obj,
939 struct orangefs_attribute *attr,
940 char *buf)
941{
942 int rc = 0;
943
944 rc = sysfs_service_op_show(ORANGEFS_KOBJ_ID, buf, (void *)attr);
945
946 return rc;
947}
948
949static ssize_t
950 service_acache_show(struct acache_orangefs_obj *acache_orangefs_obj,
951 struct acache_orangefs_attribute *attr,
952 char *buf)
953{
954 int rc = 0;
955
956 rc = sysfs_service_op_show(ACACHE_KOBJ_ID, buf, (void *)attr);
957
958 return rc;
959}
960
961static ssize_t service_capcache_show(struct capcache_orangefs_obj
962 *capcache_orangefs_obj,
963 struct capcache_orangefs_attribute *attr,
964 char *buf)
965{
966 int rc = 0;
967
968 rc = sysfs_service_op_show(CAPCACHE_KOBJ_ID, buf, (void *)attr);
969
970 return rc;
971}
972
973static ssize_t service_ccache_show(struct ccache_orangefs_obj
974 *ccache_orangefs_obj,
975 struct ccache_orangefs_attribute *attr,
976 char *buf)
977{
978 int rc = 0;
979
980 rc = sysfs_service_op_show(CCACHE_KOBJ_ID, buf, (void *)attr);
981
982 return rc;
983}
984
985static ssize_t
986 service_ncache_show(struct ncache_orangefs_obj *ncache_orangefs_obj,
987 struct ncache_orangefs_attribute *attr,
988 char *buf)
989{
990 int rc = 0;
991
992 rc = sysfs_service_op_show(NCACHE_KOBJ_ID, buf, (void *)attr);
993
994 return rc;
995}
996
997static ssize_t
998 service_pc_show(struct pc_orangefs_obj *pc_orangefs_obj,
999 struct pc_orangefs_attribute *attr,
1000 char *buf)
1001{
1002 int rc = 0;
1003
1004 rc = sysfs_service_op_show(PC_KOBJ_ID, buf, (void *)attr);
1005
1006 return rc;
1007}
1008
1009/*
1010 * pass attribute values back to userspace with a service operation.
1011 *
 1012 * We have to do a memory allocation, a kstrtoint and a service operation.
1013 * And we have to evaluate what the user entered, to make sure the
1014 * value is within the range supported by the attribute. So, there's
1015 * a lot of return code checking and mapping going on here.
1016 *
1017 * We want to return 1 if we think everything went OK, and
 1018 * -EINVAL if not.
1019 */
1020static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
1021{
1022 struct orangefs_kernel_op_s *new_op = NULL;
1023 int val = 0;
1024 int rc = 0;
1025 struct orangefs_attribute *orangefs_attr;
1026 struct acache_orangefs_attribute *acache_attr;
1027 struct capcache_orangefs_attribute *capcache_attr;
1028 struct ccache_orangefs_attribute *ccache_attr;
1029 struct ncache_orangefs_attribute *ncache_attr;
1030
1031 gossip_debug(GOSSIP_SYSFS_DEBUG,
1032 "sysfs_service_op_store: id:%s:\n",
1033 kobj_id);
1034
1035 new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
1036 if (!new_op)
1037 return -EINVAL; /* sic */
1038
1039 /* Can't do a service_operation if the client is not running... */
1040 rc = is_daemon_in_service();
1041 if (rc) {
1042 pr_info("%s: Client not running :%d:\n",
1043 __func__,
1044 is_daemon_in_service());
1045 goto out;
1046 }
1047
1048 /*
 1049 * The value we want to send to the client-core is in buf.
1050 */
1051 rc = kstrtoint(buf, 0, &val);
1052 if (rc)
1053 goto out;
1054
1055 if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
1056 orangefs_attr = (struct orangefs_attribute *)attr;
1057
1058 if (!strcmp(orangefs_attr->attr.name, "perf_history_size")) {
1059 if (val > 0) {
1060 new_op->upcall.req.param.op =
1061 ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
1062 } else {
1063 rc = 0;
1064 goto out;
1065 }
1066 } else if (!strcmp(orangefs_attr->attr.name,
1067 "perf_time_interval_secs")) {
1068 if (val > 0) {
1069 new_op->upcall.req.param.op =
1070 ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
1071 } else {
1072 rc = 0;
1073 goto out;
1074 }
1075 } else if (!strcmp(orangefs_attr->attr.name,
1076 "perf_counter_reset")) {
1077 if ((val == 0) || (val == 1)) {
1078 new_op->upcall.req.param.op =
1079 ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
1080 } else {
1081 rc = 0;
1082 goto out;
1083 }
1084 }
1085
1086 } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
1087 acache_attr = (struct acache_orangefs_attribute *)attr;
1088
1089 if (!strcmp(acache_attr->attr.name, "hard_limit")) {
1090 if (val > -1) {
1091 new_op->upcall.req.param.op =
1092 ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
1093 } else {
1094 rc = 0;
1095 goto out;
1096 }
1097 } else if (!strcmp(acache_attr->attr.name, "soft_limit")) {
1098 if (val > -1) {
1099 new_op->upcall.req.param.op =
1100 ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
1101 } else {
1102 rc = 0;
1103 goto out;
1104 }
1105 } else if (!strcmp(acache_attr->attr.name,
1106 "reclaim_percentage")) {
1107 if ((val > -1) && (val < 101)) {
1108 new_op->upcall.req.param.op =
1109 ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
1110 } else {
1111 rc = 0;
1112 goto out;
1113 }
1114 } else if (!strcmp(acache_attr->attr.name, "timeout_msecs")) {
1115 if (val > -1) {
1116 new_op->upcall.req.param.op =
1117 ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
1118 } else {
1119 rc = 0;
1120 goto out;
1121 }
1122 }
1123
1124 } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
1125 capcache_attr = (struct capcache_orangefs_attribute *)attr;
1126
1127 if (!strcmp(capcache_attr->attr.name, "hard_limit")) {
1128 if (val > -1) {
1129 new_op->upcall.req.param.op =
1130 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
1131 } else {
1132 rc = 0;
1133 goto out;
1134 }
1135 } else if (!strcmp(capcache_attr->attr.name, "soft_limit")) {
1136 if (val > -1) {
1137 new_op->upcall.req.param.op =
1138 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
1139 } else {
1140 rc = 0;
1141 goto out;
1142 }
1143 } else if (!strcmp(capcache_attr->attr.name,
1144 "reclaim_percentage")) {
1145 if ((val > -1) && (val < 101)) {
1146 new_op->upcall.req.param.op =
1147 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
1148 } else {
1149 rc = 0;
1150 goto out;
1151 }
1152 } else if (!strcmp(capcache_attr->attr.name, "timeout_secs")) {
1153 if (val > -1) {
1154 new_op->upcall.req.param.op =
1155 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
1156 } else {
1157 rc = 0;
1158 goto out;
1159 }
1160 }
1161
1162 } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
1163 ccache_attr = (struct ccache_orangefs_attribute *)attr;
1164
1165 if (!strcmp(ccache_attr->attr.name, "hard_limit")) {
1166 if (val > -1) {
1167 new_op->upcall.req.param.op =
1168 ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
1169 } else {
1170 rc = 0;
1171 goto out;
1172 }
1173 } else if (!strcmp(ccache_attr->attr.name, "soft_limit")) {
1174 if (val > -1) {
1175 new_op->upcall.req.param.op =
1176 ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
1177 } else {
1178 rc = 0;
1179 goto out;
1180 }
1181 } else if (!strcmp(ccache_attr->attr.name,
1182 "reclaim_percentage")) {
1183 if ((val > -1) && (val < 101)) {
1184 new_op->upcall.req.param.op =
1185 ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
1186 } else {
1187 rc = 0;
1188 goto out;
1189 }
1190 } else if (!strcmp(ccache_attr->attr.name, "timeout_secs")) {
1191 if (val > -1) {
1192 new_op->upcall.req.param.op =
1193 ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
1194 } else {
1195 rc = 0;
1196 goto out;
1197 }
1198 }
1199
1200 } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
1201 ncache_attr = (struct ncache_orangefs_attribute *)attr;
1202
1203 if (!strcmp(ncache_attr->attr.name, "hard_limit")) {
1204 if (val > -1) {
1205 new_op->upcall.req.param.op =
1206 ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
1207 } else {
1208 rc = 0;
1209 goto out;
1210 }
1211 } else if (!strcmp(ncache_attr->attr.name, "soft_limit")) {
1212 if (val > -1) {
1213 new_op->upcall.req.param.op =
1214 ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
1215 } else {
1216 rc = 0;
1217 goto out;
1218 }
1219 } else if (!strcmp(ncache_attr->attr.name,
1220 "reclaim_percentage")) {
1221 if ((val > -1) && (val < 101)) {
1222 new_op->upcall.req.param.op =
1223 ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
1224 } else {
1225 rc = 0;
1226 goto out;
1227 }
1228 } else if (!strcmp(ncache_attr->attr.name, "timeout_msecs")) {
1229 if (val > -1) {
1230 new_op->upcall.req.param.op =
1231 ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
1232 } else {
1233 rc = 0;
1234 goto out;
1235 }
1236 }
1237
1238 } else {
1239 gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n",
1240 kobj_id);
1241 rc = -EINVAL;
1242 goto out;
1243 }
1244
1245 new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
1246
1247 new_op->upcall.req.param.value = val;
1248
1249 /*
 1250 * The service_operation will return an errno return code on
1251 * error, and zero on success.
1252 */
1253 rc = service_operation(new_op, "orangefs_param", ORANGEFS_OP_INTERRUPTIBLE);
1254
1255 if (rc < 0) {
1256 gossip_err("sysfs_service_op_store: service op returned:%d:\n",
1257 rc);
1258 rc = 0;
1259 } else {
1260 rc = 1;
1261 }
1262
1263out:
1264 op_release(new_op);
1265
1266 if (rc == -ENOMEM || rc == 0)
1267 rc = -EINVAL;
1268
1269 return rc;
1270}
1271
1272static ssize_t
1273 service_orangefs_store(struct orangefs_obj *orangefs_obj,
1274 struct orangefs_attribute *attr,
1275 const char *buf,
1276 size_t count)
1277{
1278 int rc = 0;
1279
1280 rc = sysfs_service_op_store(ORANGEFS_KOBJ_ID, buf, (void *) attr);
1281
1282 /* rc should have an errno value if the service_op went bad. */
1283 if (rc == 1)
1284 rc = count;
1285
1286 return rc;
1287}
1288
1289static ssize_t
1290 service_acache_store(struct acache_orangefs_obj *acache_orangefs_obj,
1291 struct acache_orangefs_attribute *attr,
1292 const char *buf,
1293 size_t count)
1294{
1295 int rc = 0;
1296
1297 rc = sysfs_service_op_store(ACACHE_KOBJ_ID, buf, (void *) attr);
1298
1299 /* rc should have an errno value if the service_op went bad. */
1300 if (rc == 1)
1301 rc = count;
1302
1303 return rc;
1304}
1305
1306static ssize_t
1307 service_capcache_store(struct capcache_orangefs_obj
1308 *capcache_orangefs_obj,
1309 struct capcache_orangefs_attribute *attr,
1310 const char *buf,
1311 size_t count)
1312{
1313 int rc = 0;
1314
1315 rc = sysfs_service_op_store(CAPCACHE_KOBJ_ID, buf, (void *) attr);
1316
1317 /* rc should have an errno value if the service_op went bad. */
1318 if (rc == 1)
1319 rc = count;
1320
1321 return rc;
1322}
1323
1324static ssize_t service_ccache_store(struct ccache_orangefs_obj
1325 *ccache_orangefs_obj,
1326 struct ccache_orangefs_attribute *attr,
1327 const char *buf,
1328 size_t count)
1329{
1330 int rc = 0;
1331
1332 rc = sysfs_service_op_store(CCACHE_KOBJ_ID, buf, (void *) attr);
1333
1334 /* rc should have an errno value if the service_op went bad. */
1335 if (rc == 1)
1336 rc = count;
1337
1338 return rc;
1339}
1340
1341static ssize_t
1342 service_ncache_store(struct ncache_orangefs_obj *ncache_orangefs_obj,
1343 struct ncache_orangefs_attribute *attr,
1344 const char *buf,
1345 size_t count)
1346{
1347 int rc = 0;
1348
1349 rc = sysfs_service_op_store(NCACHE_KOBJ_ID, buf, (void *) attr);
1350
1351 /* rc should have an errno value if the service_op went bad. */
1352 if (rc == 1)
1353 rc = count;
1354
1355 return rc;
1356}
1357
1358static struct orangefs_attribute op_timeout_secs_attribute =
1359 __ATTR(op_timeout_secs, 0664, int_orangefs_show, int_store);
1360
1361static struct orangefs_attribute slot_timeout_secs_attribute =
1362 __ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
1363
1364static struct orangefs_attribute perf_counter_reset_attribute =
1365 __ATTR(perf_counter_reset,
1366 0664,
1367 service_orangefs_show,
1368 service_orangefs_store);
1369
1370static struct orangefs_attribute perf_history_size_attribute =
1371 __ATTR(perf_history_size,
1372 0664,
1373 service_orangefs_show,
1374 service_orangefs_store);
1375
1376static struct orangefs_attribute perf_time_interval_secs_attribute =
1377 __ATTR(perf_time_interval_secs,
1378 0664,
1379 service_orangefs_show,
1380 service_orangefs_store);
1381
1382static struct attribute *orangefs_default_attrs[] = {
1383 &op_timeout_secs_attribute.attr,
1384 &slot_timeout_secs_attribute.attr,
1385 &perf_counter_reset_attribute.attr,
1386 &perf_history_size_attribute.attr,
1387 &perf_time_interval_secs_attribute.attr,
1388 NULL,
1389};
1390
1391static struct kobj_type orangefs_ktype = {
1392 .sysfs_ops = &orangefs_sysfs_ops,
1393 .release = orangefs_release,
1394 .default_attrs = orangefs_default_attrs,
1395};
1396
1397static struct acache_orangefs_attribute acache_hard_limit_attribute =
1398 __ATTR(hard_limit,
1399 0664,
1400 service_acache_show,
1401 service_acache_store);
1402
1403static struct acache_orangefs_attribute acache_reclaim_percent_attribute =
1404 __ATTR(reclaim_percentage,
1405 0664,
1406 service_acache_show,
1407 service_acache_store);
1408
1409static struct acache_orangefs_attribute acache_soft_limit_attribute =
1410 __ATTR(soft_limit,
1411 0664,
1412 service_acache_show,
1413 service_acache_store);
1414
1415static struct acache_orangefs_attribute acache_timeout_msecs_attribute =
1416 __ATTR(timeout_msecs,
1417 0664,
1418 service_acache_show,
1419 service_acache_store);
1420
1421static struct attribute *acache_orangefs_default_attrs[] = {
1422 &acache_hard_limit_attribute.attr,
1423 &acache_reclaim_percent_attribute.attr,
1424 &acache_soft_limit_attribute.attr,
1425 &acache_timeout_msecs_attribute.attr,
1426 NULL,
1427};
1428
1429static struct kobj_type acache_orangefs_ktype = {
1430 .sysfs_ops = &acache_orangefs_sysfs_ops,
1431 .release = acache_orangefs_release,
1432 .default_attrs = acache_orangefs_default_attrs,
1433};
1434
1435static struct capcache_orangefs_attribute capcache_hard_limit_attribute =
1436 __ATTR(hard_limit,
1437 0664,
1438 service_capcache_show,
1439 service_capcache_store);
1440
1441static struct capcache_orangefs_attribute capcache_reclaim_percent_attribute =
1442 __ATTR(reclaim_percentage,
1443 0664,
1444 service_capcache_show,
1445 service_capcache_store);
1446
1447static struct capcache_orangefs_attribute capcache_soft_limit_attribute =
1448 __ATTR(soft_limit,
1449 0664,
1450 service_capcache_show,
1451 service_capcache_store);
1452
1453static struct capcache_orangefs_attribute capcache_timeout_secs_attribute =
1454 __ATTR(timeout_secs,
1455 0664,
1456 service_capcache_show,
1457 service_capcache_store);
1458
1459static struct attribute *capcache_orangefs_default_attrs[] = {
1460 &capcache_hard_limit_attribute.attr,
1461 &capcache_reclaim_percent_attribute.attr,
1462 &capcache_soft_limit_attribute.attr,
1463 &capcache_timeout_secs_attribute.attr,
1464 NULL,
1465};
1466
1467static struct kobj_type capcache_orangefs_ktype = {
1468 .sysfs_ops = &capcache_orangefs_sysfs_ops,
1469 .release = capcache_orangefs_release,
1470 .default_attrs = capcache_orangefs_default_attrs,
1471};
1472
1473static struct ccache_orangefs_attribute ccache_hard_limit_attribute =
1474 __ATTR(hard_limit,
1475 0664,
1476 service_ccache_show,
1477 service_ccache_store);
1478
1479static struct ccache_orangefs_attribute ccache_reclaim_percent_attribute =
1480 __ATTR(reclaim_percentage,
1481 0664,
1482 service_ccache_show,
1483 service_ccache_store);
1484
1485static struct ccache_orangefs_attribute ccache_soft_limit_attribute =
1486 __ATTR(soft_limit,
1487 0664,
1488 service_ccache_show,
1489 service_ccache_store);
1490
1491static struct ccache_orangefs_attribute ccache_timeout_secs_attribute =
1492 __ATTR(timeout_secs,
1493 0664,
1494 service_ccache_show,
1495 service_ccache_store);
1496
1497static struct attribute *ccache_orangefs_default_attrs[] = {
1498 &ccache_hard_limit_attribute.attr,
1499 &ccache_reclaim_percent_attribute.attr,
1500 &ccache_soft_limit_attribute.attr,
1501 &ccache_timeout_secs_attribute.attr,
1502 NULL,
1503};
1504
1505static struct kobj_type ccache_orangefs_ktype = {
1506 .sysfs_ops = &ccache_orangefs_sysfs_ops,
1507 .release = ccache_orangefs_release,
1508 .default_attrs = ccache_orangefs_default_attrs,
1509};
1510
1511static struct ncache_orangefs_attribute ncache_hard_limit_attribute =
1512 __ATTR(hard_limit,
1513 0664,
1514 service_ncache_show,
1515 service_ncache_store);
1516
1517static struct ncache_orangefs_attribute ncache_reclaim_percent_attribute =
1518 __ATTR(reclaim_percentage,
1519 0664,
1520 service_ncache_show,
1521 service_ncache_store);
1522
1523static struct ncache_orangefs_attribute ncache_soft_limit_attribute =
1524 __ATTR(soft_limit,
1525 0664,
1526 service_ncache_show,
1527 service_ncache_store);
1528
1529static struct ncache_orangefs_attribute ncache_timeout_msecs_attribute =
1530 __ATTR(timeout_msecs,
1531 0664,
1532 service_ncache_show,
1533 service_ncache_store);
1534
1535static struct attribute *ncache_orangefs_default_attrs[] = {
1536 &ncache_hard_limit_attribute.attr,
1537 &ncache_reclaim_percent_attribute.attr,
1538 &ncache_soft_limit_attribute.attr,
1539 &ncache_timeout_msecs_attribute.attr,
1540 NULL,
1541};
1542
1543static struct kobj_type ncache_orangefs_ktype = {
1544 .sysfs_ops = &ncache_orangefs_sysfs_ops,
1545 .release = ncache_orangefs_release,
1546 .default_attrs = ncache_orangefs_default_attrs,
1547};
1548
1549static struct pc_orangefs_attribute pc_acache_attribute =
1550 __ATTR(acache,
1551 0664,
1552 service_pc_show,
1553 NULL);
1554
1555static struct pc_orangefs_attribute pc_capcache_attribute =
1556 __ATTR(capcache,
1557 0664,
1558 service_pc_show,
1559 NULL);
1560
1561static struct pc_orangefs_attribute pc_ncache_attribute =
1562 __ATTR(ncache,
1563 0664,
1564 service_pc_show,
1565 NULL);
1566
1567static struct attribute *pc_orangefs_default_attrs[] = {
1568 &pc_acache_attribute.attr,
1569 &pc_capcache_attribute.attr,
1570 &pc_ncache_attribute.attr,
1571 NULL,
1572};
1573
1574static struct kobj_type pc_orangefs_ktype = {
1575 .sysfs_ops = &pc_orangefs_sysfs_ops,
1576 .release = pc_orangefs_release,
1577 .default_attrs = pc_orangefs_default_attrs,
1578};
1579
1580static struct stats_orangefs_attribute stats_reads_attribute =
1581 __ATTR(reads,
1582 0664,
1583 int_stats_show,
1584 NULL);
1585
1586static struct stats_orangefs_attribute stats_writes_attribute =
1587 __ATTR(writes,
1588 0664,
1589 int_stats_show,
1590 NULL);
1591
1592static struct attribute *stats_orangefs_default_attrs[] = {
1593 &stats_reads_attribute.attr,
1594 &stats_writes_attribute.attr,
1595 NULL,
1596};
1597
1598static struct kobj_type stats_orangefs_ktype = {
1599 .sysfs_ops = &stats_orangefs_sysfs_ops,
1600 .release = stats_orangefs_release,
1601 .default_attrs = stats_orangefs_default_attrs,
1602};
1603
1604static struct orangefs_obj *orangefs_obj;
1605static struct acache_orangefs_obj *acache_orangefs_obj;
1606static struct capcache_orangefs_obj *capcache_orangefs_obj;
1607static struct ccache_orangefs_obj *ccache_orangefs_obj;
1608static struct ncache_orangefs_obj *ncache_orangefs_obj;
1609static struct pc_orangefs_obj *pc_orangefs_obj;
1610static struct stats_orangefs_obj *stats_orangefs_obj;
1611
1612int orangefs_sysfs_init(void)
1613{
1614 int rc = -EINVAL;
1615
1616 gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_init: start\n");
1617
1618 /* create /sys/fs/orangefs. */
1619 orangefs_obj = kzalloc(sizeof(*orangefs_obj), GFP_KERNEL);
1620 if (!orangefs_obj)
1621 goto out;
1622
1623 rc = kobject_init_and_add(&orangefs_obj->kobj,
1624 &orangefs_ktype,
1625 fs_kobj,
1626 ORANGEFS_KOBJ_ID);
1627
1628 if (rc)
1629 goto ofs_obj_bail;
1630
1631 kobject_uevent(&orangefs_obj->kobj, KOBJ_ADD);
1632
1633 /* create /sys/fs/orangefs/acache. */
1634 acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL);
1635 if (!acache_orangefs_obj) {
1636 rc = -EINVAL;
1637 goto ofs_obj_bail;
1638 }
1639
1640 rc = kobject_init_and_add(&acache_orangefs_obj->kobj,
1641 &acache_orangefs_ktype,
1642 &orangefs_obj->kobj,
1643 ACACHE_KOBJ_ID);
1644
1645 if (rc)
1646 goto acache_obj_bail;
1647
1648 kobject_uevent(&acache_orangefs_obj->kobj, KOBJ_ADD);
1649
1650 /* create /sys/fs/orangefs/capcache. */
1651 capcache_orangefs_obj =
1652 kzalloc(sizeof(*capcache_orangefs_obj), GFP_KERNEL);
1653 if (!capcache_orangefs_obj) {
1654 rc = -EINVAL;
1655 goto acache_obj_bail;
1656 }
1657
1658 rc = kobject_init_and_add(&capcache_orangefs_obj->kobj,
1659 &capcache_orangefs_ktype,
1660 &orangefs_obj->kobj,
1661 CAPCACHE_KOBJ_ID);
1662 if (rc)
1663 goto capcache_obj_bail;
1664
1665 kobject_uevent(&capcache_orangefs_obj->kobj, KOBJ_ADD);
1666
1667 /* create /sys/fs/orangefs/ccache. */
1668 ccache_orangefs_obj =
1669 kzalloc(sizeof(*ccache_orangefs_obj), GFP_KERNEL);
1670 if (!ccache_orangefs_obj) {
1671 rc = -EINVAL;
1672 goto capcache_obj_bail;
1673 }
1674
1675 rc = kobject_init_and_add(&ccache_orangefs_obj->kobj,
1676 &ccache_orangefs_ktype,
1677 &orangefs_obj->kobj,
1678 CCACHE_KOBJ_ID);
1679 if (rc)
1680 goto ccache_obj_bail;
1681
1682 kobject_uevent(&ccache_orangefs_obj->kobj, KOBJ_ADD);
1683
1684 /* create /sys/fs/orangefs/ncache. */
1685 ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL);
1686 if (!ncache_orangefs_obj) {
1687 rc = -EINVAL;
1688 goto ccache_obj_bail;
1689 }
1690
1691 rc = kobject_init_and_add(&ncache_orangefs_obj->kobj,
1692 &ncache_orangefs_ktype,
1693 &orangefs_obj->kobj,
1694 NCACHE_KOBJ_ID);
1695
1696 if (rc)
1697 goto ncache_obj_bail;
1698
1699 kobject_uevent(&ncache_orangefs_obj->kobj, KOBJ_ADD);
1700
1701 /* create /sys/fs/orangefs/perf_counters. */
1702 pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL);
1703 if (!pc_orangefs_obj) {
1704 rc = -EINVAL;
1705 goto ncache_obj_bail;
1706 }
1707
1708 rc = kobject_init_and_add(&pc_orangefs_obj->kobj,
1709 &pc_orangefs_ktype,
1710 &orangefs_obj->kobj,
1711 "perf_counters");
1712
1713 if (rc)
1714 goto pc_obj_bail;
1715
1716 kobject_uevent(&pc_orangefs_obj->kobj, KOBJ_ADD);
1717
1718 /* create /sys/fs/orangefs/stats. */
1719 stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL);
1720 if (!stats_orangefs_obj) {
1721 rc = -EINVAL;
1722 goto pc_obj_bail;
1723 }
1724
1725 rc = kobject_init_and_add(&stats_orangefs_obj->kobj,
1726 &stats_orangefs_ktype,
1727 &orangefs_obj->kobj,
1728 STATS_KOBJ_ID);
1729
1730 if (rc)
1731 goto stats_obj_bail;
1732
1733 kobject_uevent(&stats_orangefs_obj->kobj, KOBJ_ADD);
1734 goto out;
1735
1736stats_obj_bail:
1737 kobject_put(&stats_orangefs_obj->kobj);
1738
1739pc_obj_bail:
1740 kobject_put(&pc_orangefs_obj->kobj);
1741
1742ncache_obj_bail:
1743 kobject_put(&ncache_orangefs_obj->kobj);
1744
1745ccache_obj_bail:
1746 kobject_put(&ccache_orangefs_obj->kobj);
1747
1748capcache_obj_bail:
1749 kobject_put(&capcache_orangefs_obj->kobj);
1750
1751acache_obj_bail:
1752 kobject_put(&acache_orangefs_obj->kobj);
1753
1754ofs_obj_bail:
1755 kobject_put(&orangefs_obj->kobj);
1756out:
1757 return rc;
1758}
1759
1760void orangefs_sysfs_exit(void)
1761{
1762 gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n");
1763
1764 kobject_put(&acache_orangefs_obj->kobj);
1765 kobject_put(&capcache_orangefs_obj->kobj);
1766 kobject_put(&ccache_orangefs_obj->kobj);
1767 kobject_put(&ncache_orangefs_obj->kobj);
1768 kobject_put(&pc_orangefs_obj->kobj);
1769 kobject_put(&stats_orangefs_obj->kobj);
1770
1771 kobject_put(&orangefs_obj->kobj);
1772}
diff --git a/fs/orangefs/orangefs-sysfs.h b/fs/orangefs/orangefs-sysfs.h
new file mode 100644
index 000000000000..f0b76382db02
--- /dev/null
+++ b/fs/orangefs/orangefs-sysfs.h
@@ -0,0 +1,2 @@
1extern int orangefs_sysfs_init(void);
2extern void orangefs_sysfs_exit(void);
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
new file mode 100644
index 000000000000..40f5163b56aa
--- /dev/null
+++ b/fs/orangefs/orangefs-utils.c
@@ -0,0 +1,1048 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6#include "protocol.h"
7#include "orangefs-kernel.h"
8#include "orangefs-dev-proto.h"
9#include "orangefs-bufmap.h"
10
11__s32 fsid_of_op(struct orangefs_kernel_op_s *op)
12{
13 __s32 fsid = ORANGEFS_FS_ID_NULL;
14
15 if (op) {
16 switch (op->upcall.type) {
17 case ORANGEFS_VFS_OP_FILE_IO:
18 fsid = op->upcall.req.io.refn.fs_id;
19 break;
20 case ORANGEFS_VFS_OP_LOOKUP:
21 fsid = op->upcall.req.lookup.parent_refn.fs_id;
22 break;
23 case ORANGEFS_VFS_OP_CREATE:
24 fsid = op->upcall.req.create.parent_refn.fs_id;
25 break;
26 case ORANGEFS_VFS_OP_GETATTR:
27 fsid = op->upcall.req.getattr.refn.fs_id;
28 break;
29 case ORANGEFS_VFS_OP_REMOVE:
30 fsid = op->upcall.req.remove.parent_refn.fs_id;
31 break;
32 case ORANGEFS_VFS_OP_MKDIR:
33 fsid = op->upcall.req.mkdir.parent_refn.fs_id;
34 break;
35 case ORANGEFS_VFS_OP_READDIR:
36 fsid = op->upcall.req.readdir.refn.fs_id;
37 break;
38 case ORANGEFS_VFS_OP_SETATTR:
39 fsid = op->upcall.req.setattr.refn.fs_id;
40 break;
41 case ORANGEFS_VFS_OP_SYMLINK:
42 fsid = op->upcall.req.sym.parent_refn.fs_id;
43 break;
44 case ORANGEFS_VFS_OP_RENAME:
45 fsid = op->upcall.req.rename.old_parent_refn.fs_id;
46 break;
47 case ORANGEFS_VFS_OP_STATFS:
48 fsid = op->upcall.req.statfs.fs_id;
49 break;
50 case ORANGEFS_VFS_OP_TRUNCATE:
51 fsid = op->upcall.req.truncate.refn.fs_id;
52 break;
53 case ORANGEFS_VFS_OP_MMAP_RA_FLUSH:
54 fsid = op->upcall.req.ra_cache_flush.refn.fs_id;
55 break;
56 case ORANGEFS_VFS_OP_FS_UMOUNT:
57 fsid = op->upcall.req.fs_umount.fs_id;
58 break;
59 case ORANGEFS_VFS_OP_GETXATTR:
60 fsid = op->upcall.req.getxattr.refn.fs_id;
61 break;
62 case ORANGEFS_VFS_OP_SETXATTR:
63 fsid = op->upcall.req.setxattr.refn.fs_id;
64 break;
65 case ORANGEFS_VFS_OP_LISTXATTR:
66 fsid = op->upcall.req.listxattr.refn.fs_id;
67 break;
68 case ORANGEFS_VFS_OP_REMOVEXATTR:
69 fsid = op->upcall.req.removexattr.refn.fs_id;
70 break;
71 case ORANGEFS_VFS_OP_FSYNC:
72 fsid = op->upcall.req.fsync.refn.fs_id;
73 break;
74 default:
75 break;
76 }
77 }
78 return fsid;
79}
80
81static int orangefs_inode_flags(struct ORANGEFS_sys_attr_s *attrs)
82{
83 int flags = 0;
84 if (attrs->flags & ORANGEFS_IMMUTABLE_FL)
85 flags |= S_IMMUTABLE;
86 else
87 flags &= ~S_IMMUTABLE;
88 if (attrs->flags & ORANGEFS_APPEND_FL)
89 flags |= S_APPEND;
90 else
91 flags &= ~S_APPEND;
92 if (attrs->flags & ORANGEFS_NOATIME_FL)
93 flags |= S_NOATIME;
94 else
95 flags &= ~S_NOATIME;
96 return flags;
97}
98
99static int orangefs_inode_perms(struct ORANGEFS_sys_attr_s *attrs)
100{
101 int perm_mode = 0;
102
103 if (attrs->perms & ORANGEFS_O_EXECUTE)
104 perm_mode |= S_IXOTH;
105 if (attrs->perms & ORANGEFS_O_WRITE)
106 perm_mode |= S_IWOTH;
107 if (attrs->perms & ORANGEFS_O_READ)
108 perm_mode |= S_IROTH;
109
110 if (attrs->perms & ORANGEFS_G_EXECUTE)
111 perm_mode |= S_IXGRP;
112 if (attrs->perms & ORANGEFS_G_WRITE)
113 perm_mode |= S_IWGRP;
114 if (attrs->perms & ORANGEFS_G_READ)
115 perm_mode |= S_IRGRP;
116
117 if (attrs->perms & ORANGEFS_U_EXECUTE)
118 perm_mode |= S_IXUSR;
119 if (attrs->perms & ORANGEFS_U_WRITE)
120 perm_mode |= S_IWUSR;
121 if (attrs->perms & ORANGEFS_U_READ)
122 perm_mode |= S_IRUSR;
123
124 if (attrs->perms & ORANGEFS_G_SGID)
125 perm_mode |= S_ISGID;
126 if (attrs->perms & ORANGEFS_U_SUID)
127 perm_mode |= S_ISUID;
128
129 return perm_mode;
130}
131
132/*
133 * NOTE: in kernel land, we never use the sys_attr->link_target for
134 * anything, so don't bother copying it into the sys_attr object here.
135 */
136static inline int copy_attributes_from_inode(struct inode *inode,
137 struct ORANGEFS_sys_attr_s *attrs,
138 struct iattr *iattr)
139{
140 umode_t tmp_mode;
141
142 if (!iattr || !inode || !attrs) {
143 gossip_err("NULL iattr (%p), inode (%p), attrs (%p) "
144 "in copy_attributes_from_inode!\n",
145 iattr,
146 inode,
147 attrs);
148 return -EINVAL;
149 }
150 /*
151 * We need to be careful to only copy the attributes out of the
152 * iattr object that we know are valid.
153 */
154 attrs->mask = 0;
155 if (iattr->ia_valid & ATTR_UID) {
156 attrs->owner = from_kuid(current_user_ns(), iattr->ia_uid);
157 attrs->mask |= ORANGEFS_ATTR_SYS_UID;
158 gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner);
159 }
160 if (iattr->ia_valid & ATTR_GID) {
161 attrs->group = from_kgid(current_user_ns(), iattr->ia_gid);
162 attrs->mask |= ORANGEFS_ATTR_SYS_GID;
163 gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group);
164 }
165
166 if (iattr->ia_valid & ATTR_ATIME) {
167 attrs->mask |= ORANGEFS_ATTR_SYS_ATIME;
168 if (iattr->ia_valid & ATTR_ATIME_SET) {
169 attrs->atime = (time64_t)iattr->ia_atime.tv_sec;
170 attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET;
171 }
172 }
173 if (iattr->ia_valid & ATTR_MTIME) {
174 attrs->mask |= ORANGEFS_ATTR_SYS_MTIME;
175 if (iattr->ia_valid & ATTR_MTIME_SET) {
176 attrs->mtime = (time64_t)iattr->ia_mtime.tv_sec;
177 attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET;
178 }
179 }
180 if (iattr->ia_valid & ATTR_CTIME)
181 attrs->mask |= ORANGEFS_ATTR_SYS_CTIME;
182
183 /*
184 * ORANGEFS cannot set size with a setattr operation. It is unlikely
185 * to be requested through the VFS, but just in case, ignore
186 * ATTR_SIZE.
187 */
188
189 if (iattr->ia_valid & ATTR_MODE) {
190 tmp_mode = iattr->ia_mode;
191 if (tmp_mode & (S_ISVTX)) {
192 if (is_root_handle(inode)) {
193 /*
194 * allow sticky bit to be set on root (since
195 * it shows up that way by default anyhow),
196 * but don't show it to the server
197 */
198 tmp_mode -= S_ISVTX;
199 } else {
200 gossip_debug(GOSSIP_UTILS_DEBUG,
201 "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
202 return -EINVAL;
203 }
204 }
205
206 if (tmp_mode & (S_ISUID)) {
207 gossip_debug(GOSSIP_UTILS_DEBUG,
208 "Attempting to set setuid bit (not supported); returning EINVAL.\n");
209 return -EINVAL;
210 }
211
212 attrs->perms = ORANGEFS_util_translate_mode(tmp_mode);
213 attrs->mask |= ORANGEFS_ATTR_SYS_PERM;
214 }
215
216 return 0;
217}
218
219static int orangefs_inode_type(enum orangefs_ds_type objtype)
220{
221 if (objtype == ORANGEFS_TYPE_METAFILE)
222 return S_IFREG;
223 else if (objtype == ORANGEFS_TYPE_DIRECTORY)
224 return S_IFDIR;
225 else if (objtype == ORANGEFS_TYPE_SYMLINK)
226 return S_IFLNK;
227 else
228 return -1;
229}
230
231static int orangefs_inode_is_stale(struct inode *inode, int new,
232 struct ORANGEFS_sys_attr_s *attrs, char *link_target)
233{
234 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
235 int type = orangefs_inode_type(attrs->objtype);
236 if (!new) {
237 /*
238 * If the inode type or symlink target have changed then this
239 * inode is stale.
240 */
241 if (type == -1 || !(inode->i_mode & type)) {
242 orangefs_make_bad_inode(inode);
243 return 1;
244 }
245 if (type == S_IFLNK && strncmp(orangefs_inode->link_target,
246 link_target, ORANGEFS_NAME_MAX)) {
247 orangefs_make_bad_inode(inode);
248 return 1;
249 }
250 }
251 return 0;
252}
253
254int orangefs_inode_getattr(struct inode *inode, int new, int size)
255{
256 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
257 struct orangefs_kernel_op_s *new_op;
258 loff_t inode_size, rounded_up_size;
259 int ret, type;
260
261 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
262 get_khandle_from_ino(inode));
263
264 new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
265 if (!new_op)
266 return -ENOMEM;
267 new_op->upcall.req.getattr.refn = orangefs_inode->refn;
268 new_op->upcall.req.getattr.mask = size ?
269 ORANGEFS_ATTR_SYS_ALL_NOHINT : ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE;
270
271 ret = service_operation(new_op, __func__,
272 get_interruptible_flag(inode));
273 if (ret != 0)
274 goto out;
275
276 type = orangefs_inode_type(new_op->
277 downcall.resp.getattr.attributes.objtype);
278 ret = orangefs_inode_is_stale(inode, new,
279 &new_op->downcall.resp.getattr.attributes,
280 new_op->downcall.resp.getattr.link_target);
281 if (ret) {
282 ret = -ESTALE;
283 goto out;
284 }
285
286 switch (type) {
287 case S_IFREG:
288 inode->i_flags = orangefs_inode_flags(&new_op->
289 downcall.resp.getattr.attributes);
290 if (size) {
291 inode_size = (loff_t)new_op->
292 downcall.resp.getattr.attributes.size;
293 rounded_up_size =
294 (inode_size + (4096 - (inode_size % 4096)));
295 inode->i_size = inode_size;
296 orangefs_inode->blksize =
297 new_op->downcall.resp.getattr.attributes.blksize;
298 spin_lock(&inode->i_lock);
299 inode->i_bytes = inode_size;
300 inode->i_blocks =
301 (unsigned long)(rounded_up_size / 512);
302 spin_unlock(&inode->i_lock);
303 }
304 break;
305 case S_IFDIR:
306 inode->i_size = PAGE_CACHE_SIZE;
307 orangefs_inode->blksize = (1 << inode->i_blkbits);
308 spin_lock(&inode->i_lock);
309 inode_set_bytes(inode, inode->i_size);
310 spin_unlock(&inode->i_lock);
311 set_nlink(inode, 1);
312 break;
313 case S_IFLNK:
314 if (new) {
315 inode->i_size = (loff_t)strlen(new_op->
316 downcall.resp.getattr.link_target);
317 orangefs_inode->blksize = (1 << inode->i_blkbits);
318 strlcpy(orangefs_inode->link_target,
319 new_op->downcall.resp.getattr.link_target,
320 ORANGEFS_NAME_MAX);
321 inode->i_link = orangefs_inode->link_target;
322 }
323 break;
324 }
325
326 inode->i_uid = make_kuid(&init_user_ns, new_op->
327 downcall.resp.getattr.attributes.owner);
328 inode->i_gid = make_kgid(&init_user_ns, new_op->
329 downcall.resp.getattr.attributes.group);
330 inode->i_atime.tv_sec = (time64_t)new_op->
331 downcall.resp.getattr.attributes.atime;
332 inode->i_mtime.tv_sec = (time64_t)new_op->
333 downcall.resp.getattr.attributes.mtime;
334 inode->i_ctime.tv_sec = (time64_t)new_op->
335 downcall.resp.getattr.attributes.ctime;
336 inode->i_atime.tv_nsec = 0;
337 inode->i_mtime.tv_nsec = 0;
338 inode->i_ctime.tv_nsec = 0;
339
340 /* special case: mark the root inode as sticky */
341 inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
342 orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
343
344 ret = 0;
345out:
346 op_release(new_op);
347 return ret;
348}
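
A quick worked example of the S_IFREG block accounting above (illustrative
numbers only):

/*
 * For inode_size = 5000:
 *   rounded_up_size = 5000 + (4096 - 5000 % 4096) = 8192
 *   i_blocks        = 8192 / 512 = 16   (512-byte sectors)
 * Note the expression always rounds up by at least one byte, so an
 * exact multiple of 4096 gains a full extra 4096 before the division.
 */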
349
350int orangefs_inode_check_changed(struct inode *inode)
351{
352 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
353 struct orangefs_kernel_op_s *new_op;
354 int ret;
355
356 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
357 get_khandle_from_ino(inode));
358
359 new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
360 if (!new_op)
361 return -ENOMEM;
362 new_op->upcall.req.getattr.refn = orangefs_inode->refn;
363 new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_TYPE |
364 ORANGEFS_ATTR_SYS_LNK_TARGET;
365
366 ret = service_operation(new_op, __func__,
367 get_interruptible_flag(inode));
368 if (ret != 0)
369 goto out;
370
371 ret = orangefs_inode_is_stale(inode, 0,
372 &new_op->downcall.resp.getattr.attributes,
373 new_op->downcall.resp.getattr.link_target);
374out:
375 op_release(new_op);
376 return ret;
377}
378
379/*
380 * Issues an orangefs setattr request so that the new attribute values
381 * take effect if successful. Returns 0 on success; -errno otherwise.
382 */
383int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
384{
385 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
386 struct orangefs_kernel_op_s *new_op;
387 int ret;
388
389 new_op = op_alloc(ORANGEFS_VFS_OP_SETATTR);
390 if (!new_op)
391 return -ENOMEM;
392
393 new_op->upcall.req.setattr.refn = orangefs_inode->refn;
394 ret = copy_attributes_from_inode(inode,
395 &new_op->upcall.req.setattr.attributes,
396 iattr);
397 if (ret >= 0) {
398 ret = service_operation(new_op, __func__,
399 get_interruptible_flag(inode));
400
401 gossip_debug(GOSSIP_UTILS_DEBUG,
402 "orangefs_inode_setattr: returning %d\n",
403 ret);
404 }
405
406 op_release(new_op);
407
408 /*
409 * successful setattr should clear the atime, mtime and
410 * ctime flags.
411 */
412 if (ret == 0) {
413 ClearAtimeFlag(orangefs_inode);
414 ClearMtimeFlag(orangefs_inode);
415 ClearCtimeFlag(orangefs_inode);
416 ClearModeFlag(orangefs_inode);
417 }
418
419 return ret;
420}
421
422int orangefs_flush_inode(struct inode *inode)
423{
424 /*
425 * If it is a dirty inode, this function gets called.
426 * Gather all the information that needs to be setattr'ed.
427 * Right now, this is only used for mode, atime, mtime
428 * and/or ctime.
429 */
430 struct iattr wbattr;
431 int ret;
432 int mtime_flag;
433 int ctime_flag;
434 int atime_flag;
435 int mode_flag;
436 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
437
438 memset(&wbattr, 0, sizeof(wbattr));
439
440 /*
441 * check inode flags up front, and clear them if they are set. This
442 * will prevent multiple processes from all trying to flush the same
443 * inode if they call close() simultaneously
444 */
445 mtime_flag = MtimeFlag(orangefs_inode);
446 ClearMtimeFlag(orangefs_inode);
447 ctime_flag = CtimeFlag(orangefs_inode);
448 ClearCtimeFlag(orangefs_inode);
449 atime_flag = AtimeFlag(orangefs_inode);
450 ClearAtimeFlag(orangefs_inode);
451 mode_flag = ModeFlag(orangefs_inode);
452 ClearModeFlag(orangefs_inode);
453
454 /* -- Lazy atime, mtime and ctime update --
455 * Note: all times are dictated by the server in the new scheme,
456 * not by the clients.
457 *
458 * Mode updates are also handled here.
459 */
460
461 if (mtime_flag)
462 wbattr.ia_valid |= ATTR_MTIME;
463 if (ctime_flag)
464 wbattr.ia_valid |= ATTR_CTIME;
465 if (atime_flag)
466 wbattr.ia_valid |= ATTR_ATIME;
467
468 if (mode_flag) {
469 wbattr.ia_mode = inode->i_mode;
470 wbattr.ia_valid |= ATTR_MODE;
471 }
472
473 gossip_debug(GOSSIP_UTILS_DEBUG,
474 "*********** orangefs_flush_inode: %pU "
475 "(ia_valid %d)\n",
476 get_khandle_from_ino(inode),
477 wbattr.ia_valid);
478 if (wbattr.ia_valid == 0) {
479 gossip_debug(GOSSIP_UTILS_DEBUG,
480 "orangefs_flush_inode skipping setattr()\n");
481 return 0;
482 }
483
484 gossip_debug(GOSSIP_UTILS_DEBUG,
485 "orangefs_flush_inode (%pU) writing mode %o\n",
486 get_khandle_from_ino(inode),
487 inode->i_mode);
488
489 ret = orangefs_inode_setattr(inode, &wbattr);
490
491 return ret;
492}
493
494int orangefs_unmount_sb(struct super_block *sb)
495{
496 int ret = -EINVAL;
497 struct orangefs_kernel_op_s *new_op = NULL;
498
499 gossip_debug(GOSSIP_UTILS_DEBUG,
500 "orangefs_unmount_sb called on sb %p\n",
501 sb);
502
503 new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
504 if (!new_op)
505 return -ENOMEM;
506 new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id;
507 new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id;
508 strncpy(new_op->upcall.req.fs_umount.orangefs_config_server,
509 ORANGEFS_SB(sb)->devname,
510 ORANGEFS_MAX_SERVER_ADDR_LEN);
511
512 gossip_debug(GOSSIP_UTILS_DEBUG,
513 "Attempting ORANGEFS Unmount via host %s\n",
514 new_op->upcall.req.fs_umount.orangefs_config_server);
515
516 ret = service_operation(new_op, "orangefs_fs_umount", 0);
517
518 gossip_debug(GOSSIP_UTILS_DEBUG,
519 "orangefs_unmount: got return value of %d\n", ret);
520 if (ret)
521 sb = ERR_PTR(ret);
522 else
523 ORANGEFS_SB(sb)->mount_pending = 1;
524
525 op_release(new_op);
526 return ret;
527}
528
529void orangefs_make_bad_inode(struct inode *inode)
530{
531 if (is_root_handle(inode)) {
532 /*
533 * if this occurs, the pvfs2-client-core was killed but we
534 * can't afford to lose the inode operations and such
535 * associated with the root handle in any case.
536 */
537 gossip_debug(GOSSIP_UTILS_DEBUG,
538 "*** NOT making bad root inode %pU\n",
539 get_khandle_from_ino(inode));
540 } else {
541 gossip_debug(GOSSIP_UTILS_DEBUG,
542 "*** making bad inode %pU\n",
543 get_khandle_from_ino(inode));
544 make_bad_inode(inode);
545 }
546}
547
548/*
549 * The following is a very dirty hack that is now a permanent part of the
550 * ORANGEFS protocol. See protocol.h for more error definitions.
551 */
552
553/* The order matches include/orangefs-types.h in the OrangeFS source. */
554static int PINT_errno_mapping[] = {
555 0, EPERM, ENOENT, EINTR, EIO, ENXIO, EBADF, EAGAIN, ENOMEM,
556 EFAULT, EBUSY, EEXIST, ENODEV, ENOTDIR, EISDIR, EINVAL, EMFILE,
557 EFBIG, ENOSPC, EROFS, EMLINK, EPIPE, EDEADLK, ENAMETOOLONG,
558 ENOLCK, ENOSYS, ENOTEMPTY, ELOOP, EWOULDBLOCK, ENOMSG, EUNATCH,
559 EBADR, EDEADLOCK, ENODATA, ETIME, ENONET, EREMOTE, ECOMM,
560 EPROTO, EBADMSG, EOVERFLOW, ERESTART, EMSGSIZE, EPROTOTYPE,
561 ENOPROTOOPT, EPROTONOSUPPORT, EOPNOTSUPP, EADDRINUSE,
562 EADDRNOTAVAIL, ENETDOWN, ENETUNREACH, ENETRESET, ENOBUFS,
563 ETIMEDOUT, ECONNREFUSED, EHOSTDOWN, EHOSTUNREACH, EALREADY,
564 EACCES, ECONNRESET, ERANGE
565};
566
567int orangefs_normalize_to_errno(__s32 error_code)
568{
569 __u32 i;
570
571 /* Success */
572 if (error_code == 0) {
573 return 0;
574 /*
575 * This shouldn't ever happen. If it does it should be fixed on the
576 * server.
577 */
578 } else if (error_code > 0) {
579 gossip_err("orangefs: error status receieved.\n");
580 gossip_err("orangefs: assuming error code is inverted.\n");
581 error_code = -error_code;
582 }
583
584 /*
585 * XXX: This is very bad since error codes from ORANGEFS may not be
586 * suitable for return into userspace.
587 */
588
589 /*
590 * Convert ORANGEFS error values into errno values suitable for return
591 * from the kernel.
592 */
593 if ((-error_code) & ORANGEFS_NON_ERRNO_ERROR_BIT) {
594 if (((-error_code) &
595 (ORANGEFS_ERROR_NUMBER_BITS|ORANGEFS_NON_ERRNO_ERROR_BIT|
596 ORANGEFS_ERROR_BIT)) == ORANGEFS_ECANCEL) {
597 /*
598 * cancellation error codes generally correspond to
599 * a timeout from the client's perspective
600 */
601 error_code = -ETIMEDOUT;
602 } else {
603 /* assume a default error code */
604 gossip_err("orangefs: warning: got error code without errno equivalent: %d.\n", error_code);
605 error_code = -EINVAL;
606 }
607
608 /* Convert ORANGEFS encoded errno values into regular errno values. */
609 } else if ((-error_code) & ORANGEFS_ERROR_BIT) {
610 i = (-error_code) & ~(ORANGEFS_ERROR_BIT|ORANGEFS_ERROR_CLASS_BITS);
611 if (i < sizeof(PINT_errno_mapping)/sizeof(*PINT_errno_mapping))
612 error_code = -PINT_errno_mapping[i];
613 else
614 error_code = -EINVAL;
615
616 /*
617 * Only ORANGEFS protocol error codes should ever come here. Otherwise
618 * there is a bug somewhere.
619 */
620 } else {
621 gossip_err("orangefs: orangefs_normalize_to_errno: got error code which is not from ORANGEFS.\n");
622 }
623 return error_code;
624}
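
A minimal userspace-style sketch of the decode logic above, using the bit
macros defined in protocol.h later in this patch; this is illustration only,
not part of the driver:

#include <stdio.h>

#define ORANGEFS_ERROR_BIT           (1 << 30)
#define ORANGEFS_NON_ERRNO_ERROR_BIT (1 << 29)
#define ORANGEFS_ERROR_CLASS_BITS    0x380
#define ORANGEFS_ERROR_NUMBER_BITS   0x7f
#define ORANGEFS_ECANCEL (1|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)

int main(void)
{
	int error_code = -ORANGEFS_ECANCEL;	/* as delivered by the server */

	if ((-error_code) & ORANGEFS_NON_ERRNO_ERROR_BIT)
		/* protocol error: ECANCEL becomes -ETIMEDOUT, rest -EINVAL */
		printf("protocol error, number %d\n",
		       (-error_code) & ORANGEFS_ERROR_NUMBER_BITS);
	else if ((-error_code) & ORANGEFS_ERROR_BIT)
		/* encoded errno: index into PINT_errno_mapping[] */
		printf("errno index %d\n",
		       (-error_code) & ~(ORANGEFS_ERROR_BIT |
					 ORANGEFS_ERROR_CLASS_BITS));
	return 0;
}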
625
626#define NUM_MODES 11
627__s32 ORANGEFS_util_translate_mode(int mode)
628{
629 int ret = 0;
630 int i = 0;
631 static int modes[NUM_MODES] = {
632 S_IXOTH, S_IWOTH, S_IROTH,
633 S_IXGRP, S_IWGRP, S_IRGRP,
634 S_IXUSR, S_IWUSR, S_IRUSR,
635 S_ISGID, S_ISUID
636 };
637 static int orangefs_modes[NUM_MODES] = {
638 ORANGEFS_O_EXECUTE, ORANGEFS_O_WRITE, ORANGEFS_O_READ,
639 ORANGEFS_G_EXECUTE, ORANGEFS_G_WRITE, ORANGEFS_G_READ,
640 ORANGEFS_U_EXECUTE, ORANGEFS_U_WRITE, ORANGEFS_U_READ,
641 ORANGEFS_G_SGID, ORANGEFS_U_SUID
642 };
643
644 for (i = 0; i < NUM_MODES; i++)
645 if (mode & modes[i])
646 ret |= orangefs_modes[i];
647
648 return ret;
649}
650#undef NUM_MODES
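
For reference, a sketch of what the translation loop produces for the
familiar mode 0644 (flag values taken from protocol.h in this patch):

#include <assert.h>

#define ORANGEFS_O_READ  (1 << 2)
#define ORANGEFS_G_READ  (1 << 5)
#define ORANGEFS_U_WRITE (1 << 7)
#define ORANGEFS_U_READ  (1 << 8)

int main(void)
{
	/* ORANGEFS_util_translate_mode(0644) walks the rw-r--r-- mode
	 * bits and sets the matching ORANGEFS flags:
	 */
	int expected = ORANGEFS_U_READ | ORANGEFS_U_WRITE |
		       ORANGEFS_G_READ | ORANGEFS_O_READ;

	assert(expected == 0x1a4);
	return 0;
}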
651
652/*
653 * After obtaining a string representation of the client's debug
654 * keywords and their associated masks, this function is called to build an
655 * array of these values.
656 */
657int orangefs_prepare_cdm_array(char *debug_array_string)
658{
659 int i;
660 int rc = -EINVAL;
661 char *cds_head = NULL;
662 char *cds_delimiter = NULL;
663 int keyword_len = 0;
664
665 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
666
667 /*
668 * figure out how many elements the cdm_array needs.
669 */
670 for (i = 0; i < strlen(debug_array_string); i++)
671 if (debug_array_string[i] == '\n')
672 cdm_element_count++;
673
674 if (!cdm_element_count) {
675 pr_info("No elements in client debug array string!\n");
676 goto out;
677 }
678
679 cdm_array =
680 kzalloc(cdm_element_count * sizeof(struct client_debug_mask),
681 GFP_KERNEL);
682 if (!cdm_array) {
683 pr_info("malloc failed for cdm_array!\n");
684 rc = -ENOMEM;
685 goto out;
686 }
687
688 cds_head = debug_array_string;
689
690 for (i = 0; i < cdm_element_count; i++) {
691 cds_delimiter = strchr(cds_head, '\n');
692 *cds_delimiter = '\0';
693
694 keyword_len = strcspn(cds_head, " ");
695
696 cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL);
697 if (!cdm_array[i].keyword) {
698 rc = -ENOMEM;
699 goto out;
700 }
701
702 sscanf(cds_head,
703 "%s %llx %llx",
704 cdm_array[i].keyword,
705 (unsigned long long *)&(cdm_array[i].mask1),
706 (unsigned long long *)&(cdm_array[i].mask2));
707
708 if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE))
709 client_verbose_index = i;
710
711 if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL))
712 client_all_index = i;
713
714 cds_head = cds_delimiter + 1;
715 }
716
717 rc = cdm_element_count;
718
719 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc);
720
721out:
722
723 return rc;
724
725}
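
The layout orangefs_prepare_cdm_array() expects, inferred from the parsing
above; the keywords match ORANGEFS_VERBOSE/ORANGEFS_ALL from protocol.h, but
the mask values shown are placeholders:

/*
 * Each element is "keyword mask1 mask2\n"; the "verbose" and "all"
 * entries get their indices remembered for the amalgam checks.
 */
static const char example_debug_array_string[] =
	"none 0 0\n"
	"verbose 1fffffff ffffffff\n"	/* placeholder masks */
	"all ffffffff ffffffff\n";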
726
727/*
728 * /sys/kernel/debug/orangefs/debug-help can be catted to
729 * see all the available kernel and client debug keywords.
730 *
731 * When the kernel boots, we have no idea what keywords the
732 * client supports, nor their associated masks.
733 *
734 * We pass through this function once at boot and stamp a
735 * boilerplate "we don't know" message for the client in the
736 * debug-help file. We pass through here again when the client
737 * starts and then we can fill out the debug-help file fully.
738 *
739 * The client might be restarted any number of times between
740 * reboots; we only build the debug-help file the first time.
741 */
742int orangefs_prepare_debugfs_help_string(int at_boot)
743{
744 int rc = -EINVAL;
745 int i;
746 int byte_count = 0;
747 char *client_title = "Client Debug Keywords:\n";
748 char *kernel_title = "Kernel Debug Keywords:\n";
749
750 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
751
752 if (at_boot) {
753 byte_count += strlen(HELP_STRING_UNINITIALIZED);
754 client_title = HELP_STRING_UNINITIALIZED;
755 } else {
756 /*
757 * fill the client keyword/mask array and remember
758 * how many elements there were.
759 */
760 cdm_element_count =
761 orangefs_prepare_cdm_array(client_debug_array_string);
762 if (cdm_element_count <= 0)
763 goto out;
764
765 /* Count the bytes destined for debug_help_string. */
766 byte_count += strlen(client_title);
767
768 for (i = 0; i < cdm_element_count; i++) {
769 byte_count += strlen(cdm_array[i].keyword) + 2;
770 if (byte_count >= DEBUG_HELP_STRING_SIZE) {
771 pr_info("%s: overflow 1!\n", __func__);
772 goto out;
773 }
774 }
775
776 gossip_debug(GOSSIP_UTILS_DEBUG,
777 "%s: cdm_element_count:%d:\n",
778 __func__,
779 cdm_element_count);
780 }
781
782 byte_count += strlen(kernel_title);
783 for (i = 0; i < num_kmod_keyword_mask_map; i++) {
784 byte_count +=
785 strlen(s_kmod_keyword_mask_map[i].keyword) + 2;
786 if (byte_count >= DEBUG_HELP_STRING_SIZE) {
787 pr_info("%s: overflow 2!\n", __func__);
788 goto out;
789 }
790 }
791
792 /* build debug_help_string. */
793 debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
794 if (!debug_help_string) {
795 rc = -ENOMEM;
796 goto out;
797 }
798
799 strcat(debug_help_string, client_title);
800
801 if (!at_boot) {
802 for (i = 0; i < cdm_element_count; i++) {
803 strcat(debug_help_string, "\t");
804 strcat(debug_help_string, cdm_array[i].keyword);
805 strcat(debug_help_string, "\n");
806 }
807 }
808
809 strcat(debug_help_string, "\n");
810 strcat(debug_help_string, kernel_title);
811
812 for (i = 0; i < num_kmod_keyword_mask_map; i++) {
813 strcat(debug_help_string, "\t");
814 strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
815 strcat(debug_help_string, "\n");
816 }
817
818 rc = 0;
819
820out:
821
822 return rc;
823
824}
825
826/*
827 * kernel = type 0
828 * client = type 1
829 */
830void debug_mask_to_string(void *mask, int type)
831{
832 int i;
833 int len = 0;
834 char *debug_string;
835 int element_count = 0;
836
837 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
838
839 if (type) {
840 debug_string = client_debug_string;
841 element_count = cdm_element_count;
842 } else {
843 debug_string = kernel_debug_string;
844 element_count = num_kmod_keyword_mask_map;
845 }
846
847 memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
848
849 /*
850 * Some keywords, like "all" or "verbose", are amalgams of
851 * numerous other keywords. Make a special check for those
852 * before grinding through the whole mask only to discover
853 * at the end that an amalgam matched.
854 */
855 if (check_amalgam_keyword(mask, type))
856 goto out;
857
858 /* Build the debug string. */
859 for (i = 0; i < element_count; i++)
860 if (type)
861 do_c_string(mask, i);
862 else
863 do_k_string(mask, i);
864
865 len = strlen(debug_string);
866
867 if ((len) && (type))
868 client_debug_string[len - 1] = '\0';
869 else if (len)
870 kernel_debug_string[len - 1] = '\0';
871 else if (type)
872 strcpy(client_debug_string, "none");
873 else
874 strcpy(kernel_debug_string, "none");
875
876out:
877 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string);
878
879 return;
880
881}
882
883void do_k_string(void *k_mask, int index)
884{
885 __u64 *mask = (__u64 *) k_mask;
886
887 if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword))
888 goto out;
889
890 if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
891 if ((strlen(kernel_debug_string) +
892 strlen(s_kmod_keyword_mask_map[index].keyword))
893 < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
894 strcat(kernel_debug_string,
895 s_kmod_keyword_mask_map[index].keyword);
896 strcat(kernel_debug_string, ",");
897 } else {
898 gossip_err("%s: overflow!\n", __func__);
899 strcpy(kernel_debug_string, ORANGEFS_ALL);
900 goto out;
901 }
902 }
903
904out:
905
906 return;
907}
908
909void do_c_string(void *c_mask, int index)
910{
911 struct client_debug_mask *mask = (struct client_debug_mask *) c_mask;
912
913 if (keyword_is_amalgam(cdm_array[index].keyword))
914 goto out;
915
916 if ((mask->mask1 & cdm_array[index].mask1) ||
917 (mask->mask2 & cdm_array[index].mask2)) {
918 if ((strlen(client_debug_string) +
919 strlen(cdm_array[index].keyword) + 1)
920 < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
921 strcat(client_debug_string,
922 cdm_array[index].keyword);
923 strcat(client_debug_string, ",");
924 } else {
925 gossip_err("%s: overflow!\n", __func__);
926 strcpy(client_debug_string, ORANGEFS_ALL);
927 goto out;
928 }
929 }
930out:
931 return;
932}
933
934int keyword_is_amalgam(char *keyword)
935{
936 int rc = 0;
937
938 if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE)))
939 rc = 1;
940
941 return rc;
942}
943
944/*
945 * kernel = type 0
946 * client = type 1
947 *
948 * return 1 if we found an amalgam.
949 */
950int check_amalgam_keyword(void *mask, int type)
951{
952 __u64 *k_mask;
953 struct client_debug_mask *c_mask;
954 int k_all_index = num_kmod_keyword_mask_map - 1;
955 int rc = 0;
956
957 if (type) {
958 c_mask = (struct client_debug_mask *) mask;
959
960 if ((c_mask->mask1 == cdm_array[client_all_index].mask1) &&
961 (c_mask->mask2 == cdm_array[client_all_index].mask2)) {
962 strcpy(client_debug_string, ORANGEFS_ALL);
963 rc = 1;
964 goto out;
965 }
966
967 if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) &&
968 (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) {
969 strcpy(client_debug_string, ORANGEFS_VERBOSE);
970 rc = 1;
971 goto out;
972 }
973
974 } else {
975 k_mask = (__u64 *) mask;
976
977 if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) {
978 strcpy(kernel_debug_string, ORANGEFS_ALL);
979 rc = 1;
980 goto out;
981 }
982 }
983
984out:
985
986 return rc;
987}
988
989/*
990 * kernel = type 0
991 * client = type 1
992 */
993void debug_string_to_mask(char *debug_string, void *mask, int type)
994{
995 char *unchecked_keyword;
996 int i;
997 char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL);
998 char *original_pointer;
999 int element_count = 0;
1000 struct client_debug_mask *c_mask;
1001 __u64 *k_mask;
1002
1003 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
1004
1005 if (type) {
1006 c_mask = (struct client_debug_mask *)mask;
1007 element_count = cdm_element_count;
1008 } else {
1009 k_mask = (__u64 *)mask;
1010 *k_mask = 0;
1011 element_count = num_kmod_keyword_mask_map;
1012 }
1013
1014 original_pointer = strsep_fodder;
1015 while ((unchecked_keyword = strsep(&strsep_fodder, ",")))
1016 if (strlen(unchecked_keyword)) {
1017 for (i = 0; i < element_count; i++)
1018 if (type)
1019 do_c_mask(i,
1020 unchecked_keyword,
1021 &c_mask);
1022 else
1023 do_k_mask(i,
1024 unchecked_keyword,
1025 &k_mask);
1026 }
1027
1028 kfree(original_pointer);
1029}
1030
1031void do_c_mask(int i,
1032 char *unchecked_keyword,
1033 struct client_debug_mask **sane_mask)
1034{
1035
1036 if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) {
1037 (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1;
1038 (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2;
1039 }
1040}
1041
1042void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask)
1043{
1044
1045 if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword))
1046 **sane_mask = (**sane_mask) |
1047 s_kmod_keyword_mask_map[i].mask_val;
1048}
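
A usage sketch for the string-to-mask direction (the keywords shown are
hypothetical; real ones come from s_kmod_keyword_mask_map):

static void example_set_kernel_mask(void)
{
	__u64 mask = 0;

	/* type 0 == kernel; ORs in the mask_val of each listed keyword */
	debug_string_to_mask("super,inode", &mask, 0);
	gossip_debug(GOSSIP_UTILS_DEBUG, "mask now %llx\n", llu(mask));
}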
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
new file mode 100644
index 000000000000..45ce4ff4cbc7
--- /dev/null
+++ b/fs/orangefs/protocol.h
@@ -0,0 +1,452 @@
1#include <linux/types.h>
2#include <linux/spinlock_types.h>
3#include <linux/slab.h>
4#include <linux/ioctl.h>
5
6extern struct client_debug_mask *cdm_array;
7extern char *debug_help_string;
8extern int help_string_initialized;
9extern struct dentry *debug_dir;
10extern struct dentry *help_file_dentry;
11extern struct dentry *client_debug_dentry;
12extern const struct file_operations debug_help_fops;
13extern int client_all_index;
14extern int client_verbose_index;
15extern int cdm_element_count;
16#define DEBUG_HELP_STRING_SIZE 4096
17#define HELP_STRING_UNINITIALIZED \
18 "Client Debug Keywords are unknown until the first time\n" \
19 "the client is started after boot.\n"
20#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help"
21#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug"
22#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug"
23#define ORANGEFS_VERBOSE "verbose"
24#define ORANGEFS_ALL "all"
25
26/* pvfs2-config.h ***********************************************************/
27#define ORANGEFS_VERSION_MAJOR 2
28#define ORANGEFS_VERSION_MINOR 9
29#define ORANGEFS_VERSION_SUB 0
30
31/* khandle stuff ***********************************************************/
32
33/*
34 * The 2.9 core will put 64 bit handles in here like this:
35 * 1234 0000 0000 5678
36 * The 3.0 and beyond cores will put 128 bit handles in here like this:
37 * 1234 5678 90AB CDEF
38 * The kernel module will always use the first four bytes and
39 * the last four bytes as an inum.
40 */
41struct orangefs_khandle {
42 unsigned char u[16];
43} __aligned(8);
44
45/*
46 * kernel version of an object ref.
47 */
48struct orangefs_object_kref {
49 struct orangefs_khandle khandle;
50 __s32 fs_id;
51 __s32 __pad1;
52};
53
54/*
55 * compare 2 khandles assumes little endian thus from large address to
56 * small address
57 */
58static inline int ORANGEFS_khandle_cmp(const struct orangefs_khandle *kh1,
59 const struct orangefs_khandle *kh2)
60{
61 int i;
62
63 for (i = 15; i >= 0; i--) {
64 if (kh1->u[i] > kh2->u[i])
65 return 1;
66 if (kh1->u[i] < kh2->u[i])
67 return -1;
68 }
69
70 return 0;
71}
72
73static inline void ORANGEFS_khandle_to(const struct orangefs_khandle *kh,
74 void *p, int size)
75{
76
77 memset(p, 0, size);
78 memcpy(p, kh->u, 16);
79
80}
81
82static inline void ORANGEFS_khandle_from(struct orangefs_khandle *kh,
83 void *p, int size)
84{
85 memset(kh, 0, 16);
86 memcpy(kh->u, p, 16);
87
88}
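
A hypothetical sketch of the inum convention described in the khandle comment
above (first four bytes and last four bytes); the helper name and byte-order
choices here are illustrative, not the module's actual implementation:

static inline __u64 khandle_to_inum_sketch(const struct orangefs_khandle *kh)
{
	__u32 hi, lo;

	memcpy(&hi, &kh->u[0], 4);	/* first four bytes */
	memcpy(&lo, &kh->u[12], 4);	/* last four bytes */
	return ((__u64)hi << 32) | lo;
}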
89
90/* pvfs2-types.h ************************************************************/
91typedef __u32 ORANGEFS_uid;
92typedef __u32 ORANGEFS_gid;
93typedef __s32 ORANGEFS_fs_id;
94typedef __u32 ORANGEFS_permissions;
95typedef __u64 ORANGEFS_time;
96typedef __s64 ORANGEFS_size;
97typedef __u64 ORANGEFS_flags;
98typedef __u64 ORANGEFS_ds_position;
99typedef __s32 ORANGEFS_error;
100typedef __s64 ORANGEFS_offset;
101
102#define ORANGEFS_SUPER_MAGIC 0x20030528
103
104/*
105 * ORANGEFS error codes are signed 32-bit integers. Error codes are negative, but
106 * the sign is stripped before decoding.
107 */
108
109/* Bit 31 is not used since it is the sign. */
110
111/*
112 * Bit 30 specifies that this is an ORANGEFS error. An ORANGEFS error is either
113 * an encoded errno value or an ORANGEFS protocol error.
114 */
115#define ORANGEFS_ERROR_BIT (1 << 30)
116
117/*
118 * Bit 29 specifies that this is an ORANGEFS protocol error and not an encoded
119 * errno value.
120 */
121#define ORANGEFS_NON_ERRNO_ERROR_BIT (1 << 29)
122
123/*
124 * Bits 9, 8, and 7 specify the error class, which encodes the section of
125 * server code the error originated in for logging purposes. It is not used
126 * in the kernel except to be masked out.
127 */
128#define ORANGEFS_ERROR_CLASS_BITS 0x380
129
130/* Bits 6 - 0 are reserved for the actual error code. */
131#define ORANGEFS_ERROR_NUMBER_BITS 0x7f
132
133/* Encoded errno values decoded by PINT_errno_mapping in orangefs-utils.c. */
134
135/* Our own ORANGEFS protocol error codes. */
136#define ORANGEFS_ECANCEL (1|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
137#define ORANGEFS_EDEVINIT (2|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
138#define ORANGEFS_EDETAIL (3|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
139#define ORANGEFS_EHOSTNTFD (4|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
140#define ORANGEFS_EADDRNTFD (5|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
141#define ORANGEFS_ENORECVR (6|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
142#define ORANGEFS_ETRYAGAIN (7|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
143#define ORANGEFS_ENOTPVFS (8|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
144#define ORANGEFS_ESECURITY (9|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
145
146/* permission bits */
147#define ORANGEFS_O_EXECUTE (1 << 0)
148#define ORANGEFS_O_WRITE (1 << 1)
149#define ORANGEFS_O_READ (1 << 2)
150#define ORANGEFS_G_EXECUTE (1 << 3)
151#define ORANGEFS_G_WRITE (1 << 4)
152#define ORANGEFS_G_READ (1 << 5)
153#define ORANGEFS_U_EXECUTE (1 << 6)
154#define ORANGEFS_U_WRITE (1 << 7)
155#define ORANGEFS_U_READ (1 << 8)
156/* no ORANGEFS_U_VTX (sticky bit) */
157#define ORANGEFS_G_SGID (1 << 10)
158#define ORANGEFS_U_SUID (1 << 11)
159
160/* definition taken from stdint.h */
161#define INT32_MAX (2147483647)
162#define ORANGEFS_ITERATE_START (INT32_MAX - 1)
163#define ORANGEFS_ITERATE_END (INT32_MAX - 2)
164#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3)
165#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START
166#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END
167#define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL
168#define ORANGEFS_APPEND_FL FS_APPEND_FL
169#define ORANGEFS_NOATIME_FL FS_NOATIME_FL
170#define ORANGEFS_MIRROR_FL 0x01000000ULL
171#define ORANGEFS_O_EXECUTE (1 << 0)
172#define ORANGEFS_FS_ID_NULL ((__s32)0)
173#define ORANGEFS_ATTR_SYS_UID (1 << 0)
174#define ORANGEFS_ATTR_SYS_GID (1 << 1)
175#define ORANGEFS_ATTR_SYS_PERM (1 << 2)
176#define ORANGEFS_ATTR_SYS_ATIME (1 << 3)
177#define ORANGEFS_ATTR_SYS_CTIME (1 << 4)
178#define ORANGEFS_ATTR_SYS_MTIME (1 << 5)
179#define ORANGEFS_ATTR_SYS_TYPE (1 << 6)
180#define ORANGEFS_ATTR_SYS_ATIME_SET (1 << 7)
181#define ORANGEFS_ATTR_SYS_MTIME_SET (1 << 8)
182#define ORANGEFS_ATTR_SYS_SIZE (1 << 20)
183#define ORANGEFS_ATTR_SYS_LNK_TARGET (1 << 24)
184#define ORANGEFS_ATTR_SYS_DFILE_COUNT (1 << 25)
185#define ORANGEFS_ATTR_SYS_DIRENT_COUNT (1 << 26)
186#define ORANGEFS_ATTR_SYS_BLKSIZE (1 << 28)
187#define ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT (1 << 29)
188#define ORANGEFS_ATTR_SYS_COMMON_ALL \
189 (ORANGEFS_ATTR_SYS_UID | \
190 ORANGEFS_ATTR_SYS_GID | \
191 ORANGEFS_ATTR_SYS_PERM | \
192 ORANGEFS_ATTR_SYS_ATIME | \
193 ORANGEFS_ATTR_SYS_CTIME | \
194 ORANGEFS_ATTR_SYS_MTIME | \
195 ORANGEFS_ATTR_SYS_TYPE)
196
197#define ORANGEFS_ATTR_SYS_ALL_SETABLE \
198(ORANGEFS_ATTR_SYS_COMMON_ALL-ORANGEFS_ATTR_SYS_TYPE)
199
200#define ORANGEFS_ATTR_SYS_ALL_NOHINT \
201 (ORANGEFS_ATTR_SYS_COMMON_ALL | \
202 ORANGEFS_ATTR_SYS_SIZE | \
203 ORANGEFS_ATTR_SYS_LNK_TARGET | \
204 ORANGEFS_ATTR_SYS_DFILE_COUNT | \
205 ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
206 ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
207 ORANGEFS_ATTR_SYS_BLKSIZE)
208
209#define ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE \
210 (ORANGEFS_ATTR_SYS_COMMON_ALL | \
211 ORANGEFS_ATTR_SYS_LNK_TARGET | \
212 ORANGEFS_ATTR_SYS_DFILE_COUNT | \
213 ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
214 ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
215 ORANGEFS_ATTR_SYS_BLKSIZE)
216
217#define ORANGEFS_XATTR_REPLACE 0x2
218#define ORANGEFS_XATTR_CREATE 0x1
219#define ORANGEFS_MAX_SERVER_ADDR_LEN 256
220#define ORANGEFS_NAME_MAX 256
221/*
222 * max extended attribute name len as imposed by the VFS and exploited for the
223 * upcall request types.
224 * NOTE: Please retain these as multiples of 8 even if you wish to change them.
225 * This is *NECESSARY* for supporting 32 bit user-space binaries on a 64-bit
226 * kernel. Due to implementation within DBPF, this really needs to be
227 * ORANGEFS_NAME_MAX, which it happened to equal, but there is no reason
228 * to let it break if that changes in the future.
229 */
230#define ORANGEFS_MAX_XATTR_NAMELEN ORANGEFS_NAME_MAX /* Not the same as
231 * XATTR_NAME_MAX defined
232 * by <linux/xattr.h>
233 */
234#define ORANGEFS_MAX_XATTR_VALUELEN 8192 /* Not the same as XATTR_SIZE_MAX
235 * defined by <linux/xattr.h>
236 */
237#define ORANGEFS_MAX_XATTR_LISTLEN 16 /* Not the same as XATTR_LIST_MAX
238 * defined by <linux/xattr.h>
239 */
240/*
241 * ORANGEFS I/O operation types, used in both system and server interfaces.
242 */
243enum ORANGEFS_io_type {
244 ORANGEFS_IO_READ = 1,
245 ORANGEFS_IO_WRITE = 2
246};
247
248/*
249 * If this enum is modified the server parameters related to the precreate pool
250 * batch and low threshold sizes may need to be modified to reflect this
251 * change.
252 */
253enum orangefs_ds_type {
254 ORANGEFS_TYPE_NONE = 0,
255 ORANGEFS_TYPE_METAFILE = (1 << 0),
256 ORANGEFS_TYPE_DATAFILE = (1 << 1),
257 ORANGEFS_TYPE_DIRECTORY = (1 << 2),
258 ORANGEFS_TYPE_SYMLINK = (1 << 3),
259 ORANGEFS_TYPE_DIRDATA = (1 << 4),
260 ORANGEFS_TYPE_INTERNAL = (1 << 5) /* for the server's private use */
261};
262
263/*
264 * ORANGEFS_certificate simply stores a buffer with the buffer size.
265 * The buffer can be converted to an OpenSSL X509 struct for use.
266 */
267struct ORANGEFS_certificate {
268 __u32 buf_size;
269 unsigned char *buf;
270};
271
272/*
273 * A credential identifies a user and is signed by the client/user
274 * private key.
275 */
276struct ORANGEFS_credential {
277 __u32 userid; /* user id */
278 __u32 num_groups; /* length of group_array */
279 __u32 *group_array; /* groups for which the user is a member */
280 char *issuer; /* alias of the issuing server */
281 __u64 timeout; /* seconds after epoch to time out */
282 __u32 sig_size; /* length of the signature in bytes */
283 unsigned char *signature; /* digital signature */
284 struct ORANGEFS_certificate certificate; /* user certificate buffer */
285};
286#define extra_size_ORANGEFS_credential (ORANGEFS_REQ_LIMIT_GROUPS * \
287 sizeof(__u32) + \
288 ORANGEFS_REQ_LIMIT_ISSUER + \
289 ORANGEFS_REQ_LIMIT_SIGNATURE + \
290 extra_size_ORANGEFS_certificate)
291
292/* This structure is used by the VFS-client interaction alone */
293struct ORANGEFS_keyval_pair {
294 char key[ORANGEFS_MAX_XATTR_NAMELEN];
295 __s32 key_sz; /* __s32 for portable, fixed-size structures */
296 __s32 val_sz;
297 char val[ORANGEFS_MAX_XATTR_VALUELEN];
298};
299
300/* pvfs2-sysint.h ***********************************************************/
301/* Describes attributes for a file, directory, or symlink. */
302struct ORANGEFS_sys_attr_s {
303 __u32 owner;
304 __u32 group;
305 __u32 perms;
306 __u64 atime;
307 __u64 mtime;
308 __u64 ctime;
309 __s64 size;
310
311 /* NOTE: caller must free if valid */
312 char *link_target;
313
314 /* Changed to __s32 so that size of structure does not change */
315 __s32 dfile_count;
316
317 /* Changed to __s32 so that size of structure does not change */
318 __s32 distr_dir_servers_initial;
319
320 /* Changed to __s32 so that size of structure does not change */
321 __s32 distr_dir_servers_max;
322
323 /* Changed to __s32 so that size of structure does not change */
324 __s32 distr_dir_split_size;
325
326 __u32 mirror_copies_count;
327
328 /* NOTE: caller must free if valid */
329 char *dist_name;
330
331 /* NOTE: caller must free if valid */
332 char *dist_params;
333
334 __s64 dirent_count;
335 enum orangefs_ds_type objtype;
336 __u64 flags;
337 __u32 mask;
338 __s64 blksize;
339};
340
341#define ORANGEFS_LOOKUP_LINK_NO_FOLLOW 0
342
343/* pint-dev.h ***************************************************************/
344
345/* parameter structure used in ORANGEFS_DEV_DEBUG ioctl command */
346struct dev_mask_info_s {
347 enum {
348 KERNEL_MASK,
349 CLIENT_MASK,
350 } mask_type;
351 __u64 mask_value;
352};
353
354struct dev_mask2_info_s {
355 __u64 mask1_value;
356 __u64 mask2_value;
357};
358
359/* pvfs2-util.h *************************************************************/
360__s32 ORANGEFS_util_translate_mode(int mode);
361
362/* pvfs2-debug.h ************************************************************/
363#include "orangefs-debug.h"
364
365/* pvfs2-internal.h *********************************************************/
366#define llu(x) (unsigned long long)(x)
367#define lld(x) (long long)(x)
368
369/* pint-dev-shared.h ********************************************************/
370#define ORANGEFS_DEV_MAGIC 'k'
371
372#define ORANGEFS_READDIR_DEFAULT_DESC_COUNT 5
373
374#define DEV_GET_MAGIC 0x1
375#define DEV_GET_MAX_UPSIZE 0x2
376#define DEV_GET_MAX_DOWNSIZE 0x3
377#define DEV_MAP 0x4
378#define DEV_REMOUNT_ALL 0x5
379#define DEV_DEBUG 0x6
380#define DEV_UPSTREAM 0x7
381#define DEV_CLIENT_MASK 0x8
382#define DEV_CLIENT_STRING 0x9
383#define DEV_MAX_NR 0xa
384
385/* supported ioctls, codes are with respect to user-space */
386enum {
387 ORANGEFS_DEV_GET_MAGIC = _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAGIC, __s32),
388 ORANGEFS_DEV_GET_MAX_UPSIZE =
389 _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAX_UPSIZE, __s32),
390 ORANGEFS_DEV_GET_MAX_DOWNSIZE =
391 _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAX_DOWNSIZE, __s32),
392 ORANGEFS_DEV_MAP = _IO(ORANGEFS_DEV_MAGIC, DEV_MAP),
393 ORANGEFS_DEV_REMOUNT_ALL = _IO(ORANGEFS_DEV_MAGIC, DEV_REMOUNT_ALL),
394 ORANGEFS_DEV_DEBUG = _IOR(ORANGEFS_DEV_MAGIC, DEV_DEBUG, __s32),
395 ORANGEFS_DEV_UPSTREAM = _IOW(ORANGEFS_DEV_MAGIC, DEV_UPSTREAM, int),
396 ORANGEFS_DEV_CLIENT_MASK = _IOW(ORANGEFS_DEV_MAGIC,
397 DEV_CLIENT_MASK,
398 struct dev_mask2_info_s),
399 ORANGEFS_DEV_CLIENT_STRING = _IOW(ORANGEFS_DEV_MAGIC,
400 DEV_CLIENT_STRING,
401 char *),
402 ORANGEFS_DEV_MAXNR = DEV_MAX_NR,
403};
404
405/*
406 * version number for use in communicating between kernel space and user
407 * space. Zero signifies the upstream version of the kernel module.
408 */
409#define ORANGEFS_KERNEL_PROTO_VERSION 0
410#define ORANGEFS_MINIMUM_USERSPACE_VERSION 20904
411
412/*
413 * describes memory regions to map in the ORANGEFS_DEV_MAP ioctl.
414 * NOTE: See devorangefs-req.c for 32 bit compat structure.
415 * Since this structure has a variable-sized layout that is different
416 * on 32 and 64 bit platforms, we need to normalize to a 64 bit layout
417 * on such systems before servicing ioctl calls from user-space binaries
418 * that may be 32 bit!
419 */
420struct ORANGEFS_dev_map_desc {
421 void *ptr;
422 __s32 total_size;
423 __s32 size;
424 __s32 count;
425};
426
427/* gossip.h *****************************************************************/
428
429#ifdef GOSSIP_DISABLE_DEBUG
430#define gossip_debug(mask, format, f...) do {} while (0)
431#else
432extern __u64 gossip_debug_mask;
433extern struct client_debug_mask client_debug_mask;
434
435/* try to avoid function call overhead by checking masks in macro */
436#define gossip_debug(mask, format, f...) \
437do { \
438 if (gossip_debug_mask & mask) \
439 printk(format, ##f); \
440} while (0)
441#endif /* GOSSIP_DISABLE_DEBUG */
442
443/* do file and line number printouts w/ the GNU preprocessor */
444#define gossip_ldebug(mask, format, f...) \
445 gossip_debug(mask, "%s: " format, __func__, ##f)
446
447#define gossip_err printk
448#define gossip_lerr(format, f...) \
449 gossip_err("%s line %d: " format, \
450 __FILE__, \
451 __LINE__, \
452 ##f)
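
Typical use of the gossip macros above (a sketch; the message text is
illustrative, and GOSSIP_SUPER_DEBUG is one of the kernel mask bits used
elsewhere in this patch):

static void gossip_example(void)
{
	/* printed only when GOSSIP_SUPER_DEBUG is set in gossip_debug_mask */
	gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs: mounting fs_id %d\n", 3);

	/* same, prefixed with the calling function's name */
	gossip_ldebug(GOSSIP_SUPER_DEBUG, "got here\n");

	/* unconditional, with file and line number */
	gossip_lerr("unexpected state\n");
}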
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
new file mode 100644
index 000000000000..b9da9a0281c9
--- /dev/null
+++ b/fs/orangefs/super.c
@@ -0,0 +1,559 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#include "protocol.h"
8#include "orangefs-kernel.h"
9#include "orangefs-bufmap.h"
10
11#include <linux/parser.h>
12
13/* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
14static struct kmem_cache *orangefs_inode_cache;
15
16/* list for storing orangefs specific superblocks in use */
17LIST_HEAD(orangefs_superblocks);
18
19DEFINE_SPINLOCK(orangefs_superblocks_lock);
20
21enum {
22 Opt_intr,
23 Opt_acl,
24 Opt_local_lock,
25
26 Opt_err
27};
28
29static const match_table_t tokens = {
30 { Opt_acl, "acl" },
31 { Opt_intr, "intr" },
32 { Opt_local_lock, "local_lock" },
33 { Opt_err, NULL }
34};
35
36
37static int parse_mount_options(struct super_block *sb, char *options,
38 int silent)
39{
40 struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb);
41 substring_t args[MAX_OPT_ARGS];
42 char *p;
43
44 /*
45 * Force any potential flags that might be set from the mount
46 * to zero, ie, initialize to unset.
47 */
48 sb->s_flags &= ~MS_POSIXACL;
49 orangefs_sb->flags &= ~ORANGEFS_OPT_INTR;
50 orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK;
51
52 while ((p = strsep(&options, ",")) != NULL) {
53 int token;
54
55 if (!*p)
56 continue;
57
58 token = match_token(p, tokens, args);
59 switch (token) {
60 case Opt_acl:
61 sb->s_flags |= MS_POSIXACL;
62 break;
63 case Opt_intr:
64 orangefs_sb->flags |= ORANGEFS_OPT_INTR;
65 break;
66 case Opt_local_lock:
67 orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK;
68 break;
69 default:
70 goto fail;
71 }
72 }
73
74 return 0;
75fail:
76 if (!silent)
77 gossip_err("Error: mount option [%s] is not supported.\n", p);
78 return -EINVAL;
79}
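
For illustration, the full option set this parser accepts (the filesystem
type and device name below are assumptions, not taken from this patch):

/*
 * e.g. from userspace:
 *
 *   mount -t pvfs2 tcp://server:3334/orangefs /mnt -o acl,intr,local_lock
 *
 * "acl" sets MS_POSIXACL on the superblock; "intr" and "local_lock"
 * set ORANGEFS_OPT_* flags in the orangefs-private sb info. Any other
 * option fails the mount with -EINVAL unless silent.
 */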
80
81static void orangefs_inode_cache_ctor(void *req)
82{
83 struct orangefs_inode_s *orangefs_inode = req;
84
85 inode_init_once(&orangefs_inode->vfs_inode);
86 init_rwsem(&orangefs_inode->xattr_sem);
87
88 orangefs_inode->vfs_inode.i_version = 1;
89}
90
91static struct inode *orangefs_alloc_inode(struct super_block *sb)
92{
93 struct orangefs_inode_s *orangefs_inode;
94
95 orangefs_inode = kmem_cache_alloc(orangefs_inode_cache, GFP_KERNEL);
96 if (orangefs_inode == NULL) {
97 gossip_err("Failed to allocate orangefs_inode\n");
98 return NULL;
99 }
100
101 /*
102 * We want to clear everything except for rw_semaphore and the
103 * vfs_inode.
104 */
105 memset(&orangefs_inode->refn.khandle, 0, 16);
106 orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL;
107 orangefs_inode->last_failed_block_index_read = 0;
108 memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target));
109 orangefs_inode->pinode_flags = 0;
110
111 gossip_debug(GOSSIP_SUPER_DEBUG,
112 "orangefs_alloc_inode: allocated %p\n",
113 &orangefs_inode->vfs_inode);
114 return &orangefs_inode->vfs_inode;
115}
116
117static void orangefs_destroy_inode(struct inode *inode)
118{
119 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
120
121 gossip_debug(GOSSIP_SUPER_DEBUG,
122 "%s: deallocated %p destroying inode %pU\n",
123 __func__, orangefs_inode, get_khandle_from_ino(inode));
124
125 kmem_cache_free(orangefs_inode_cache, orangefs_inode);
126}
127
128/*
129 * NOTE: information filled in here is typically reflected in the
130 * output of the system command 'df'
131 */
132static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf)
133{
134 int ret = -ENOMEM;
135 struct orangefs_kernel_op_s *new_op = NULL;
136 int flags = 0;
137 struct super_block *sb = NULL;
138
139 sb = dentry->d_sb;
140
141 gossip_debug(GOSSIP_SUPER_DEBUG,
142 "orangefs_statfs: called on sb %p (fs_id is %d)\n",
143 sb,
144 (int)(ORANGEFS_SB(sb)->fs_id));
145
146 new_op = op_alloc(ORANGEFS_VFS_OP_STATFS);
147 if (!new_op)
148 return ret;
149 new_op->upcall.req.statfs.fs_id = ORANGEFS_SB(sb)->fs_id;
150
151 if (ORANGEFS_SB(sb)->flags & ORANGEFS_OPT_INTR)
152 flags = ORANGEFS_OP_INTERRUPTIBLE;
153
154 ret = service_operation(new_op, "orangefs_statfs", flags);
155
156 if (new_op->downcall.status < 0)
157 goto out_op_release;
158
159 gossip_debug(GOSSIP_SUPER_DEBUG,
160 "%s: got %ld blocks available | "
161 "%ld blocks total | %ld block size | "
162 "%ld files total | %ld files avail\n",
163 __func__,
164 (long)new_op->downcall.resp.statfs.blocks_avail,
165 (long)new_op->downcall.resp.statfs.blocks_total,
166 (long)new_op->downcall.resp.statfs.block_size,
167 (long)new_op->downcall.resp.statfs.files_total,
168 (long)new_op->downcall.resp.statfs.files_avail);
169
170 buf->f_type = sb->s_magic;
171 memcpy(&buf->f_fsid, &ORANGEFS_SB(sb)->fs_id, sizeof(buf->f_fsid));
172 buf->f_bsize = new_op->downcall.resp.statfs.block_size;
173 buf->f_namelen = ORANGEFS_NAME_MAX;
174
175 buf->f_blocks = (sector_t) new_op->downcall.resp.statfs.blocks_total;
176 buf->f_bfree = (sector_t) new_op->downcall.resp.statfs.blocks_avail;
177 buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail;
178 buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total;
179 buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail;
180 buf->f_frsize = sb->s_blocksize;
181
182out_op_release:
183 op_release(new_op);
184 gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_statfs: returning %d\n", ret);
185 return ret;
186}
187
188/*
189 * Remount as initiated by VFS layer. We just need to reparse the mount
190 * options; there is no need to signal pvfs2-client-core about it.
191 */
192static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data)
193{
194 gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n");
195 return parse_mount_options(sb, data, 1);
196}
197
198/*
199 * Remount as initiated by pvfs2-client-core on restart. This is used to
200 * repopulate mount information left from previous pvfs2-client-core.
201 *
202 * The idea here is that given a valid superblock, we're
203 * re-initializing the user space client with the initial mount
204 * information specified when the super block was first initialized.
205 * This is very different from the first initialization/creation of a
206 * superblock. We use the special service_priority_operation to make
207 * sure that the mount gets ahead of any other pending operation that
208 * is waiting for servicing. This means that the pvfs2-client won't
209 * fail to start several times, once for each pending operation, before
210 * the client regains all of the mount information from us.
211 * NOTE: this function assumes that the request_mutex is already acquired!
212 */
213int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb)
214{
215 struct orangefs_kernel_op_s *new_op;
216 int ret = -EINVAL;
217
218 gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: called\n");
219
220 new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
221 if (!new_op)
222 return -ENOMEM;
223 strncpy(new_op->upcall.req.fs_mount.orangefs_config_server,
224 orangefs_sb->devname,
225 ORANGEFS_MAX_SERVER_ADDR_LEN);
226
227 gossip_debug(GOSSIP_SUPER_DEBUG,
228 "Attempting ORANGEFS Remount via host %s\n",
229 new_op->upcall.req.fs_mount.orangefs_config_server);
230
231 /*
232 * we assume that the calling function has already acquired the
233 * request_mutex to prevent other operations from bypassing
234 * this one
235 */
236 ret = service_operation(new_op, "orangefs_remount",
237 ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX);
238 gossip_debug(GOSSIP_SUPER_DEBUG,
239 "orangefs_remount: mount got return value of %d\n",
240 ret);
241 if (ret == 0) {
242 /*
243 * store the id assigned to this sb -- it's just a
244 * short-lived mapping that the system interface uses
245 * to map this superblock to a particular mount entry
246 */
247 orangefs_sb->id = new_op->downcall.resp.fs_mount.id;
248 orangefs_sb->mount_pending = 0;
249 }
250
251 op_release(new_op);
252 return ret;
253}
254
255int fsid_key_table_initialize(void)
256{
257 return 0;
258}
259
260void fsid_key_table_finalize(void)
261{
262}
263
264/* Called whenever the VFS dirties the inode in response to atime updates */
265static void orangefs_dirty_inode(struct inode *inode, int flags)
266{
267 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
268
269 gossip_debug(GOSSIP_SUPER_DEBUG,
270 "orangefs_dirty_inode: %pU\n",
271 get_khandle_from_ino(inode));
272 SetAtimeFlag(orangefs_inode);
273}
274
275static const struct super_operations orangefs_s_ops = {
276 .alloc_inode = orangefs_alloc_inode,
277 .destroy_inode = orangefs_destroy_inode,
278 .dirty_inode = orangefs_dirty_inode,
279 .drop_inode = generic_delete_inode,
280 .statfs = orangefs_statfs,
281 .remount_fs = orangefs_remount_fs,
282 .show_options = generic_show_options,
283};
284
285static struct dentry *orangefs_fh_to_dentry(struct super_block *sb,
286 struct fid *fid,
287 int fh_len,
288 int fh_type)
289{
290 struct orangefs_object_kref refn;
291
292 if (fh_len < 5 || fh_type > 2)
293 return NULL;
294
295 ORANGEFS_khandle_from(&(refn.khandle), fid->raw, 16);
296 refn.fs_id = (u32) fid->raw[4];
297 gossip_debug(GOSSIP_SUPER_DEBUG,
298 "fh_to_dentry: handle %pU, fs_id %d\n",
299 &refn.khandle,
300 refn.fs_id);
301
302 return d_obtain_alias(orangefs_iget(sb, &refn));
303}
304
305static int orangefs_encode_fh(struct inode *inode,
306 __u32 *fh,
307 int *max_len,
308 struct inode *parent)
309{
310 int len = parent ? 10 : 5;
311 int type = 1;
312 struct orangefs_object_kref refn;
313
314 if (*max_len < len) {
315 gossip_lerr("fh buffer is too small for encoding\n");
316 *max_len = len;
317 type = 255;
318 goto out;
319 }
320
321 refn = ORANGEFS_I(inode)->refn;
322 ORANGEFS_khandle_to(&refn.khandle, fh, 16);
323 fh[4] = refn.fs_id;
324
325 gossip_debug(GOSSIP_SUPER_DEBUG,
326 "Encoding fh: handle %pU, fsid %u\n",
327 &refn.khandle,
328 refn.fs_id);
329
330
331 if (parent) {
332 refn = ORANGEFS_I(parent)->refn;
333 ORANGEFS_khandle_to(&refn.khandle, (char *) fh + 20, 16);
334 fh[9] = refn.fs_id;
335
336 type = 2;
337 gossip_debug(GOSSIP_SUPER_DEBUG,
338 "Encoding parent: handle %pU, fsid %u\n",
339 &refn.khandle,
340 refn.fs_id);
341 }
342 *max_len = len;
343
344out:
345 return type;
346}
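
A hypothetical view of the encoded handle the function above produces (word
layout only; this struct is not used by the code):

struct orangefs_fh_sketch {
	__u32 khandle[4];		/* fh[0..3]: object khandle, 16 bytes */
	__u32 fs_id;			/* fh[4] */
	__u32 parent_khandle[4];	/* fh[5..8]: only when type == 2 */
	__u32 parent_fs_id;		/* fh[9]:    only when type == 2 */
};
/* len is 5 words for type 1 and 10 for type 2; type 255 signals that
 * the caller's buffer was too small (*max_len is set to what's needed).
 */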
347
348static const struct export_operations orangefs_export_ops = {
349 .encode_fh = orangefs_encode_fh,
350 .fh_to_dentry = orangefs_fh_to_dentry,
351};
352
353static int orangefs_fill_sb(struct super_block *sb,
354 struct orangefs_fs_mount_response *fs_mount,
355 void *data, int silent)
356{
357 int ret = -EINVAL;
358 struct inode *root = NULL;
359 struct dentry *root_dentry = NULL;
360 struct orangefs_object_kref root_object;
361
362 /* alloc and init our private orangefs sb info */
363 sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
364 if (!ORANGEFS_SB(sb))
365 return -ENOMEM;
366 ORANGEFS_SB(sb)->sb = sb;
367
368 ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle;
369 ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id;
370 ORANGEFS_SB(sb)->id = fs_mount->id;
371
372 if (data) {
373 ret = parse_mount_options(sb, data, silent);
374 if (ret)
375 return ret;
376 }
377
378 /* Hang the xattr handlers off the superblock */
379 sb->s_xattr = orangefs_xattr_handlers;
380 sb->s_magic = ORANGEFS_SUPER_MAGIC;
381 sb->s_op = &orangefs_s_ops;
382 sb->s_d_op = &orangefs_dentry_operations;
383
384 sb->s_blocksize = orangefs_bufmap_size_query();
385 sb->s_blocksize_bits = orangefs_bufmap_shift_query();
386 sb->s_maxbytes = MAX_LFS_FILESIZE;
387
388 root_object.khandle = ORANGEFS_SB(sb)->root_khandle;
389 root_object.fs_id = ORANGEFS_SB(sb)->fs_id;
390 gossip_debug(GOSSIP_SUPER_DEBUG,
391 "get inode %pU, fsid %d\n",
392 &root_object.khandle,
393 root_object.fs_id);
394
395 root = orangefs_iget(sb, &root_object);
396 if (IS_ERR(root))
397 return PTR_ERR(root);
398
399 gossip_debug(GOSSIP_SUPER_DEBUG,
400 "Allocated root inode [%p] with mode %x\n",
401 root,
402 root->i_mode);
403
404 /* allocates and places root dentry in dcache */
405 root_dentry = d_make_root(root);
406 if (!root_dentry)
407 return -ENOMEM;
408
409 sb->s_export_op = &orangefs_export_ops;
410 sb->s_root = root_dentry;
411 return 0;
412}
413
414struct dentry *orangefs_mount(struct file_system_type *fst,
415 int flags,
416 const char *devname,
417 void *data)
418{
419 int ret = -EINVAL;
420 struct super_block *sb = ERR_PTR(-EINVAL);
421 struct orangefs_kernel_op_s *new_op;
422 struct dentry *d = ERR_PTR(-EINVAL);
423
424 gossip_debug(GOSSIP_SUPER_DEBUG,
425 "orangefs_mount: called with devname %s\n",
426 devname);
427
428 if (!devname) {
429 gossip_err("ERROR: device name not specified.\n");
430 return ERR_PTR(-EINVAL);
431 }
432
433 new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
434 if (!new_op)
435 return ERR_PTR(-ENOMEM);
436
437 strncpy(new_op->upcall.req.fs_mount.orangefs_config_server,
438 devname,
439 ORANGEFS_MAX_SERVER_ADDR_LEN);
440
441 gossip_debug(GOSSIP_SUPER_DEBUG,
442 "Attempting ORANGEFS Mount via host %s\n",
443 new_op->upcall.req.fs_mount.orangefs_config_server);
444
445 ret = service_operation(new_op, "orangefs_mount", 0);
446 gossip_debug(GOSSIP_SUPER_DEBUG,
447 "orangefs_mount: mount got return value of %d\n", ret);
448 if (ret)
449 goto free_op;
450
451 if (new_op->downcall.resp.fs_mount.fs_id == ORANGEFS_FS_ID_NULL) {
452 gossip_err("ERROR: Retrieved null fs_id\n");
453 ret = -EINVAL;
454 goto free_op;
455 }
456
457 sb = sget(fst, NULL, set_anon_super, flags, NULL);
458
459 if (IS_ERR(sb)) {
460 d = ERR_CAST(sb);
461 goto free_op;
462 }
463
464 ret = orangefs_fill_sb(sb,
465 &new_op->downcall.resp.fs_mount, data,
466 flags & MS_SILENT ? 1 : 0);
467
468 if (ret) {
469 d = ERR_PTR(ret);
470 goto free_op;
471 }
472
473 /*
474 * on successful mount, store the devname that
475 * was used
476 */
477 strncpy(ORANGEFS_SB(sb)->devname,
478 devname,
479 ORANGEFS_MAX_SERVER_ADDR_LEN);
480
481 /* mount_pending must be cleared */
482 ORANGEFS_SB(sb)->mount_pending = 0;
483
484 /*
485 * finally, add this sb to our list of known orangefs
486 * sb's
487 */
488 gossip_debug(GOSSIP_SUPER_DEBUG,
489 "Adding SB %p to orangefs superblocks\n",
490 ORANGEFS_SB(sb));
491 spin_lock(&orangefs_superblocks_lock);
492 list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks);
493 spin_unlock(&orangefs_superblocks_lock);
494 op_release(new_op);
495 return dget(sb->s_root);
496
497free_op:
498 gossip_err("orangefs_mount: mount request failed with %d\n", ret);
499 if (ret == -EINVAL) {
500 gossip_err("Ensure that all orangefs-servers have the same FS configuration files\n");
501 gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n");
502 }
503
504 op_release(new_op);
505
506 return d;
507}
508
509void orangefs_kill_sb(struct super_block *sb)
510{
511 gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n");
512
513 /* generic superblock cleanup provided by the VFS */
514 kill_anon_super(sb);
515
516 /*
517 * issue the unmount to userspace to tell it to remove the
518 * dynamic mount info it has for this superblock
519 */
520 orangefs_unmount_sb(sb);
521
522 /* remove the sb from our list of orangefs specific sb's */
523
524 spin_lock(&orangefs_superblocks_lock);
525 __list_del_entry(&ORANGEFS_SB(sb)->list); /* not list_del_init */
526 ORANGEFS_SB(sb)->list.prev = NULL;
527 spin_unlock(&orangefs_superblocks_lock);
528
529 /*
530 * make sure that any ORANGEFS_DEV_REMOUNT_ALL loop that might've
531 * seen us has completed before we free the dang thing.
532 */
533 mutex_lock(&request_mutex);
534 mutex_unlock(&request_mutex);
535
536 /* free the orangefs superblock private data */
537 kfree(ORANGEFS_SB(sb));
538}
539
540int orangefs_inode_cache_initialize(void)
541{
542 orangefs_inode_cache = kmem_cache_create("orangefs_inode_cache",
543 sizeof(struct orangefs_inode_s),
544 0,
545 ORANGEFS_CACHE_CREATE_FLAGS,
546 orangefs_inode_cache_ctor);
547
548 if (!orangefs_inode_cache) {
549 gossip_err("Cannot create orangefs_inode_cache\n");
550 return -ENOMEM;
551 }
552 return 0;
553}
554
555int orangefs_inode_cache_finalize(void)
556{
557 kmem_cache_destroy(orangefs_inode_cache);
558 return 0;
559}
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
new file mode 100644
index 000000000000..6418dd638680
--- /dev/null
+++ b/fs/orangefs/symlink.c
@@ -0,0 +1,19 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#include "protocol.h"
8#include "orangefs-kernel.h"
9#include "orangefs-bufmap.h"
10
11struct inode_operations orangefs_symlink_inode_operations = {
12 .readlink = generic_readlink,
13 .get_link = simple_get_link,
14 .setattr = orangefs_setattr,
15 .getattr = orangefs_getattr,
16 .listxattr = orangefs_listxattr,
17 .setxattr = generic_setxattr,
18 .permission = orangefs_permission,
19};
diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h
new file mode 100644
index 000000000000..001b20239407
--- /dev/null
+++ b/fs/orangefs/upcall.h
@@ -0,0 +1,246 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#ifndef __UPCALL_H
8#define __UPCALL_H
9
10/*
11 * This header file has been sanitized to fix
12 * 32-64 bit interaction issues between
13 * client-core and device (alignment sketch after the first struct).
14 */
15struct orangefs_io_request_s {
16 __s32 __pad1;
17 __s32 buf_index;
18 __s32 count;
19 __s32 __pad2;
20 __s64 offset;
21 struct orangefs_object_kref refn;
22 enum ORANGEFS_io_type io_type;
23 __s32 readahead_size;
24};
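
A minimal illustration of the padding convention referenced above (mine, not from the patch): the explicit __pad members keep each __s64 at the same byte offset whether client-core is built 32-bit or 64-bit.

	/*
	 * Illustration only.  On 32-bit x86 __s64 is 4-byte aligned, so
	 * without a pad 'offset' would land at byte 12 there, but at
	 * byte 16 on x86-64:
	 *
	 *	struct bad  { __s32 a, b, c;        __s64 offset; };
	 *
	 * With an explicit pad both sides agree on byte 16:
	 *
	 *	struct good { __s32 a, b, c, __pad; __s64 offset; };
	 */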
25
26struct orangefs_lookup_request_s {
27 __s32 sym_follow;
28 __s32 __pad1;
29 struct orangefs_object_kref parent_refn;
30 char d_name[ORANGEFS_NAME_MAX];
31};
32
33struct orangefs_create_request_s {
34 struct orangefs_object_kref parent_refn;
35 struct ORANGEFS_sys_attr_s attributes;
36 char d_name[ORANGEFS_NAME_MAX];
37};
38
39struct orangefs_symlink_request_s {
40 struct orangefs_object_kref parent_refn;
41 struct ORANGEFS_sys_attr_s attributes;
42 char entry_name[ORANGEFS_NAME_MAX];
43 char target[ORANGEFS_NAME_MAX];
44};
45
46struct orangefs_getattr_request_s {
47 struct orangefs_object_kref refn;
48 __u32 mask;
49 __u32 __pad1;
50};
51
52struct orangefs_setattr_request_s {
53 struct orangefs_object_kref refn;
54 struct ORANGEFS_sys_attr_s attributes;
55};
56
57struct orangefs_remove_request_s {
58 struct orangefs_object_kref parent_refn;
59 char d_name[ORANGEFS_NAME_MAX];
60};
61
62struct orangefs_mkdir_request_s {
63 struct orangefs_object_kref parent_refn;
64 struct ORANGEFS_sys_attr_s attributes;
65 char d_name[ORANGEFS_NAME_MAX];
66};
67
68struct orangefs_readdir_request_s {
69 struct orangefs_object_kref refn;
70 __u64 token;
71 __s32 max_dirent_count;
72 __s32 buf_index;
73};
74
75struct orangefs_readdirplus_request_s {
76 struct orangefs_object_kref refn;
77 __u64 token;
78 __s32 max_dirent_count;
79 __u32 mask;
80 __s32 buf_index;
81 __s32 __pad1;
82};
83
84struct orangefs_rename_request_s {
85 struct orangefs_object_kref old_parent_refn;
86 struct orangefs_object_kref new_parent_refn;
87 char d_old_name[ORANGEFS_NAME_MAX];
88 char d_new_name[ORANGEFS_NAME_MAX];
89};
90
91struct orangefs_statfs_request_s {
92 __s32 fs_id;
93 __s32 __pad1;
94};
95
96struct orangefs_truncate_request_s {
97 struct orangefs_object_kref refn;
98 __s64 size;
99};
100
101struct orangefs_mmap_ra_cache_flush_request_s {
102 struct orangefs_object_kref refn;
103};
104
105struct orangefs_fs_mount_request_s {
106 char orangefs_config_server[ORANGEFS_MAX_SERVER_ADDR_LEN];
107};
108
109struct orangefs_fs_umount_request_s {
110 __s32 id;
111 __s32 fs_id;
112 char orangefs_config_server[ORANGEFS_MAX_SERVER_ADDR_LEN];
113};
114
115struct orangefs_getxattr_request_s {
116 struct orangefs_object_kref refn;
117 __s32 key_sz;
118 __s32 __pad1;
119 char key[ORANGEFS_MAX_XATTR_NAMELEN];
120};
121
122struct orangefs_setxattr_request_s {
123 struct orangefs_object_kref refn;
124 struct ORANGEFS_keyval_pair keyval;
125 __s32 flags;
126 __s32 __pad1;
127};
128
129struct orangefs_listxattr_request_s {
130 struct orangefs_object_kref refn;
131 __s32 requested_count;
132 __s32 __pad1;
133 __u64 token;
134};
135
136struct orangefs_removexattr_request_s {
137 struct orangefs_object_kref refn;
138 __s32 key_sz;
139 __s32 __pad1;
140 char key[ORANGEFS_MAX_XATTR_NAMELEN];
141};
142
143struct orangefs_op_cancel_s {
144 __u64 op_tag;
145};
146
147struct orangefs_fsync_request_s {
148 struct orangefs_object_kref refn;
149};
150
151enum orangefs_param_request_type {
152 ORANGEFS_PARAM_REQUEST_SET = 1,
153 ORANGEFS_PARAM_REQUEST_GET = 2
154};
155
156enum orangefs_param_request_op {
157 ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS = 1,
158 ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT = 2,
159 ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT = 3,
160 ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE = 4,
161 ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS = 5,
162 ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE = 6,
163 ORANGEFS_PARAM_REQUEST_OP_PERF_RESET = 7,
164 ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS = 8,
165 ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT = 9,
166 ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT = 10,
167 ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE = 11,
168 ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_TIMEOUT_MSECS = 12,
169 ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_HARD_LIMIT = 13,
170 ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_SOFT_LIMIT = 14,
171 ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_RECLAIM_PERCENTAGE = 15,
172 ORANGEFS_PARAM_REQUEST_OP_CLIENT_DEBUG = 16,
173 ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS = 17,
174 ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT = 18,
175 ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT = 19,
176 ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE = 20,
177 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS = 21,
178 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT = 22,
179 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23,
180 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24,
181 ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25,
182};
183
184struct orangefs_param_request_s {
185 enum orangefs_param_request_type type;
186 enum orangefs_param_request_op op;
187 __s64 value;
188 char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN];
189};
190
191enum orangefs_perf_count_request_type {
192 ORANGEFS_PERF_COUNT_REQUEST_ACACHE = 1,
193 ORANGEFS_PERF_COUNT_REQUEST_NCACHE = 2,
194 ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE = 3,
195};
196
197struct orangefs_perf_count_request_s {
198 enum orangefs_perf_count_request_type type;
199 __s32 __pad1;
200};
201
202struct orangefs_fs_key_request_s {
203 __s32 fsid;
204 __s32 __pad1;
205};
206
207struct orangefs_upcall_s {
208 __s32 type;
209 __u32 uid;
210 __u32 gid;
211 int pid;
212 int tgid;
213 /* Trailers unused but must be retained for protocol compatibility. */
214 __s64 trailer_size;
215 char *trailer_buf;
216
217 union {
218 struct orangefs_io_request_s io;
219 struct orangefs_lookup_request_s lookup;
220 struct orangefs_create_request_s create;
221 struct orangefs_symlink_request_s sym;
222 struct orangefs_getattr_request_s getattr;
223 struct orangefs_setattr_request_s setattr;
224 struct orangefs_remove_request_s remove;
225 struct orangefs_mkdir_request_s mkdir;
226 struct orangefs_readdir_request_s readdir;
227 struct orangefs_readdirplus_request_s readdirplus;
228 struct orangefs_rename_request_s rename;
229 struct orangefs_statfs_request_s statfs;
230 struct orangefs_truncate_request_s truncate;
231 struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush;
232 struct orangefs_fs_mount_request_s fs_mount;
233 struct orangefs_fs_umount_request_s fs_umount;
234 struct orangefs_getxattr_request_s getxattr;
235 struct orangefs_setxattr_request_s setxattr;
236 struct orangefs_listxattr_request_s listxattr;
237 struct orangefs_removexattr_request_s removexattr;
238 struct orangefs_op_cancel_s cancel;
239 struct orangefs_fsync_request_s fsync;
240 struct orangefs_param_request_s param;
241 struct orangefs_perf_count_request_s perf_count;
242 struct orangefs_fs_key_request_s fs_key;
243 } req;
244};
245
246#endif /* __UPCALL_H */
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
new file mode 100644
index 000000000000..31635bc303fe
--- /dev/null
+++ b/fs/orangefs/waitqueue.c
@@ -0,0 +1,357 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 * (C) 2011 Omnibond Systems
4 *
5 * Changes by Acxiom Corporation to implement generic service_operation()
6 * function, Copyright Acxiom Corporation, 2005.
7 *
8 * See COPYING in top-level directory.
9 */
10
11/*
12 * In-kernel waitqueue operations.
13 */
14
15#include "protocol.h"
16#include "orangefs-kernel.h"
17#include "orangefs-bufmap.h"
18
19static int wait_for_matching_downcall(struct orangefs_kernel_op_s *, long, bool);
20static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *);
21
22/*
23 * Walk the list of operations that are present in the request
24 * queue and mark them as purged.
25 * NOTE: This is called from the device close after client-core has
26 * guaranteed that no new operations could appear on the list, since
27 * client-core is about to exit.
28 */
29void purge_waiting_ops(void)
30{
31 struct orangefs_kernel_op_s *op;
32
33 spin_lock(&orangefs_request_list_lock);
34 list_for_each_entry(op, &orangefs_request_list, list) {
35 gossip_debug(GOSSIP_WAIT_DEBUG,
36 "pvfs2-client-core: purging op tag %llu %s\n",
37 llu(op->tag),
38 get_opname_string(op));
39 set_op_state_purged(op);
40 gossip_debug(GOSSIP_DEV_DEBUG,
41 "%s: op:%s: op_state:%d: process:%s:\n",
42 __func__,
43 get_opname_string(op),
44 op->op_state,
45 current->comm);
46 }
47 spin_unlock(&orangefs_request_list_lock);
48}
49
50/*
51 * submits an ORANGEFS operation and waits for it to complete
52 *
53 * Note that op->downcall.status will contain the status of the operation (in
54 * errno format), whether provided by pvfs2-client or a result of failure to
55 * service the operation. If the caller wishes to distinguish, then
56 * op->op_state can be checked to see if it was serviced or not.
57 *
58 * Returns the contents of op->downcall.status for convenience (a caller sketch follows the function).
59 */
60int service_operation(struct orangefs_kernel_op_s *op,
61 const char *op_name,
62 int flags)
63{
64 long timeout = MAX_SCHEDULE_TIMEOUT;
65 int ret = 0;
66
67 DEFINE_WAIT(wait_entry);
68
69 op->upcall.tgid = current->tgid;
70 op->upcall.pid = current->pid;
71
72retry_servicing:
73 op->downcall.status = 0;
74 gossip_debug(GOSSIP_WAIT_DEBUG,
75 "%s: %s op:%p: process:%s: pid:%d:\n",
76 __func__,
77 op_name,
78 op,
79 current->comm,
80 current->pid);
81
82 /*
83 * If ORANGEFS_OP_NO_MUTEX was set in flags, we need to avoid
84 * acquiring the request_mutex because we're servicing a
85 * high priority remount operation and the request_mutex is
86 * already taken.
87 */
88 if (!(flags & ORANGEFS_OP_NO_MUTEX)) {
89 if (flags & ORANGEFS_OP_INTERRUPTIBLE)
90 ret = mutex_lock_interruptible(&request_mutex);
91 else
92 ret = mutex_lock_killable(&request_mutex);
93 /*
94 * check to see if we were interrupted while waiting for
95 * mutex
96 */
97 if (ret < 0) {
98 op->downcall.status = ret;
99 gossip_debug(GOSSIP_WAIT_DEBUG,
100 "%s: service_operation interrupted.\n",
101 __func__);
102 return ret;
103 }
104 }
105
106 /* queue up the operation */
107 spin_lock(&orangefs_request_list_lock);
108 spin_lock(&op->lock);
109 set_op_state_waiting(op);
110 gossip_debug(GOSSIP_DEV_DEBUG,
111 "%s: op:%s: op_state:%d: process:%s:\n",
112 __func__,
113 get_opname_string(op),
114 op->op_state,
115 current->comm);
116 /* add high priority remount op to the front of the line. */
117 if (flags & ORANGEFS_OP_PRIORITY)
118 list_add(&op->list, &orangefs_request_list);
119 else
120 list_add_tail(&op->list, &orangefs_request_list);
121 spin_unlock(&op->lock);
122 wake_up_interruptible(&orangefs_request_list_waitq);
123 if (!__is_daemon_in_service()) {
124 gossip_debug(GOSSIP_WAIT_DEBUG,
125 "%s:client core is NOT in service.\n",
126 __func__);
127 timeout = op_timeout_secs * HZ;
128 }
129 spin_unlock(&orangefs_request_list_lock);
130
131 if (!(flags & ORANGEFS_OP_NO_MUTEX))
132 mutex_unlock(&request_mutex);
133
134 ret = wait_for_matching_downcall(op, timeout,
135 flags & ORANGEFS_OP_INTERRUPTIBLE);
136
137 gossip_debug(GOSSIP_WAIT_DEBUG,
138 "%s: wait_for_matching_downcall returned %d for %p\n",
139 __func__,
140 ret,
141 op);
142
143 /* got matching downcall; make sure status is in errno format */
144 if (!ret) {
145 spin_unlock(&op->lock);
146 op->downcall.status =
147 orangefs_normalize_to_errno(op->downcall.status);
148 ret = op->downcall.status;
149 goto out;
150 }
151
152 /* failed to get matching downcall */
153 if (ret == -ETIMEDOUT) {
154 gossip_err("%s: %s -- wait timed out; aborting attempt.\n",
155 __func__,
156 op_name);
157 }
158
159 /*
160 * remove a waiting op from the request list or
161 * remove an in-progress op from the in-progress list.
162 */
163 orangefs_clean_up_interrupted_operation(op);
164
165 op->downcall.status = ret;
166 /* retry if operation has not been serviced and if requested */
167 if (ret == -EAGAIN) {
168 op->attempts++;
169 timeout = op_timeout_secs * HZ;
170 gossip_debug(GOSSIP_WAIT_DEBUG,
171 "orangefs: tag %llu (%s)"
172 " -- operation to be retried (%d attempt)\n",
173 llu(op->tag),
174 op_name,
175 op->attempts);
176
177 /*
178 * io ops (ops that use the shared memory buffer) have
179 * to be returned to their caller for a retry. Other ops
180 * can just be recycled here.
181 */
182 if (!op->uses_shared_memory)
183 goto retry_servicing;
184 }
185
186out:
187 gossip_debug(GOSSIP_WAIT_DEBUG,
188 "%s: %s returning: %d for %p.\n",
189 __func__,
190 op_name,
191 ret,
192 op);
193 return ret;
194}
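
The calling convention is easiest to see with a minimal caller. This sketch mirrors the real callers elsewhere in the patch; op_alloc(), op_release() and the upcall/downcall fields are real, the wrapper itself is hypothetical, the ORANGEFS_VFS_OP_GETATTR opcode is assumed from the driver's naming, and real getattr callers also fill in a mask:

	/* Hypothetical caller, for illustration only. */
	static int example_getattr(struct orangefs_inode_s *orangefs_inode)
	{
		struct orangefs_kernel_op_s *new_op;
		int ret;

		new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
		if (!new_op)
			return -ENOMEM;
		new_op->upcall.req.getattr.refn = orangefs_inode->refn;

		/* Queue the upcall, sleep until the matching downcall. */
		ret = service_operation(new_op, "example_getattr", 0);

		/* Here ret == new_op->downcall.status, in errno format. */
		op_release(new_op);
		return ret;
	}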
195
196/* This can get called on an I/O op if it had a bad service_operation. */
197bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op)
198{
199 u64 tag = op->tag;
200 if (!op_state_in_progress(op))
201 return false;
202
203 op->slot_to_free = op->upcall.req.io.buf_index;
204 memset(&op->upcall, 0, sizeof(op->upcall));
205 memset(&op->downcall, 0, sizeof(op->downcall));
206 op->upcall.type = ORANGEFS_VFS_OP_CANCEL;
207 op->upcall.req.cancel.op_tag = tag;
208 op->downcall.type = ORANGEFS_VFS_OP_INVALID;
209 op->downcall.status = -1;
210 orangefs_new_tag(op);
211
212 spin_lock(&orangefs_request_list_lock);
213 /* orangefs_request_list_lock is enough of a barrier here */
214 if (!__is_daemon_in_service()) {
215 spin_unlock(&orangefs_request_list_lock);
216 return false;
217 }
218 spin_lock(&op->lock);
219 set_op_state_waiting(op);
220 gossip_debug(GOSSIP_DEV_DEBUG,
221 "%s: op:%s: op_state:%d: process:%s:\n",
222 __func__,
223 get_opname_string(op),
224 op->op_state,
225 current->comm);
226 list_add(&op->list, &orangefs_request_list);
227 spin_unlock(&op->lock);
228 spin_unlock(&orangefs_request_list_lock);
229
230 gossip_debug(GOSSIP_WAIT_DEBUG,
231 "Attempting ORANGEFS operation cancellation of tag %llu\n",
232 llu(tag));
233 return true;
234}
235
236/*
237 * Change an op to the "given up" state and remove it from its list.
238 */
239static void
240orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op)
241{
242 /*
243 * handle interrupted cases depending on what state we were in when
244 * the interruption is detected.
245 *
246 * Called with op->lock held.
247 */
248
249 /*
250 * List manipulation code elsewhere will ignore ops that
251 * have been given up upon.
252 */
253 op->op_state |= OP_VFS_STATE_GIVEN_UP;
254
255 if (list_empty(&op->list)) {
256 /* caught copying to/from daemon */
257 BUG_ON(op_state_serviced(op));
258 spin_unlock(&op->lock);
259 wait_for_completion(&op->waitq);
260 } else if (op_state_waiting(op)) {
261 /*
262 * upcall hasn't been read; remove op from upcall request
263 * list.
264 */
265 spin_unlock(&op->lock);
266 spin_lock(&orangefs_request_list_lock);
267 list_del_init(&op->list);
268 spin_unlock(&orangefs_request_list_lock);
269 gossip_debug(GOSSIP_WAIT_DEBUG,
270 "Interrupted: Removed op %p from request_list\n",
271 op);
272 } else if (op_state_in_progress(op)) {
273 /* op must be removed from the in progress htable */
274 spin_unlock(&op->lock);
275 spin_lock(&htable_ops_in_progress_lock);
276 list_del_init(&op->list);
277 spin_unlock(&htable_ops_in_progress_lock);
278 gossip_debug(GOSSIP_WAIT_DEBUG,
279 "Interrupted: Removed op %p"
280 " from htable_ops_in_progress\n",
281 op);
282 } else {
283 spin_unlock(&op->lock);
284 gossip_err("interrupted operation is in a weird state 0x%x\n",
285 op->op_state);
286 }
287 reinit_completion(&op->waitq);
288}
289
290/*
291 * Sleeps on waitqueue waiting for matching downcall.
292 * If client-core finishes servicing the request, we are good to go;
293 * if client-core exits instead, we get woken up here and retry with a timeout.
294 *
295 * When this call returns to the caller, the specified op will no
296 * longer be in either the in_progress hash table or on the request list.
297 *
298 * Returns 0 on success and -errno on failure
299 * Errors are:
300 * EAGAIN in case we want the caller to requeue and try again.
301 * EINTR/EIO/ETIMEDOUT indicating we are done trying to service this
302 * operation since client-core seems to be exiting too often
303 * or if we were interrupted.
304 *
305 * Returns with op->lock taken.
306 */
307static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
308 long timeout,
309 bool interruptible)
310{
311 long n;
312
313 /*
314 * There's a "schedule_timeout" inside of these wait
315 * primitives, during which the op is out of the hands of the
316 * user process that needs something done and is being
317 * manipulated by the client-core process.
318 */
319 if (interruptible)
320 n = wait_for_completion_interruptible_timeout(&op->waitq,
321 timeout);
322 else
323 n = wait_for_completion_killable_timeout(&op->waitq, timeout);
324
325 spin_lock(&op->lock);
326
327 if (op_state_serviced(op))
328 return 0;
329
330 if (unlikely(n < 0)) {
331 gossip_debug(GOSSIP_WAIT_DEBUG,
332 "%s: operation interrupted, tag %llu, %p\n",
333 __func__,
334 llu(op->tag),
335 op);
336 return -EINTR;
337 }
338 if (op_state_purged(op)) {
339 gossip_debug(GOSSIP_WAIT_DEBUG,
340 "%s: operation purged, tag %llu, %p, %d\n",
341 __func__,
342 llu(op->tag),
343 op,
344 op->attempts);
345 return (op->attempts < ORANGEFS_PURGE_RETRY_COUNT) ?
346 -EAGAIN :
347 -EIO;
348 }
349 /* must have timed out, then... */
350 gossip_debug(GOSSIP_WAIT_DEBUG,
351 "%s: operation timed out, tag %llu, %p, %d)\n",
352 __func__,
353 llu(op->tag),
354 op,
355 op->attempts);
356 return -ETIMEDOUT;
357}
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
new file mode 100644
index 000000000000..ef5da7538cd5
--- /dev/null
+++ b/fs/orangefs/xattr.c
@@ -0,0 +1,545 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS extended attribute operations.
9 */
10
11#include "protocol.h"
12#include "orangefs-kernel.h"
13#include "orangefs-bufmap.h"
14#include <linux/posix_acl_xattr.h>
15#include <linux/xattr.h>
16
17
18#define SYSTEM_ORANGEFS_KEY "system.pvfs2."
19#define SYSTEM_ORANGEFS_KEY_LEN 13
20
21/*
22 * This function returns:
23 * 0 if the key corresponding to name is not meant to be listed as part
24 * of a listxattr,
25 * 1 if the key corresponding to name is meant to be returned as part of
26 * a listxattr.
27 * Keys that start with SYSTEM_ORANGEFS_KEY are the ones to hide.
28 */
29static int is_reserved_key(const char *key, size_t size)
30{
31
32 if (size < SYSTEM_ORANGEFS_KEY_LEN)
33 return 1;
34
35 return strncmp(key, SYSTEM_ORANGEFS_KEY, SYSTEM_ORANGEFS_KEY_LEN) ? 1 : 0;
36}
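
A few worked examples of the intended behavior (illustrative only; sizes include the NUL terminator, matching the lengths the listxattr loop below passes in):

	is_reserved_key("user.foo", 9);				/* 1: shorter than the prefix, listed */
	is_reserved_key("system.posix_acl_access", 24);		/* 1: different prefix, listed */
	is_reserved_key("system.pvfs2.attr", 18);		/* 0: reserved, hidden */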
37
38static inline int convert_to_internal_xattr_flags(int setxattr_flags)
39{
40 int internal_flag = 0;
41
42 if (setxattr_flags & XATTR_REPLACE) {
43 /* Attribute must exist! */
44 internal_flag = ORANGEFS_XATTR_REPLACE;
45 } else if (setxattr_flags & XATTR_CREATE) {
46 /* Attribute must not exist */
47 internal_flag = ORANGEFS_XATTR_CREATE;
48 }
49 return internal_flag;
50}
51
52
53/*
54 * Tries to get the value for a specified key on a given
55 * file into a user-specified buffer. Note that the getxattr
56 * interface allows users to probe the size of an extended
57 * attribute by passing in a size of 0 (sketch below).
58 * Thus our return value is always the size of the attribute
59 * unless the key does not exist for the file and/or if
60 * there were errors in fetching the attribute value.
61 */
62ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix,
63 const char *name, void *buffer, size_t size)
64{
65 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
66 struct orangefs_kernel_op_s *new_op = NULL;
67 ssize_t ret = -ENOMEM;
68 ssize_t length = 0;
69 int fsuid;
70 int fsgid;
71
72 gossip_debug(GOSSIP_XATTR_DEBUG,
73 "%s: prefix %s name %s, buffer_size %zd\n",
74 __func__, prefix, name, size);
75
76 if (name == NULL || (size > 0 && buffer == NULL)) {
77 gossip_err("orangefs_inode_getxattr: bogus NULL pointers\n");
78 return -EINVAL;
79 }
80 if ((strlen(name) + strlen(prefix)) >= ORANGEFS_MAX_XATTR_NAMELEN) {
81 gossip_err("Invalid key length (%d)\n",
82 (int)(strlen(name) + strlen(prefix)));
83 return -EINVAL;
84 }
85
86 fsuid = from_kuid(current_user_ns(), current_fsuid());
87 fsgid = from_kgid(current_user_ns(), current_fsgid());
88
89 gossip_debug(GOSSIP_XATTR_DEBUG,
90 "getxattr on inode %pU, name %s "
91 "(uid %o, gid %o)\n",
92 get_khandle_from_ino(inode),
93 name,
94 fsuid,
95 fsgid);
96
97 down_read(&orangefs_inode->xattr_sem);
98
99 new_op = op_alloc(ORANGEFS_VFS_OP_GETXATTR);
100 if (!new_op)
101 goto out_unlock;
102
103 new_op->upcall.req.getxattr.refn = orangefs_inode->refn;
104 ret = snprintf((char *)new_op->upcall.req.getxattr.key,
105 ORANGEFS_MAX_XATTR_NAMELEN, "%s%s", prefix, name);
106
107 /*
108 * NOTE: Although keys are meant to be NULL terminated textual
109 * strings, I am going to explicitly pass the length just in case
110 * we change this later on...
111 */
112 new_op->upcall.req.getxattr.key_sz = ret + 1;
113
114 ret = service_operation(new_op, "orangefs_inode_getxattr",
115 get_interruptible_flag(inode));
116 if (ret != 0) {
117 if (ret == -ENOENT) {
118 ret = -ENODATA;
119 gossip_debug(GOSSIP_XATTR_DEBUG,
120 "orangefs_inode_getxattr: inode %pU key %s"
121 " does not exist!\n",
122 get_khandle_from_ino(inode),
123 (char *)new_op->upcall.req.getxattr.key);
124 }
125 goto out_release_op;
126 }
127
128 /*
129 * Length returned includes null terminator.
130 */
131 length = new_op->downcall.resp.getxattr.val_sz;
132
133 /*
134 * Just return the length of the queried attribute.
135 */
136 if (size == 0) {
137 ret = length;
138 goto out_release_op;
139 }
140
141 /*
142 * Check to see if key length is > provided buffer size.
143 */
144 if (length > size) {
145 ret = -ERANGE;
146 goto out_release_op;
147 }
148
149 memset(buffer, 0, size);
150 memcpy(buffer, new_op->downcall.resp.getxattr.val, length);
151 gossip_debug(GOSSIP_XATTR_DEBUG,
152 "orangefs_inode_getxattr: inode %pU "
153 "key %s key_sz %d, val_len %d\n",
154 get_khandle_from_ino(inode),
155 (char *)new_op->
156 upcall.req.getxattr.key,
157 (int)new_op->
158 upcall.req.getxattr.key_sz,
159 (int)length);
160
161 ret = length;
162
163out_release_op:
164 op_release(new_op);
165out_unlock:
166 up_read(&orangefs_inode->xattr_sem);
167 return ret;
168}
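
From userspace, the size probe described above is the standard getxattr(2) two-call pattern. An illustrative sketch, assuming <sys/xattr.h> and <stdlib.h>; use() is a hypothetical consumer:

	ssize_t len = getxattr(path, "user.foo", NULL, 0);	/* probe size */
	if (len > 0) {
		char *val = malloc(len);

		if (val && getxattr(path, "user.foo", val, len) == len)
			use(val, len);
		free(val);
	}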
169
170static int orangefs_inode_removexattr(struct inode *inode,
171 const char *prefix,
172 const char *name,
173 int flags)
174{
175 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
176 struct orangefs_kernel_op_s *new_op = NULL;
177 int ret = -ENOMEM;
178
179 down_write(&orangefs_inode->xattr_sem);
180 new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR);
181 if (!new_op)
182 goto out_unlock;
183
184 new_op->upcall.req.removexattr.refn = orangefs_inode->refn;
185 /*
186 * NOTE: Although keys are meant to be NULL terminated
187 * textual strings, I am going to explicitly pass the
188 * length just in case we change this later on...
189 */
190 ret = snprintf((char *)new_op->upcall.req.removexattr.key,
191 ORANGEFS_MAX_XATTR_NAMELEN,
192 "%s%s",
193 (prefix ? prefix : ""),
194 name);
195 new_op->upcall.req.removexattr.key_sz = ret + 1;
196
197 gossip_debug(GOSSIP_XATTR_DEBUG,
198 "orangefs_inode_removexattr: key %s, key_sz %d\n",
199 (char *)new_op->upcall.req.removexattr.key,
200 (int)new_op->upcall.req.removexattr.key_sz);
201
202 ret = service_operation(new_op,
203 "orangefs_inode_removexattr",
204 get_interruptible_flag(inode));
205 if (ret == -ENOENT) {
206 /*
207 * Request to replace a non-existent attribute is an error.
208 */
209 if (flags & XATTR_REPLACE)
210 ret = -ENODATA;
211 else
212 ret = 0;
213 }
214
215 gossip_debug(GOSSIP_XATTR_DEBUG,
216 "orangefs_inode_removexattr: returning %d\n", ret);
217
218 op_release(new_op);
219out_unlock:
220 up_write(&orangefs_inode->xattr_sem);
221 return ret;
222}
223
224/*
225 * Tries to set an attribute for a given key on a file.
226 *
227 * Returns a negative number on error and 0 on success. Key is text, but value
228 * can be binary!
229 */
230int orangefs_inode_setxattr(struct inode *inode, const char *prefix,
231 const char *name, const void *value, size_t size, int flags)
232{
233 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
234 struct orangefs_kernel_op_s *new_op;
235 int internal_flag = 0;
236 int ret = -ENOMEM;
237
238 gossip_debug(GOSSIP_XATTR_DEBUG,
239 "%s: prefix %s, name %s, buffer_size %zd\n",
240 __func__, prefix, name, size);
241
242 if (size < 0 ||
243 size >= ORANGEFS_MAX_XATTR_VALUELEN ||
244 flags < 0) {
245 gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n",
246 (int)size,
247 flags);
248 return -EINVAL;
249 }
250
251 if (name == NULL ||
252 (size > 0 && value == NULL)) {
253 gossip_err("orangefs_inode_setxattr: bogus NULL pointers!\n");
254 return -EINVAL;
255 }
256
257 internal_flag = convert_to_internal_xattr_flags(flags);
258
259 if (prefix) {
260 if (strlen(name) + strlen(prefix) >= ORANGEFS_MAX_XATTR_NAMELEN) {
261 gossip_err
262 ("orangefs_inode_setxattr: bogus key size (%d)\n",
263 (int)(strlen(name) + strlen(prefix)));
264 return -EINVAL;
265 }
266 } else {
267 if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
268 gossip_err
269 ("orangefs_inode_setxattr: bogus key size (%d)\n",
270 (int)(strlen(name)));
271 return -EINVAL;
272 }
273 }
274
275 /* This is equivalent to a removexattr */
276 if (size == 0 && value == NULL) {
277 gossip_debug(GOSSIP_XATTR_DEBUG,
278 "removing xattr (%s%s)\n",
279 prefix,
280 name);
281 return orangefs_inode_removexattr(inode, prefix, name, flags);
282 }
283
284 gossip_debug(GOSSIP_XATTR_DEBUG,
285 "setxattr on inode %pU, name %s\n",
286 get_khandle_from_ino(inode),
287 name);
288
289 down_write(&orangefs_inode->xattr_sem);
290 new_op = op_alloc(ORANGEFS_VFS_OP_SETXATTR);
291 if (!new_op)
292 goto out_unlock;
293
294
295 new_op->upcall.req.setxattr.refn = orangefs_inode->refn;
296 new_op->upcall.req.setxattr.flags = internal_flag;
297 /*
298 * NOTE: Although keys are meant to be NULL terminated textual
299 * strings, I am going to explicitly pass the length just in
300 * case we change this later on...
301 */
302 ret = snprintf((char *)new_op->upcall.req.setxattr.keyval.key,
303 ORANGEFS_MAX_XATTR_NAMELEN,
304 "%s%s",
305 prefix, name);
306 new_op->upcall.req.setxattr.keyval.key_sz = ret + 1;
307 memcpy(new_op->upcall.req.setxattr.keyval.val, value, size);
308 new_op->upcall.req.setxattr.keyval.val_sz = size;
309
310 gossip_debug(GOSSIP_XATTR_DEBUG,
311 "orangefs_inode_setxattr: key %s, key_sz %d "
312 " value size %zd\n",
313 (char *)new_op->upcall.req.setxattr.keyval.key,
314 (int)new_op->upcall.req.setxattr.keyval.key_sz,
315 size);
316
317 ret = service_operation(new_op,
318 "orangefs_inode_setxattr",
319 get_interruptible_flag(inode));
320
321 gossip_debug(GOSSIP_XATTR_DEBUG,
322 "orangefs_inode_setxattr: returning %d\n",
323 ret);
324
325 /* when request is serviced properly, free req op struct */
326 op_release(new_op);
327out_unlock:
328 up_write(&orangefs_inode->xattr_sem);
329 return ret;
330}
331
332/*
333 * Tries to get a specified object's keys into a user-specified buffer of a
334 * given size. Note that like the previous instances of xattr routines, this
335 * also allows you to pass in a NULL pointer and 0 size to probe the size for
336 * subsequent memory allocations. Thus our return value is always the size of
337 * all the keys unless there were errors in fetching the keys!
338 */
339ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size)
340{
341 struct inode *inode = dentry->d_inode;
342 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
343 struct orangefs_kernel_op_s *new_op;
344 __u64 token = ORANGEFS_ITERATE_START;
345 ssize_t ret = -ENOMEM;
346 ssize_t total = 0;
347 int count_keys = 0;
348 int key_size;
349 int i = 0;
350 int returned_count = 0;
351
352 if (size > 0 && buffer == NULL) {
353 gossip_err("%s: bogus NULL pointers\n", __func__);
354 return -EINVAL;
355 }
356 if (size < 0) {
357 gossip_err("Invalid size (%d)\n", (int)size);
358 return -EINVAL;
359 }
360
361 down_read(&orangefs_inode->xattr_sem);
362 new_op = op_alloc(ORANGEFS_VFS_OP_LISTXATTR);
363 if (!new_op)
364 goto out_unlock;
365
366 if (buffer && size > 0)
367 memset(buffer, 0, size);
368
369try_again:
370 key_size = 0;
371 new_op->upcall.req.listxattr.refn = orangefs_inode->refn;
372 new_op->upcall.req.listxattr.token = token;
373 new_op->upcall.req.listxattr.requested_count =
374 (size == 0) ? 0 : ORANGEFS_MAX_XATTR_LISTLEN;
375 ret = service_operation(new_op, __func__,
376 get_interruptible_flag(inode));
377 if (ret != 0)
378 goto done;
379
380 if (size == 0) {
381 /*
382 * This is a bit of a big upper limit, but I did not want to
383 * spend too much time getting this correct, since users end
384 * up allocating memory rather than us...
385 */
386 total = new_op->downcall.resp.listxattr.returned_count *
387 ORANGEFS_MAX_XATTR_NAMELEN;
388 goto done;
389 }
390
391 returned_count = new_op->downcall.resp.listxattr.returned_count;
392 if (returned_count < 0 ||
393 returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) {
394 gossip_err("%s: impossible value for returned_count:%d:\n",
395 __func__,
396 returned_count);
397 ret = -EIO;
398 goto done;
399 }
400
401 /*
402 * Check to see how much can be fit in the buffer. Fit only whole keys.
403 */
404 for (i = 0; i < returned_count; i++) {
405 if (new_op->downcall.resp.listxattr.lengths[i] < 0 ||
406 new_op->downcall.resp.listxattr.lengths[i] >
407 ORANGEFS_MAX_XATTR_NAMELEN) {
408 gossip_err("%s: impossible value for lengths[%d]\n",
409 __func__,
410 new_op->downcall.resp.listxattr.lengths[i]);
411 ret = -EIO;
412 goto done;
413 }
414 if (total + new_op->downcall.resp.listxattr.lengths[i] > size)
415 goto done;
416
417 /*
418 * Since many dumb programs try to setxattr() on our reserved
419 * xattrs, this is a feeble attempt at defeating those by not
420 * listing them in the output of listxattr... sigh.
421 */
422 if (is_reserved_key(new_op->downcall.resp.listxattr.key +
423 key_size,
424 new_op->downcall.resp.
425 listxattr.lengths[i])) {
426 gossip_debug(GOSSIP_XATTR_DEBUG, "Copying key %d -> %s\n",
427 i, new_op->downcall.resp.listxattr.key +
428 key_size);
429 memcpy(buffer + total,
430 new_op->downcall.resp.listxattr.key + key_size,
431 new_op->downcall.resp.listxattr.lengths[i]);
432 total += new_op->downcall.resp.listxattr.lengths[i];
433 count_keys++;
434 } else {
435 gossip_debug(GOSSIP_XATTR_DEBUG, "[RESERVED] key %d -> %s\n",
436 i, new_op->downcall.resp.listxattr.key +
437 key_size);
438 }
439 key_size += new_op->downcall.resp.listxattr.lengths[i];
440 }
441
442 /*
443 * Since the buffer was large enough, we might have to continue
444 * fetching more keys!
445 */
446 token = new_op->downcall.resp.listxattr.token;
447 if (token != ORANGEFS_ITERATE_END)
448 goto try_again;
449
450done:
451 gossip_debug(GOSSIP_XATTR_DEBUG, "%s: returning %d"
452 " [size of buffer %ld] (filled in %d keys)\n",
453 __func__,
454 ret ? (int)ret : (int)total,
455 (long)size,
456 count_keys);
457 op_release(new_op);
458 if (ret == 0)
459 ret = total;
460out_unlock:
461 up_read(&orangefs_inode->xattr_sem);
462 return ret;
463}
464
465static int orangefs_xattr_set_default(const struct xattr_handler *handler,
466 struct dentry *dentry,
467 const char *name,
468 const void *buffer,
469 size_t size,
470 int flags)
471{
472 return orangefs_inode_setxattr(dentry->d_inode,
473 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
474 name,
475 buffer,
476 size,
477 flags);
478}
479
480static int orangefs_xattr_get_default(const struct xattr_handler *handler,
481 struct dentry *dentry,
482 const char *name,
483 void *buffer,
484 size_t size)
485{
486 return orangefs_inode_getxattr(dentry->d_inode,
487 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
488 name,
489 buffer,
490 size);
491
492}
493
494static int orangefs_xattr_set_trusted(const struct xattr_handler *handler,
495 struct dentry *dentry,
496 const char *name,
497 const void *buffer,
498 size_t size,
499 int flags)
500{
501 return orangefs_inode_setxattr(dentry->d_inode,
502 ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
503 name,
504 buffer,
505 size,
506 flags);
507}
508
509static int orangefs_xattr_get_trusted(const struct xattr_handler *handler,
510 struct dentry *dentry,
511 const char *name,
512 void *buffer,
513 size_t size)
514{
515 return orangefs_inode_getxattr(dentry->d_inode,
516 ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
517 name,
518 buffer,
519 size);
520}
521
522static struct xattr_handler orangefs_xattr_trusted_handler = {
523 .prefix = ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
524 .get = orangefs_xattr_get_trusted,
525 .set = orangefs_xattr_set_trusted,
526};
527
528static struct xattr_handler orangefs_xattr_default_handler = {
529 /*
530 * NOTE: this is set to be the empty string,
531 * so that all un-prefixed xattr keys get caught
532 * here!
533 */
534 .prefix = ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
535 .get = orangefs_xattr_get_default,
536 .set = orangefs_xattr_set_default,
537};
538
539const struct xattr_handler *orangefs_xattr_handlers[] = {
540 &posix_acl_access_xattr_handler,
541 &posix_acl_default_xattr_handler,
542 &orangefs_xattr_trusted_handler,
543 &orangefs_xattr_default_handler,
544 NULL
545};
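
For orientation, a note on dispatch (generic VFS behavior of this era, not something this patch implements): the VFS selects the handler whose .prefix matches the attribute name and passes the handler the name with that prefix stripped, which is why the wrappers above re-prepend their prefix before building the upcall. Assuming ORANGEFS_XATTR_NAME_TRUSTED_PREFIX is "trusted.":

	/*
	 * Illustration only:
	 *	setxattr("/mnt/orangefs/f", "trusted.foo", "bar", 3, 0);
	 * matches orangefs_xattr_trusted_handler and ends up as
	 *	orangefs_inode_setxattr(inode, "trusted.", "foo", "bar", 3, 0);
	 */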
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index d894e7cd9a86..cc514da6f3e7 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -7,6 +7,7 @@
7 * the Free Software Foundation. 7 * the Free Software Foundation.
8 */ 8 */
9 9
10#include <linux/module.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/slab.h> 12#include <linux/slab.h>
12#include <linux/file.h> 13#include <linux/file.h>
@@ -16,10 +17,41 @@
16#include <linux/uaccess.h> 17#include <linux/uaccess.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/namei.h> 19#include <linux/namei.h>
20#include <linux/fdtable.h>
21#include <linux/ratelimit.h>
19#include "overlayfs.h" 22#include "overlayfs.h"
20 23
21#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) 24#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
22 25
26static bool __read_mostly ovl_check_copy_up;
27module_param_named(check_copy_up, ovl_check_copy_up, bool,
28 S_IWUSR | S_IRUGO);
29MODULE_PARM_DESC(ovl_check_copy_up,
30 "Warn on copy-up when causing process also has a R/O fd open");
31
32static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
33{
34 const struct dentry *dentry = data;
35
36 if (f->f_inode == d_inode(dentry))
37 pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
38 f, fd, current->pid, current->comm);
39 return 0;
40}
41
42/*
43 * Check the fds open by this process and warn if something like the following
44 * scenario is about to occur:
45 *
46 * fd1 = open("foo", O_RDONLY);
47 * fd2 = open("foo", O_RDWR);
48 */
49static void ovl_do_check_copy_up(struct dentry *dentry)
50{
51 if (ovl_check_copy_up)
52 iterate_fd(current->files, 0, ovl_check_fd, dentry);
53}
54
23int ovl_copy_xattr(struct dentry *old, struct dentry *new) 55int ovl_copy_xattr(struct dentry *old, struct dentry *new)
24{ 56{
25 ssize_t list_size, size, value_size = 0; 57 ssize_t list_size, size, value_size = 0;
@@ -235,6 +267,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
235 267
236 if (S_ISREG(stat->mode)) { 268 if (S_ISREG(stat->mode)) {
237 struct path upperpath; 269 struct path upperpath;
270
238 ovl_path_upper(dentry, &upperpath); 271 ovl_path_upper(dentry, &upperpath);
239 BUG_ON(upperpath.dentry != NULL); 272 BUG_ON(upperpath.dentry != NULL);
240 upperpath.dentry = newdentry; 273 upperpath.dentry = newdentry;
@@ -309,6 +342,8 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
309 if (WARN_ON(!workdir)) 342 if (WARN_ON(!workdir))
310 return -EROFS; 343 return -EROFS;
311 344
345 ovl_do_check_copy_up(lowerpath->dentry);
346
312 ovl_path_upper(parent, &parentpath); 347 ovl_path_upper(parent, &parentpath);
313 upperdir = parentpath.dentry; 348 upperdir = parentpath.dentry;
314 349
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 52f6de5d40a9..b3fc0a35bf62 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -596,21 +596,25 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
596{ 596{
597 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); 597 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
598 struct inode *dir = upperdir->d_inode; 598 struct inode *dir = upperdir->d_inode;
599 struct dentry *upper = ovl_dentry_upper(dentry); 599 struct dentry *upper;
600 int err; 600 int err;
601 601
602 inode_lock_nested(dir, I_MUTEX_PARENT); 602 inode_lock_nested(dir, I_MUTEX_PARENT);
603 upper = lookup_one_len(dentry->d_name.name, upperdir,
604 dentry->d_name.len);
605 err = PTR_ERR(upper);
606 if (IS_ERR(upper))
607 goto out_unlock;
608
603 err = -ESTALE; 609 err = -ESTALE;
604 if (upper->d_parent == upperdir) { 610 if (upper == ovl_dentry_upper(dentry)) {
605 /* Don't let d_delete() think it can reset d_inode */
606 dget(upper);
607 if (is_dir) 611 if (is_dir)
608 err = vfs_rmdir(dir, upper); 612 err = vfs_rmdir(dir, upper);
609 else 613 else
610 err = vfs_unlink(dir, upper, NULL); 614 err = vfs_unlink(dir, upper, NULL);
611 dput(upper);
612 ovl_dentry_version_inc(dentry->d_parent); 615 ovl_dentry_version_inc(dentry->d_parent);
613 } 616 }
617 dput(upper);
614 618
615 /* 619 /*
616 * Keeping this dentry hashed would mean having to release 620 * Keeping this dentry hashed would mean having to release
@@ -620,6 +624,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
620 */ 624 */
621 if (!err) 625 if (!err)
622 d_drop(dentry); 626 d_drop(dentry);
627out_unlock:
623 inode_unlock(dir); 628 inode_unlock(dir);
624 629
625 return err; 630 return err;
@@ -714,7 +719,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
714 struct dentry *trap; 719 struct dentry *trap;
715 bool old_opaque; 720 bool old_opaque;
716 bool new_opaque; 721 bool new_opaque;
717 bool new_create = false;
718 bool cleanup_whiteout = false; 722 bool cleanup_whiteout = false;
719 bool overwrite = !(flags & RENAME_EXCHANGE); 723 bool overwrite = !(flags & RENAME_EXCHANGE);
720 bool is_dir = d_is_dir(old); 724 bool is_dir = d_is_dir(old);
@@ -840,29 +844,38 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
840 844
841 trap = lock_rename(new_upperdir, old_upperdir); 845 trap = lock_rename(new_upperdir, old_upperdir);
842 846
843 olddentry = ovl_dentry_upper(old); 847
844 newdentry = ovl_dentry_upper(new); 848 olddentry = lookup_one_len(old->d_name.name, old_upperdir,
845 if (newdentry) { 849 old->d_name.len);
850 err = PTR_ERR(olddentry);
851 if (IS_ERR(olddentry))
852 goto out_unlock;
853
854 err = -ESTALE;
855 if (olddentry != ovl_dentry_upper(old))
856 goto out_dput_old;
857
858 newdentry = lookup_one_len(new->d_name.name, new_upperdir,
859 new->d_name.len);
860 err = PTR_ERR(newdentry);
861 if (IS_ERR(newdentry))
862 goto out_dput_old;
863
864 err = -ESTALE;
865 if (ovl_dentry_upper(new)) {
846 if (opaquedir) { 866 if (opaquedir) {
847 newdentry = opaquedir; 867 if (newdentry != opaquedir)
848 opaquedir = NULL; 868 goto out_dput;
849 } else { 869 } else {
850 dget(newdentry); 870 if (newdentry != ovl_dentry_upper(new))
871 goto out_dput;
851 } 872 }
852 } else { 873 } else {
853 new_create = true; 874 if (!d_is_negative(newdentry) &&
854 newdentry = lookup_one_len(new->d_name.name, new_upperdir, 875 (!new_opaque || !ovl_is_whiteout(newdentry)))
855 new->d_name.len); 876 goto out_dput;
856 err = PTR_ERR(newdentry);
857 if (IS_ERR(newdentry))
858 goto out_unlock;
859 } 877 }
860 878
861 err = -ESTALE;
862 if (olddentry->d_parent != old_upperdir)
863 goto out_dput;
864 if (newdentry->d_parent != new_upperdir)
865 goto out_dput;
866 if (olddentry == trap) 879 if (olddentry == trap)
867 goto out_dput; 880 goto out_dput;
868 if (newdentry == trap) 881 if (newdentry == trap)
@@ -925,6 +938,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
925 938
926out_dput: 939out_dput:
927 dput(newdentry); 940 dput(newdentry);
941out_dput_old:
942 dput(olddentry);
928out_unlock: 943out_unlock:
929 unlock_rename(new_upperdir, old_upperdir); 944 unlock_rename(new_upperdir, old_upperdir);
930out_revert_creds: 945out_revert_creds:
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 99b4168c36ff..6a7090f4a441 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -166,6 +166,7 @@ extern const struct file_operations ovl_dir_operations;
166int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); 166int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
167void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); 167void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
168void ovl_cache_free(struct list_head *list); 168void ovl_cache_free(struct list_head *list);
169int ovl_check_d_type_supported(struct path *realpath);
169 170
170/* inode.c */ 171/* inode.c */
171int ovl_setattr(struct dentry *dentry, struct iattr *attr); 172int ovl_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index fdaf28f75e12..6ec1e43a9a54 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -36,13 +36,14 @@ struct ovl_dir_cache {
36 36
37struct ovl_readdir_data { 37struct ovl_readdir_data {
38 struct dir_context ctx; 38 struct dir_context ctx;
39 bool is_merge; 39 bool is_lowest;
40 struct rb_root root; 40 struct rb_root root;
41 struct list_head *list; 41 struct list_head *list;
42 struct list_head middle; 42 struct list_head middle;
43 struct ovl_cache_entry *first_maybe_whiteout; 43 struct ovl_cache_entry *first_maybe_whiteout;
44 int count; 44 int count;
45 int err; 45 int err;
46 bool d_type_supported;
46}; 47};
47 48
48struct ovl_dir_file { 49struct ovl_dir_file {
@@ -139,9 +140,9 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
139 return 0; 140 return 0;
140} 141}
141 142
142static int ovl_fill_lower(struct ovl_readdir_data *rdd, 143static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
143 const char *name, int namelen, 144 const char *name, int namelen,
144 loff_t offset, u64 ino, unsigned int d_type) 145 loff_t offset, u64 ino, unsigned int d_type)
145{ 146{
146 struct ovl_cache_entry *p; 147 struct ovl_cache_entry *p;
147 148
@@ -193,10 +194,10 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name,
193 container_of(ctx, struct ovl_readdir_data, ctx); 194 container_of(ctx, struct ovl_readdir_data, ctx);
194 195
195 rdd->count++; 196 rdd->count++;
196 if (!rdd->is_merge) 197 if (!rdd->is_lowest)
197 return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); 198 return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
198 else 199 else
199 return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); 200 return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
200} 201}
201 202
202static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) 203static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
@@ -289,7 +290,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
289 .ctx.actor = ovl_fill_merge, 290 .ctx.actor = ovl_fill_merge,
290 .list = list, 291 .list = list,
291 .root = RB_ROOT, 292 .root = RB_ROOT,
292 .is_merge = false, 293 .is_lowest = false,
293 }; 294 };
294 int idx, next; 295 int idx, next;
295 296
@@ -306,7 +307,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
306 * allows offsets to be reasonably constant 307 * allows offsets to be reasonably constant
307 */ 308 */
308 list_add(&rdd.middle, rdd.list); 309 list_add(&rdd.middle, rdd.list);
309 rdd.is_merge = true; 310 rdd.is_lowest = true;
310 err = ovl_dir_read(&realpath, &rdd); 311 err = ovl_dir_read(&realpath, &rdd);
311 list_del(&rdd.middle); 312 list_del(&rdd.middle);
312 } 313 }
@@ -577,3 +578,39 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
577 } 578 }
578 inode_unlock(upper->d_inode); 579 inode_unlock(upper->d_inode);
579} 580}
581
582static int ovl_check_d_type(struct dir_context *ctx, const char *name,
583 int namelen, loff_t offset, u64 ino,
584 unsigned int d_type)
585{
586 struct ovl_readdir_data *rdd =
587 container_of(ctx, struct ovl_readdir_data, ctx);
588
589 /* Even if d_type is not supported, DT_DIR is returned for . and .. */
590 if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
591 return 0;
592
593 if (d_type != DT_UNKNOWN)
594 rdd->d_type_supported = true;
595
596 return 0;
597}
598
599/*
600 * Returns 1 if d_type is supported, 0 if not supported/unknown. Negative
601 * values if an error is encountered.
602 */
603int ovl_check_d_type_supported(struct path *realpath)
604{
605 int err;
606 struct ovl_readdir_data rdd = {
607 .ctx.actor = ovl_check_d_type,
608 .d_type_supported = false,
609 };
610
611 err = ovl_dir_read(realpath, &rdd);
612 if (err)
613 return err;
614
615 return rdd.d_type_supported;
616}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 619ad4b016d2..ef64984c9bbc 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -936,7 +936,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
936 936
937 err = -EINVAL; 937 err = -EINVAL;
938 if (!ufs->config.lowerdir) { 938 if (!ufs->config.lowerdir) {
939 pr_err("overlayfs: missing 'lowerdir'\n"); 939 if (!silent)
940 pr_err("overlayfs: missing 'lowerdir'\n");
940 goto out_free_config; 941 goto out_free_config;
941 } 942 }
942 943
@@ -1028,6 +1029,21 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
1028 sb->s_flags |= MS_RDONLY; 1029 sb->s_flags |= MS_RDONLY;
1029 ufs->workdir = NULL; 1030 ufs->workdir = NULL;
1030 } 1031 }
1032
1033 /*
1034 * Upper should support d_type, else whiteouts are visible.
1035 * Given workdir and upper are on same fs, we can do
1036 * Given that workdir and upper are on the same fs, we can do
1037 */
1038 err = ovl_check_d_type_supported(&workpath);
1039 if (err < 0)
1040 goto out_put_workdir;
1041
1042 if (!err) {
1043 pr_err("overlayfs: upper fs needs to support d_type.\n");
1044 err = -EINVAL;
1045 goto out_put_workdir;
1046 }
1031 } 1047 }
1032 1048
1033 err = -ENOMEM; 1049 err = -ENOMEM;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4f764c2ac1a5..b1755b23893e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -434,7 +434,7 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
434 && !lookup_symbol_name(wchan, symname)) 434 && !lookup_symbol_name(wchan, symname))
435 seq_printf(m, "%s", symname); 435 seq_printf(m, "%s", symname);
436 else 436 else
437 seq_putc(m, '0'); 437 seq_puts(m, "0\n");
438 438
439 return 0; 439 return 0;
440} 440}
@@ -2158,6 +2158,7 @@ static const struct file_operations proc_map_files_operations = {
2158 .llseek = default_llseek, 2158 .llseek = default_llseek,
2159}; 2159};
2160 2160
2161#ifdef CONFIG_CHECKPOINT_RESTORE
2161struct timers_private { 2162struct timers_private {
2162 struct pid *pid; 2163 struct pid *pid;
2163 struct task_struct *task; 2164 struct task_struct *task;
@@ -2256,6 +2257,73 @@ static const struct file_operations proc_timers_operations = {
2256 .llseek = seq_lseek, 2257 .llseek = seq_lseek,
2257 .release = seq_release_private, 2258 .release = seq_release_private,
2258}; 2259};
2260#endif
2261
2262static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
2263 size_t count, loff_t *offset)
2264{
2265 struct inode *inode = file_inode(file);
2266 struct task_struct *p;
2267 u64 slack_ns;
2268 int err;
2269
2270 err = kstrtoull_from_user(buf, count, 10, &slack_ns);
2271 if (err < 0)
2272 return err;
2273
2274 p = get_proc_task(inode);
2275 if (!p)
2276 return -ESRCH;
2277
2278 if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
2279 task_lock(p);
2280 if (slack_ns == 0)
2281 p->timer_slack_ns = p->default_timer_slack_ns;
2282 else
2283 p->timer_slack_ns = slack_ns;
2284 task_unlock(p);
2285 } else
2286 count = -EPERM;
2287
2288 put_task_struct(p);
2289
2290 return count;
2291}
2292
2293static int timerslack_ns_show(struct seq_file *m, void *v)
2294{
2295 struct inode *inode = m->private;
2296 struct task_struct *p;
2297 int err = 0;
2298
2299 p = get_proc_task(inode);
2300 if (!p)
2301 return -ESRCH;
2302
2303 if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
2304 task_lock(p);
2305 seq_printf(m, "%llu\n", p->timer_slack_ns);
2306 task_unlock(p);
2307 } else
2308 err = -EPERM;
2309
2310 put_task_struct(p);
2311
2312 return err;
2313}
2314
2315static int timerslack_ns_open(struct inode *inode, struct file *filp)
2316{
2317 return single_open(filp, timerslack_ns_show, inode);
2318}
2319
2320static const struct file_operations proc_pid_set_timerslack_ns_operations = {
2321 .open = timerslack_ns_open,
2322 .read = seq_read,
2323 .write = timerslack_ns_write,
2324 .llseek = seq_lseek,
2325 .release = single_release,
2326};
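
A userspace sketch of the new file (path per the REG() entry added below; per timerslack_ns_write() above, writing 0 restores the task's default slack). Illustration only, not patch code:

	#include <stdio.h>
	#include <sys/types.h>

	static int set_timerslack(pid_t pid, unsigned long long ns)
	{
		char path[64];
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%d/timerslack_ns", (int)pid);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fprintf(f, "%llu\n", ns);	/* 0 resets to the default */
		return fclose(f);
	}

For example, set_timerslack(getpid(), 50000) asks for 50us of slack (getpid() needs <unistd.h>).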
2259 2327
2260static int proc_pident_instantiate(struct inode *dir, 2328static int proc_pident_instantiate(struct inode *dir,
2261 struct dentry *dentry, struct task_struct *task, const void *ptr) 2329 struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2831,6 +2899,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2831#ifdef CONFIG_CHECKPOINT_RESTORE 2899#ifdef CONFIG_CHECKPOINT_RESTORE
2832 REG("timers", S_IRUGO, proc_timers_operations), 2900 REG("timers", S_IRUGO, proc_timers_operations),
2833#endif 2901#endif
2902 REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
2834}; 2903};
2835 2904
2836static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) 2905static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index df4661abadc4..83720460c5bc 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -29,10 +29,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
29 unsigned long committed; 29 unsigned long committed;
30 long cached; 30 long cached;
31 long available; 31 long available;
32 unsigned long pagecache;
33 unsigned long wmark_low = 0;
34 unsigned long pages[NR_LRU_LISTS]; 32 unsigned long pages[NR_LRU_LISTS];
35 struct zone *zone;
36 int lru; 33 int lru;
37 34
38/* 35/*
@@ -51,33 +48,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
51 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 48 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
52 pages[lru] = global_page_state(NR_LRU_BASE + lru); 49 pages[lru] = global_page_state(NR_LRU_BASE + lru);
53 50
54 for_each_zone(zone) 51 available = si_mem_available();
55 wmark_low += zone->watermark[WMARK_LOW];
56
57 /*
58 * Estimate the amount of memory available for userspace allocations,
59 * without causing swapping.
60 */
61 available = i.freeram - totalreserve_pages;
62
63 /*
64 * Not all the page cache can be freed, otherwise the system will
65 * start swapping. Assume at least half of the page cache, or the
66 * low watermark worth of cache, needs to stay.
67 */
68 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
69 pagecache -= min(pagecache / 2, wmark_low);
70 available += pagecache;
71
72 /*
73 * Part of the reclaimable slab consists of items that are in use,
74 * and cannot be freed. Cap this estimate at the low watermark.
75 */
76 available += global_page_state(NR_SLAB_RECLAIMABLE) -
77 min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
78
79 if (available < 0)
80 available = 0;
81 52
82 /* 53 /*
83 * Tagged format, for easy grepping and expansion. 54 * Tagged format, for easy grepping and expansion.
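For reference, the estimate that si_mem_available() now centralizes is the logic deleted above; a condensed sketch, not the helper's verbatim body (global_page_state(NR_FREE_PAGES) stands in for the old i.freeram):

static long mem_available_estimate(void)
{
	unsigned long wmark_low = 0, pagecache;
	long available;
	struct zone *zone;

	for_each_zone(zone)
		wmark_low += zone->watermark[WMARK_LOW];

	/* Free pages minus the reserves the kernel keeps for itself. */
	available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;

	/* Assume half the page cache, or the low watermark, must stay. */
	pagecache = global_page_state(NR_ACTIVE_FILE) +
		    global_page_state(NR_INACTIVE_FILE);
	pagecache -= min(pagecache / 2, wmark_low);
	available += pagecache;

	/* Likewise for reclaimable slab, capped at the low watermark. */
	available += global_page_state(NR_SLAB_RECLAIMABLE) -
		     min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);

	return available < 0 ? 0 : available;
}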
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 276f12431dbf..72cb26f85d58 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -28,6 +28,9 @@ static const struct proc_ns_operations *ns_entries[] = {
28 &userns_operations, 28 &userns_operations,
29#endif 29#endif
30 &mntns_operations, 30 &mntns_operations,
31#ifdef CONFIG_CGROUPS
32 &cgroupns_operations,
33#endif
31}; 34};
32 35
33static const char *proc_ns_get_link(struct dentry *dentry, 36static const char *proc_ns_get_link(struct dentry *dentry,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b2855eea5405..712f1b9992cc 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
103 * pseudo flags for the well known (anonymous) memory mapped pages 103 * pseudo flags for the well known (anonymous) memory mapped pages
104 * 104 *
105 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the 105 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
106 * simple test in page_mapcount() is not enough. 106 * simple test in page_mapped() is not enough.
107 */ 107 */
108 if (!PageSlab(page) && page_mapcount(page)) 108 if (!PageSlab(page) && page_mapped(page))
109 u |= 1 << KPF_MMAP; 109 u |= 1 << KPF_MMAP;
110 if (PageAnon(page)) 110 if (PageAnon(page))
111 u |= 1 << KPF_ANON; 111 u |= 1 << KPF_ANON;
@@ -148,6 +148,8 @@ u64 stable_page_flags(struct page *page)
148 */ 148 */
149 if (PageBuddy(page)) 149 if (PageBuddy(page))
150 u |= 1 << KPF_BUDDY; 150 u |= 1 << KPF_BUDDY;
151 else if (page_count(page) == 0 && is_free_buddy_page(page))
152 u |= 1 << KPF_BUDDY;
151 153
152 if (PageBalloon(page)) 154 if (PageBalloon(page))
153 u |= 1 << KPF_BALLOON; 155 u |= 1 << KPF_BALLOON;
@@ -158,6 +160,8 @@ u64 stable_page_flags(struct page *page)
158 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); 160 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
159 161
160 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 162 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
163 if (PageTail(page) && PageSlab(compound_head(page)))
164 u |= 1 << KPF_SLAB;
161 165
162 u |= kpf_copy_bit(k, KPF_ERROR, PG_error); 166 u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
163 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); 167 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
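The net effect of the two hunks above is that pages inside a free buddy block other than its head, and slab tail pages, now report KPF_BUDDY and KPF_SLAB. A small userspace probe to confirm it (our own; bit numbers per include/uapi/linux/kernel-page-flags.h):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define KPF_SLAB  7
#define KPF_BUDDY 10

int main(int argc, char **argv)
{
	uint64_t pfn, flags;
	int fd;

	if (argc != 2)
		return 1;
	pfn = strtoull(argv[1], NULL, 0);
	fd = open("/proc/kpageflags", O_RDONLY);
	/* Each PFN has one 8-byte flags word at offset pfn * 8. */
	if (fd < 0 || pread(fd, &flags, 8, pfn * 8) != 8)
		return 1;
	printf("pfn %llu: buddy=%d slab=%d\n", (unsigned long long)pfn,
	       !!(flags & (1ULL << KPF_BUDDY)), !!(flags & (1ULL << KPF_SLAB)));
	close(fd);
	return 0;
}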
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fa95ab2d3674..9df431642042 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -660,11 +660,20 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
660 [ilog2(VM_MERGEABLE)] = "mg", 660 [ilog2(VM_MERGEABLE)] = "mg",
661 [ilog2(VM_UFFD_MISSING)]= "um", 661 [ilog2(VM_UFFD_MISSING)]= "um",
662 [ilog2(VM_UFFD_WP)] = "uw", 662 [ilog2(VM_UFFD_WP)] = "uw",
663#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
664 /* These come out via ProtectionKey: */
665 [ilog2(VM_PKEY_BIT0)] = "",
666 [ilog2(VM_PKEY_BIT1)] = "",
667 [ilog2(VM_PKEY_BIT2)] = "",
668 [ilog2(VM_PKEY_BIT3)] = "",
669#endif
663 }; 670 };
664 size_t i; 671 size_t i;
665 672
666 seq_puts(m, "VmFlags: "); 673 seq_puts(m, "VmFlags: ");
667 for (i = 0; i < BITS_PER_LONG; i++) { 674 for (i = 0; i < BITS_PER_LONG; i++) {
675 if (!mnemonics[i][0])
676 continue;
668 if (vma->vm_flags & (1UL << i)) { 677 if (vma->vm_flags & (1UL << i)) {
669 seq_printf(m, "%c%c ", 678 seq_printf(m, "%c%c ",
670 mnemonics[i][0], mnemonics[i][1]); 679 mnemonics[i][0], mnemonics[i][1]);
@@ -702,6 +711,10 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
702} 711}
703#endif /* HUGETLB_PAGE */ 712#endif /* HUGETLB_PAGE */
704 713
714void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
715{
716}
717
705static int show_smap(struct seq_file *m, void *v, int is_pid) 718static int show_smap(struct seq_file *m, void *v, int is_pid)
706{ 719{
707 struct vm_area_struct *vma = v; 720 struct vm_area_struct *vma = v;
@@ -783,6 +796,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
783 (vma->vm_flags & VM_LOCKED) ? 796 (vma->vm_flags & VM_LOCKED) ?
784 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 797 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
785 798
799 arch_show_smap(m, vma);
786 show_smap_vma_flags(m, vma); 800 show_smap_vma_flags(m, vma);
787 m_cache_vma(m, vma); 801 m_cache_vma(m, vma);
788 return 0; 802 return 0;
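Because arch_show_smap() is declared __weak, the empty definition above is only a fallback; an architecture can supply a strong version to append its own lines. A hypothetical override in the direction the VM_PKEY_BIT* masking hints at (vma_pkey() is an assumed arch helper, not part of this hunk):

void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
{
	/* Emit the protection key behind the masked VM_PKEY_BIT* flags. */
	seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
}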
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4e61388ec03d..55bb57e6a30d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -231,7 +231,9 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
231 231
232 list_for_each_entry(m, &vmcore_list, list) { 232 list_for_each_entry(m, &vmcore_list, list) {
233 if (*fpos < m->offset + m->size) { 233 if (*fpos < m->offset + m->size) {
234 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); 234 tsz = (size_t)min_t(unsigned long long,
235 m->offset + m->size - *fpos,
236 buflen);
235 start = m->paddr + *fpos - m->offset; 237 start = m->paddr + *fpos - m->offset;
236 tmp = read_from_oldmem(buffer, tsz, &start, userbuf); 238 tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
237 if (tmp < 0) 239 if (tmp < 0)
@@ -461,7 +463,8 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
461 if (start < m->offset + m->size) { 463 if (start < m->offset + m->size) {
462 u64 paddr = 0; 464 u64 paddr = 0;
463 465
464 tsz = min_t(size_t, m->offset + m->size - start, size); 466 tsz = (size_t)min_t(unsigned long long,
467 m->offset + m->size - start, size);
465 paddr = m->paddr + start - m->offset; 468 paddr = m->paddr + start - m->offset;
466 if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len, 469 if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len,
467 paddr >> PAGE_SHIFT, tsz, 470 paddr >> PAGE_SHIFT, tsz,
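The cast reshuffle matters on 32-bit kernels, where size_t is 32 bits wide: min_t(size_t, ...) truncates the 64-bit distance before comparing, whereas the new form compares in 64 bits and only then narrows a result already bounded by buflen or size. A fragment illustrating the failure mode:

u64 remaining = 0x100000000ULL;	/* 4 GiB left in this vmcore chunk */
size_t buflen = 4096;

/* Old: 0x100000000 truncates to 0 on 32-bit, so tsz ends up 0. */
size_t bad = min_t(size_t, remaining, buflen);

/* New: compare as 64-bit first; the winner (4096) always fits size_t. */
size_t good = (size_t)min_t(unsigned long long, remaining, buflen);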
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 2256e7e23e67..3f1190d18991 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -199,6 +199,8 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
199 if (sb->s_op->show_devname) { 199 if (sb->s_op->show_devname) {
200 seq_puts(m, "device "); 200 seq_puts(m, "device ");
201 err = sb->s_op->show_devname(m, mnt_path.dentry); 201 err = sb->s_op->show_devname(m, mnt_path.dentry);
202 if (err)
203 goto out;
202 } else { 204 } else {
203 if (r->mnt_devname) { 205 if (r->mnt_devname) {
204 seq_puts(m, "device "); 206 seq_puts(m, "device ");
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 319c3a60cfa5..bd9812e83461 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -55,8 +55,8 @@ static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
55module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400); 55module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
56MODULE_PARM_DESC(pmsg_size, "size of user space message log"); 56MODULE_PARM_DESC(pmsg_size, "size of user space message log");
57 57
58static ulong mem_address; 58static unsigned long long mem_address;
59module_param(mem_address, ulong, 0400); 59module_param(mem_address, ullong, 0400);
60MODULE_PARM_DESC(mem_address, 60MODULE_PARM_DESC(mem_address,
61 "start of reserved RAM used to store oops/panic logs"); 61 "start of reserved RAM used to store oops/panic logs");
62 62
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3c3b81bb6dfe..ba827daea5a0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -411,6 +411,8 @@ int dquot_acquire(struct dquot *dquot)
411 ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); 411 ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
412 if (ret < 0) 412 if (ret < 0)
413 goto out_iolock; 413 goto out_iolock;
414 /* Make sure flags update is visible after dquot has been filled */
415 smp_mb__before_atomic();
414 set_bit(DQ_READ_B, &dquot->dq_flags); 416 set_bit(DQ_READ_B, &dquot->dq_flags);
415 /* Instantiate dquot if needed */ 417 /* Instantiate dquot if needed */
416 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) { 418 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
@@ -427,6 +429,11 @@ int dquot_acquire(struct dquot *dquot)
427 goto out_iolock; 429 goto out_iolock;
428 } 430 }
429 } 431 }
432 /*
433 * Make sure flags update is visible after on-disk struct has been
434 * allocated. Paired with smp_rmb() in dqget().
435 */
436 smp_mb__before_atomic();
430 set_bit(DQ_ACTIVE_B, &dquot->dq_flags); 437 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
431out_iolock: 438out_iolock:
432 mutex_unlock(&dqopt->dqio_mutex); 439 mutex_unlock(&dqopt->dqio_mutex);
@@ -887,6 +894,11 @@ we_slept:
887 goto out; 894 goto out;
888 } 895 }
889 } 896 }
897 /*
898 * Make sure following reads see filled structure - paired with
899 * smp_mb__before_atomic() in dquot_acquire().
900 */
901 smp_rmb();
890#ifdef CONFIG_QUOTA_DEBUG 902#ifdef CONFIG_QUOTA_DEBUG
891 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ 903 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
892#endif 904#endif
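The barriers added in the two dquot_acquire() hunks and the smp_rmb() here form one publish/consume pattern: the writer makes the filled dquot globally visible before setting the bit, and the reader may trust the contents only after the bit test plus read barrier. Schematically (fill_dquot() and use_dquot() are placeholders, not kernel functions):

/* Writer, as in dquot_acquire(): */
fill_dquot(dquot);		/* read the on-disk data into the struct */
smp_mb__before_atomic();	/* order the fill before the flag update */
set_bit(DQ_ACTIVE_B, &dquot->dq_flags);

/* Reader, as in dqget(): */
if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
	smp_rmb();		/* pairs with the writer's barrier */
	use_dquot(dquot);	/* contents guaranteed visible here */
}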
@@ -1398,7 +1410,7 @@ static int dquot_active(const struct inode *inode)
1398static int __dquot_initialize(struct inode *inode, int type) 1410static int __dquot_initialize(struct inode *inode, int type)
1399{ 1411{
1400 int cnt, init_needed = 0; 1412 int cnt, init_needed = 0;
1401 struct dquot **dquots, *got[MAXQUOTAS]; 1413 struct dquot **dquots, *got[MAXQUOTAS] = {};
1402 struct super_block *sb = inode->i_sb; 1414 struct super_block *sb = inode->i_sb;
1403 qsize_t rsv; 1415 qsize_t rsv;
1404 int ret = 0; 1416 int ret = 0;
@@ -1415,7 +1427,6 @@ static int __dquot_initialize(struct inode *inode, int type)
1415 int rc; 1427 int rc;
1416 struct dquot *dquot; 1428 struct dquot *dquot;
1417 1429
1418 got[cnt] = NULL;
1419 if (type != -1 && cnt != type) 1430 if (type != -1 && cnt != type)
1420 continue; 1431 continue;
1421 /* 1432 /*
@@ -2031,6 +2042,21 @@ int dquot_commit_info(struct super_block *sb, int type)
2031} 2042}
2032EXPORT_SYMBOL(dquot_commit_info); 2043EXPORT_SYMBOL(dquot_commit_info);
2033 2044
2045int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
2046{
2047 struct quota_info *dqopt = sb_dqopt(sb);
2048 int err;
2049
2050 if (!dqopt->ops[qid->type]->get_next_id)
2051 return -ENOSYS;
2052 mutex_lock(&dqopt->dqio_mutex);
2053 err = dqopt->ops[qid->type]->get_next_id(sb, qid);
2054 mutex_unlock(&dqopt->dqio_mutex);
2055
2056 return err;
2057}
2058EXPORT_SYMBOL(dquot_get_next_id);
2059
2034/* 2060/*
2035 * Definitions of diskquota operations. 2061 * Definitions of diskquota operations.
2036 */ 2062 */
@@ -2042,6 +2068,7 @@ const struct dquot_operations dquot_operations = {
2042 .write_info = dquot_commit_info, 2068 .write_info = dquot_commit_info,
2043 .alloc_dquot = dquot_alloc, 2069 .alloc_dquot = dquot_alloc,
2044 .destroy_dquot = dquot_destroy, 2070 .destroy_dquot = dquot_destroy,
2071 .get_next_id = dquot_get_next_id,
2045}; 2072};
2046EXPORT_SYMBOL(dquot_operations); 2073EXPORT_SYMBOL(dquot_operations);
2047 2074
@@ -2430,9 +2457,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
2430 struct dentry *dentry; 2457 struct dentry *dentry;
2431 int error; 2458 int error;
2432 2459
2433 inode_lock(d_inode(sb->s_root)); 2460 dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name));
2434 dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
2435 inode_unlock(d_inode(sb->s_root));
2436 if (IS_ERR(dentry)) 2461 if (IS_ERR(dentry))
2437 return PTR_ERR(dentry); 2462 return PTR_ERR(dentry);
2438 2463
@@ -2565,6 +2590,27 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
2565} 2590}
2566EXPORT_SYMBOL(dquot_get_dqblk); 2591EXPORT_SYMBOL(dquot_get_dqblk);
2567 2592
2593int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid,
2594 struct qc_dqblk *di)
2595{
2596 struct dquot *dquot;
2597 int err;
2598
2599 if (!sb->dq_op->get_next_id)
2600 return -ENOSYS;
2601 err = sb->dq_op->get_next_id(sb, qid);
2602 if (err < 0)
2603 return err;
2604 dquot = dqget(sb, *qid);
2605 if (IS_ERR(dquot))
2606 return PTR_ERR(dquot);
2607 do_get_dqblk(dquot, di);
2608 dqput(dquot);
2609
2610 return 0;
2611}
2612EXPORT_SYMBOL(dquot_get_next_dqblk);
2613
2568#define VFS_QC_MASK \ 2614#define VFS_QC_MASK \
2569 (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \ 2615 (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
2570 QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \ 2616 QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
@@ -2765,6 +2811,7 @@ const struct quotactl_ops dquot_quotactl_ops = {
2765 .get_state = dquot_get_state, 2811 .get_state = dquot_get_state,
2766 .set_info = dquot_set_dqinfo, 2812 .set_info = dquot_set_dqinfo,
2767 .get_dqblk = dquot_get_dqblk, 2813 .get_dqblk = dquot_get_dqblk,
2814 .get_nextdqblk = dquot_get_next_dqblk,
2768 .set_dqblk = dquot_set_dqblk 2815 .set_dqblk = dquot_set_dqblk
2769}; 2816};
2770EXPORT_SYMBOL(dquot_quotactl_ops); 2817EXPORT_SYMBOL(dquot_quotactl_ops);
@@ -2776,6 +2823,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = {
2776 .get_state = dquot_get_state, 2823 .get_state = dquot_get_state,
2777 .set_info = dquot_set_dqinfo, 2824 .set_info = dquot_set_dqinfo,
2778 .get_dqblk = dquot_get_dqblk, 2825 .get_dqblk = dquot_get_dqblk,
2826 .get_nextdqblk = dquot_get_next_dqblk,
2779 .set_dqblk = dquot_set_dqblk 2827 .set_dqblk = dquot_set_dqblk
2780}; 2828};
2781EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); 2829EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 3746367098fd..0f10ee9892ce 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -79,7 +79,7 @@ unsigned int qtype_enforce_flag(int type)
79 return 0; 79 return 0;
80} 80}
81 81
82static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 82static int quota_quotaon(struct super_block *sb, int type, qid_t id,
83 struct path *path) 83 struct path *path)
84{ 84{
85 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) 85 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
@@ -222,6 +222,34 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
222 return 0; 222 return 0;
223} 223}
224 224
225/*
226 * Return quota for next active quota >= this id, if any exists,
 227 * otherwise return -ENOENT via ->get_nextdqblk.

228 */
229static int quota_getnextquota(struct super_block *sb, int type, qid_t id,
230 void __user *addr)
231{
232 struct kqid qid;
233 struct qc_dqblk fdq;
234 struct if_nextdqblk idq;
235 int ret;
236
237 if (!sb->s_qcop->get_nextdqblk)
238 return -ENOSYS;
239 qid = make_kqid(current_user_ns(), type, id);
240 if (!qid_valid(qid))
241 return -EINVAL;
242 ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq);
243 if (ret)
244 return ret;
245 /* struct if_nextdqblk is a superset of struct if_dqblk */
246 copy_to_if_dqblk((struct if_dqblk *)&idq, &fdq);
247 idq.dqb_id = from_kqid(current_user_ns(), qid);
248 if (copy_to_user(addr, &idq, sizeof(idq)))
249 return -EFAULT;
250 return 0;
251}
252
225static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src) 253static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
226{ 254{
227 dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit); 255 dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
@@ -625,6 +653,34 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
625 return ret; 653 return ret;
626} 654}
627 655
656/*
657 * Return quota for next active quota >= this id, if any exists,
658 * otherwise return -ENOENT via ->get_nextdqblk.
659 */
660static int quota_getnextxquota(struct super_block *sb, int type, qid_t id,
661 void __user *addr)
662{
663 struct fs_disk_quota fdq;
664 struct qc_dqblk qdq;
665 struct kqid qid;
666 qid_t id_out;
667 int ret;
668
669 if (!sb->s_qcop->get_nextdqblk)
670 return -ENOSYS;
671 qid = make_kqid(current_user_ns(), type, id);
672 if (!qid_valid(qid))
673 return -EINVAL;
674 ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq);
675 if (ret)
676 return ret;
677 id_out = from_kqid(current_user_ns(), qid);
678 copy_to_xfs_dqblk(&fdq, &qdq, type, id_out);
679 if (copy_to_user(addr, &fdq, sizeof(fdq)))
680 return -EFAULT;
681 return ret;
682}
683
628static int quota_rmxquota(struct super_block *sb, void __user *addr) 684static int quota_rmxquota(struct super_block *sb, void __user *addr)
629{ 685{
630 __u32 flags; 686 __u32 flags;
@@ -659,7 +715,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
659 715
660 switch (cmd) { 716 switch (cmd) {
661 case Q_QUOTAON: 717 case Q_QUOTAON:
662 return quota_quotaon(sb, type, cmd, id, path); 718 return quota_quotaon(sb, type, id, path);
663 case Q_QUOTAOFF: 719 case Q_QUOTAOFF:
664 return quota_quotaoff(sb, type); 720 return quota_quotaoff(sb, type);
665 case Q_GETFMT: 721 case Q_GETFMT:
@@ -670,6 +726,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
670 return quota_setinfo(sb, type, addr); 726 return quota_setinfo(sb, type, addr);
671 case Q_GETQUOTA: 727 case Q_GETQUOTA:
672 return quota_getquota(sb, type, id, addr); 728 return quota_getquota(sb, type, id, addr);
729 case Q_GETNEXTQUOTA:
730 return quota_getnextquota(sb, type, id, addr);
673 case Q_SETQUOTA: 731 case Q_SETQUOTA:
674 return quota_setquota(sb, type, id, addr); 732 return quota_setquota(sb, type, id, addr);
675 case Q_SYNC: 733 case Q_SYNC:
@@ -690,6 +748,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
690 return quota_setxquota(sb, type, id, addr); 748 return quota_setxquota(sb, type, id, addr);
691 case Q_XGETQUOTA: 749 case Q_XGETQUOTA:
692 return quota_getxquota(sb, type, id, addr); 750 return quota_getxquota(sb, type, id, addr);
751 case Q_XGETNEXTQUOTA:
752 return quota_getnextxquota(sb, type, id, addr);
693 case Q_XQUOTASYNC: 753 case Q_XQUOTASYNC:
694 if (sb->s_flags & MS_RDONLY) 754 if (sb->s_flags & MS_RDONLY)
695 return -EROFS; 755 return -EROFS;
@@ -705,6 +765,11 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
705/* Return 1 if 'cmd' will block on frozen filesystem */ 765/* Return 1 if 'cmd' will block on frozen filesystem */
706static int quotactl_cmd_write(int cmd) 766static int quotactl_cmd_write(int cmd)
707{ 767{
768 /*
769 * We cannot allow Q_GETQUOTA and Q_GETNEXTQUOTA without write access
770 * as dquot_acquire() may allocate space for new structure and OCFS2
771 * needs to increment on-disk use count.
772 */
708 switch (cmd) { 773 switch (cmd) {
709 case Q_GETFMT: 774 case Q_GETFMT:
710 case Q_GETINFO: 775 case Q_GETINFO:
@@ -712,6 +777,7 @@ static int quotactl_cmd_write(int cmd)
712 case Q_XGETQSTAT: 777 case Q_XGETQSTAT:
713 case Q_XGETQSTATV: 778 case Q_XGETQSTATV:
714 case Q_XGETQUOTA: 779 case Q_XGETQUOTA:
780 case Q_XGETNEXTQUOTA:
715 case Q_XQUOTASYNC: 781 case Q_XQUOTASYNC:
716 return 0; 782 return 0;
717 } 783 }
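From userspace the new command lets a tool walk every set quota without probing each id in turn. A hedged sketch (the 0x800009 command value and the struct layout are assumptions based on the "superset of struct if_dqblk" remark in the hunk and the uapi additions of this series):

#include <stdint.h>
#include <stdio.h>
#include <sys/quota.h>

#ifndef Q_GETNEXTQUOTA
#define Q_GETNEXTQUOTA 0x800009		/* assumed uapi value */
#endif

struct if_nextdqblk {			/* if_dqblk plus a trailing id */
	uint64_t dqb_bhardlimit, dqb_bsoftlimit, dqb_curspace;
	uint64_t dqb_ihardlimit, dqb_isoftlimit, dqb_curinodes;
	uint64_t dqb_btime, dqb_itime;
	uint32_t dqb_valid, dqb_id;
};

static void dump_user_quotas(const char *dev)
{
	struct if_nextdqblk q;
	unsigned int id = 0;

	/* Each call returns the first set quota with id >= the one passed
	 * in; the walk ends when the filesystem reports -ENOENT. */
	while (quotactl(QCMD(Q_GETNEXTQUOTA, USRQUOTA), dev, id, (void *)&q) == 0) {
		printf("uid %u: %llu bytes\n", q.dqb_id,
		       (unsigned long long)q.dqb_curspace);
		id = q.dqb_id + 1;
	}
}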
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 58efb83dec1c..0738972e8d3f 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -22,10 +22,9 @@ MODULE_LICENSE("GPL");
22 22
23#define __QUOTA_QT_PARANOIA 23#define __QUOTA_QT_PARANOIA
24 24
25static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth) 25static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
26{ 26{
27 unsigned int epb = info->dqi_usable_bs >> 2; 27 unsigned int epb = info->dqi_usable_bs >> 2;
28 qid_t id = from_kqid(&init_user_ns, qid);
29 28
30 depth = info->dqi_qtree_depth - depth - 1; 29 depth = info->dqi_qtree_depth - depth - 1;
31 while (depth--) 30 while (depth--)
@@ -33,6 +32,13 @@ static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
33 return id % epb; 32 return id % epb;
34} 33}
35 34
35static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
36{
37 qid_t id = from_kqid(&init_user_ns, qid);
38
39 return __get_index(info, id, depth);
40}
41
 36/* Number of entries in one block */ 42/* Number of entries in one block */
37static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) 43static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
38{ 44{
@@ -668,3 +674,60 @@ int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
668 return 0; 674 return 0;
669} 675}
670EXPORT_SYMBOL(qtree_release_dquot); 676EXPORT_SYMBOL(qtree_release_dquot);
677
678static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
679 unsigned int blk, int depth)
680{
681 char *buf = getdqbuf(info->dqi_usable_bs);
682 __le32 *ref = (__le32 *)buf;
683 ssize_t ret;
684 unsigned int epb = info->dqi_usable_bs >> 2;
685 unsigned int level_inc = 1;
686 int i;
687
688 if (!buf)
689 return -ENOMEM;
690
691 for (i = depth; i < info->dqi_qtree_depth - 1; i++)
692 level_inc *= epb;
693
694 ret = read_blk(info, blk, buf);
695 if (ret < 0) {
696 quota_error(info->dqi_sb,
697 "Can't read quota tree block %u", blk);
698 goto out_buf;
699 }
700 for (i = __get_index(info, *id, depth); i < epb; i++) {
701 if (ref[i] == cpu_to_le32(0)) {
702 *id += level_inc;
703 continue;
704 }
705 if (depth == info->dqi_qtree_depth - 1) {
706 ret = 0;
707 goto out_buf;
708 }
709 ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1);
710 if (ret != -ENOENT)
711 break;
712 }
713 if (i == epb) {
714 ret = -ENOENT;
715 goto out_buf;
716 }
717out_buf:
718 kfree(buf);
719 return ret;
720}
721
722int qtree_get_next_id(struct qtree_mem_dqinfo *info, struct kqid *qid)
723{
724 qid_t id = from_kqid(&init_user_ns, *qid);
725 int ret;
726
727 ret = find_next_id(info, &id, QT_TREEOFF, 0);
728 if (ret < 0)
729 return ret;
730 *qid = make_kqid(&init_user_ns, qid->type, id);
731 return 0;
732}
733EXPORT_SYMBOL(qtree_get_next_id);
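The level_inc computed in find_next_id() is what lets a single empty reference skip a whole subtree: a hole at depth d stands for epb^(tree_depth - 1 - d) leaf ids. Worked numbers for the common 1024-byte usable block size:

/* epb = 1024 >> 2 = 256 references per tree block; 4-level tree. */
unsigned int skip_depth0 = 256 * 256 * 256;	/* 16777216 ids per hole */
unsigned int skip_depth2 = 256;			/* 256 ids per hole */
unsigned int skip_leaf   = 1;			/* depth 3: one id */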
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index ed85d4f35c04..ca71bf881ad1 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -304,6 +304,11 @@ static int v2_free_file_info(struct super_block *sb, int type)
304 return 0; 304 return 0;
305} 305}
306 306
307static int v2_get_next_id(struct super_block *sb, struct kqid *qid)
308{
309 return qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid);
310}
311
307static const struct quota_format_ops v2_format_ops = { 312static const struct quota_format_ops v2_format_ops = {
308 .check_quota_file = v2_check_quota_file, 313 .check_quota_file = v2_check_quota_file,
309 .read_file_info = v2_read_file_info, 314 .read_file_info = v2_read_file_info,
@@ -312,6 +317,7 @@ static const struct quota_format_ops v2_format_ops = {
312 .read_dqblk = v2_read_dquot, 317 .read_dqblk = v2_read_dquot,
313 .commit_dqblk = v2_write_dquot, 318 .commit_dqblk = v2_write_dquot,
314 .release_dqblk = v2_release_dquot, 319 .release_dqblk = v2_release_dquot,
320 .get_next_id = v2_get_next_id,
315}; 321};
316 322
317static struct quota_format_type v2r0_quota_format = { 323static struct quota_format_type v2r0_quota_format = {
diff --git a/fs/read_write.c b/fs/read_write.c
index dadf24e5c95b..cf377cf9dfe3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -693,12 +693,17 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
693EXPORT_SYMBOL(iov_shorten); 693EXPORT_SYMBOL(iov_shorten);
694 694
695static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, 695static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
696 loff_t *ppos, iter_fn_t fn) 696 loff_t *ppos, iter_fn_t fn, int flags)
697{ 697{
698 struct kiocb kiocb; 698 struct kiocb kiocb;
699 ssize_t ret; 699 ssize_t ret;
700 700
701 if (flags & ~RWF_HIPRI)
702 return -EOPNOTSUPP;
703
701 init_sync_kiocb(&kiocb, filp); 704 init_sync_kiocb(&kiocb, filp);
705 if (flags & RWF_HIPRI)
706 kiocb.ki_flags |= IOCB_HIPRI;
702 kiocb.ki_pos = *ppos; 707 kiocb.ki_pos = *ppos;
703 708
704 ret = fn(&kiocb, iter); 709 ret = fn(&kiocb, iter);
@@ -709,10 +714,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
709 714
710/* Do it by hand, with file-ops */ 715/* Do it by hand, with file-ops */
711static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, 716static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
712 loff_t *ppos, io_fn_t fn) 717 loff_t *ppos, io_fn_t fn, int flags)
713{ 718{
714 ssize_t ret = 0; 719 ssize_t ret = 0;
715 720
721 if (flags & ~RWF_HIPRI)
722 return -EOPNOTSUPP;
723
716 while (iov_iter_count(iter)) { 724 while (iov_iter_count(iter)) {
717 struct iovec iovec = iov_iter_iovec(iter); 725 struct iovec iovec = iov_iter_iovec(iter);
718 ssize_t nr; 726 ssize_t nr;
@@ -813,7 +821,8 @@ out:
813 821
814static ssize_t do_readv_writev(int type, struct file *file, 822static ssize_t do_readv_writev(int type, struct file *file,
815 const struct iovec __user * uvector, 823 const struct iovec __user * uvector,
816 unsigned long nr_segs, loff_t *pos) 824 unsigned long nr_segs, loff_t *pos,
825 int flags)
817{ 826{
818 size_t tot_len; 827 size_t tot_len;
819 struct iovec iovstack[UIO_FASTIOV]; 828 struct iovec iovstack[UIO_FASTIOV];
@@ -845,9 +854,9 @@ static ssize_t do_readv_writev(int type, struct file *file,
845 } 854 }
846 855
847 if (iter_fn) 856 if (iter_fn)
848 ret = do_iter_readv_writev(file, &iter, pos, iter_fn); 857 ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
849 else 858 else
850 ret = do_loop_readv_writev(file, &iter, pos, fn); 859 ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
851 860
852 if (type != READ) 861 if (type != READ)
853 file_end_write(file); 862 file_end_write(file);
@@ -864,40 +873,40 @@ out:
864} 873}
865 874
866ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 875ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
867 unsigned long vlen, loff_t *pos) 876 unsigned long vlen, loff_t *pos, int flags)
868{ 877{
869 if (!(file->f_mode & FMODE_READ)) 878 if (!(file->f_mode & FMODE_READ))
870 return -EBADF; 879 return -EBADF;
871 if (!(file->f_mode & FMODE_CAN_READ)) 880 if (!(file->f_mode & FMODE_CAN_READ))
872 return -EINVAL; 881 return -EINVAL;
873 882
874 return do_readv_writev(READ, file, vec, vlen, pos); 883 return do_readv_writev(READ, file, vec, vlen, pos, flags);
875} 884}
876 885
877EXPORT_SYMBOL(vfs_readv); 886EXPORT_SYMBOL(vfs_readv);
878 887
879ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, 888ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
880 unsigned long vlen, loff_t *pos) 889 unsigned long vlen, loff_t *pos, int flags)
881{ 890{
882 if (!(file->f_mode & FMODE_WRITE)) 891 if (!(file->f_mode & FMODE_WRITE))
883 return -EBADF; 892 return -EBADF;
884 if (!(file->f_mode & FMODE_CAN_WRITE)) 893 if (!(file->f_mode & FMODE_CAN_WRITE))
885 return -EINVAL; 894 return -EINVAL;
886 895
887 return do_readv_writev(WRITE, file, vec, vlen, pos); 896 return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
888} 897}
889 898
890EXPORT_SYMBOL(vfs_writev); 899EXPORT_SYMBOL(vfs_writev);
891 900
892SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, 901static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
893 unsigned long, vlen) 902 unsigned long vlen, int flags)
894{ 903{
895 struct fd f = fdget_pos(fd); 904 struct fd f = fdget_pos(fd);
896 ssize_t ret = -EBADF; 905 ssize_t ret = -EBADF;
897 906
898 if (f.file) { 907 if (f.file) {
899 loff_t pos = file_pos_read(f.file); 908 loff_t pos = file_pos_read(f.file);
900 ret = vfs_readv(f.file, vec, vlen, &pos); 909 ret = vfs_readv(f.file, vec, vlen, &pos, flags);
901 if (ret >= 0) 910 if (ret >= 0)
902 file_pos_write(f.file, pos); 911 file_pos_write(f.file, pos);
903 fdput_pos(f); 912 fdput_pos(f);
@@ -909,15 +918,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
909 return ret; 918 return ret;
910} 919}
911 920
912SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, 921static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
913 unsigned long, vlen) 922 unsigned long vlen, int flags)
914{ 923{
915 struct fd f = fdget_pos(fd); 924 struct fd f = fdget_pos(fd);
916 ssize_t ret = -EBADF; 925 ssize_t ret = -EBADF;
917 926
918 if (f.file) { 927 if (f.file) {
919 loff_t pos = file_pos_read(f.file); 928 loff_t pos = file_pos_read(f.file);
920 ret = vfs_writev(f.file, vec, vlen, &pos); 929 ret = vfs_writev(f.file, vec, vlen, &pos, flags);
921 if (ret >= 0) 930 if (ret >= 0)
922 file_pos_write(f.file, pos); 931 file_pos_write(f.file, pos);
923 fdput_pos(f); 932 fdput_pos(f);
@@ -935,10 +944,9 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
935 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; 944 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
936} 945}
937 946
938SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, 947static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
939 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 948 unsigned long vlen, loff_t pos, int flags)
940{ 949{
941 loff_t pos = pos_from_hilo(pos_h, pos_l);
942 struct fd f; 950 struct fd f;
943 ssize_t ret = -EBADF; 951 ssize_t ret = -EBADF;
944 952
@@ -949,7 +957,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
949 if (f.file) { 957 if (f.file) {
950 ret = -ESPIPE; 958 ret = -ESPIPE;
951 if (f.file->f_mode & FMODE_PREAD) 959 if (f.file->f_mode & FMODE_PREAD)
952 ret = vfs_readv(f.file, vec, vlen, &pos); 960 ret = vfs_readv(f.file, vec, vlen, &pos, flags);
953 fdput(f); 961 fdput(f);
954 } 962 }
955 963
@@ -959,10 +967,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
959 return ret; 967 return ret;
960} 968}
961 969
962SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, 970static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
963 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 971 unsigned long vlen, loff_t pos, int flags)
964{ 972{
965 loff_t pos = pos_from_hilo(pos_h, pos_l);
966 struct fd f; 973 struct fd f;
967 ssize_t ret = -EBADF; 974 ssize_t ret = -EBADF;
968 975
@@ -973,7 +980,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
973 if (f.file) { 980 if (f.file) {
974 ret = -ESPIPE; 981 ret = -ESPIPE;
975 if (f.file->f_mode & FMODE_PWRITE) 982 if (f.file->f_mode & FMODE_PWRITE)
976 ret = vfs_writev(f.file, vec, vlen, &pos); 983 ret = vfs_writev(f.file, vec, vlen, &pos, flags);
977 fdput(f); 984 fdput(f);
978 } 985 }
979 986
@@ -983,11 +990,64 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
983 return ret; 990 return ret;
984} 991}
985 992
993SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
994 unsigned long, vlen)
995{
996 return do_readv(fd, vec, vlen, 0);
997}
998
999SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1000 unsigned long, vlen)
1001{
1002 return do_writev(fd, vec, vlen, 0);
1003}
1004
1005SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1006 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1007{
1008 loff_t pos = pos_from_hilo(pos_h, pos_l);
1009
1010 return do_preadv(fd, vec, vlen, pos, 0);
1011}
1012
1013SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1014 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1015 int, flags)
1016{
1017 loff_t pos = pos_from_hilo(pos_h, pos_l);
1018
1019 if (pos == -1)
1020 return do_readv(fd, vec, vlen, flags);
1021
1022 return do_preadv(fd, vec, vlen, pos, flags);
1023}
1024
1025SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1026 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1027{
1028 loff_t pos = pos_from_hilo(pos_h, pos_l);
1029
1030 return do_pwritev(fd, vec, vlen, pos, 0);
1031}
1032
1033SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1034 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1035 int, flags)
1036{
1037 loff_t pos = pos_from_hilo(pos_h, pos_l);
1038
1039 if (pos == -1)
1040 return do_writev(fd, vec, vlen, flags);
1041
1042 return do_pwritev(fd, vec, vlen, pos, flags);
1043}
1044
986#ifdef CONFIG_COMPAT 1045#ifdef CONFIG_COMPAT
987 1046
988static ssize_t compat_do_readv_writev(int type, struct file *file, 1047static ssize_t compat_do_readv_writev(int type, struct file *file,
989 const struct compat_iovec __user *uvector, 1048 const struct compat_iovec __user *uvector,
990 unsigned long nr_segs, loff_t *pos) 1049 unsigned long nr_segs, loff_t *pos,
1050 int flags)
991{ 1051{
992 compat_ssize_t tot_len; 1052 compat_ssize_t tot_len;
993 struct iovec iovstack[UIO_FASTIOV]; 1053 struct iovec iovstack[UIO_FASTIOV];
@@ -1019,9 +1079,9 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1019 } 1079 }
1020 1080
1021 if (iter_fn) 1081 if (iter_fn)
1022 ret = do_iter_readv_writev(file, &iter, pos, iter_fn); 1082 ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
1023 else 1083 else
1024 ret = do_loop_readv_writev(file, &iter, pos, fn); 1084 ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
1025 1085
1026 if (type != READ) 1086 if (type != READ)
1027 file_end_write(file); 1087 file_end_write(file);
@@ -1039,7 +1099,7 @@ out:
1039 1099
1040static size_t compat_readv(struct file *file, 1100static size_t compat_readv(struct file *file,
1041 const struct compat_iovec __user *vec, 1101 const struct compat_iovec __user *vec,
1042 unsigned long vlen, loff_t *pos) 1102 unsigned long vlen, loff_t *pos, int flags)
1043{ 1103{
1044 ssize_t ret = -EBADF; 1104 ssize_t ret = -EBADF;
1045 1105
@@ -1050,7 +1110,7 @@ static size_t compat_readv(struct file *file,
1050 if (!(file->f_mode & FMODE_CAN_READ)) 1110 if (!(file->f_mode & FMODE_CAN_READ))
1051 goto out; 1111 goto out;
1052 1112
1053 ret = compat_do_readv_writev(READ, file, vec, vlen, pos); 1113 ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
1054 1114
1055out: 1115out:
1056 if (ret > 0) 1116 if (ret > 0)
@@ -1059,9 +1119,9 @@ out:
1059 return ret; 1119 return ret;
1060} 1120}
1061 1121
1062COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, 1122static size_t do_compat_readv(compat_ulong_t fd,
1063 const struct compat_iovec __user *,vec, 1123 const struct compat_iovec __user *vec,
1064 compat_ulong_t, vlen) 1124 compat_ulong_t vlen, int flags)
1065{ 1125{
1066 struct fd f = fdget_pos(fd); 1126 struct fd f = fdget_pos(fd);
1067 ssize_t ret; 1127 ssize_t ret;
@@ -1070,16 +1130,24 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1070 if (!f.file) 1130 if (!f.file)
1071 return -EBADF; 1131 return -EBADF;
1072 pos = f.file->f_pos; 1132 pos = f.file->f_pos;
1073 ret = compat_readv(f.file, vec, vlen, &pos); 1133 ret = compat_readv(f.file, vec, vlen, &pos, flags);
1074 if (ret >= 0) 1134 if (ret >= 0)
1075 f.file->f_pos = pos; 1135 f.file->f_pos = pos;
1076 fdput_pos(f); 1136 fdput_pos(f);
1077 return ret; 1137 return ret;
1138
1139}
1140
1141COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1142 const struct compat_iovec __user *,vec,
1143 compat_ulong_t, vlen)
1144{
1145 return do_compat_readv(fd, vec, vlen, 0);
1078} 1146}
1079 1147
1080static long __compat_sys_preadv64(unsigned long fd, 1148static long do_compat_preadv64(unsigned long fd,
1081 const struct compat_iovec __user *vec, 1149 const struct compat_iovec __user *vec,
1082 unsigned long vlen, loff_t pos) 1150 unsigned long vlen, loff_t pos, int flags)
1083{ 1151{
1084 struct fd f; 1152 struct fd f;
1085 ssize_t ret; 1153 ssize_t ret;
@@ -1091,7 +1159,7 @@ static long __compat_sys_preadv64(unsigned long fd,
1091 return -EBADF; 1159 return -EBADF;
1092 ret = -ESPIPE; 1160 ret = -ESPIPE;
1093 if (f.file->f_mode & FMODE_PREAD) 1161 if (f.file->f_mode & FMODE_PREAD)
1094 ret = compat_readv(f.file, vec, vlen, &pos); 1162 ret = compat_readv(f.file, vec, vlen, &pos, flags);
1095 fdput(f); 1163 fdput(f);
1096 return ret; 1164 return ret;
1097} 1165}
@@ -1101,7 +1169,7 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1101 const struct compat_iovec __user *,vec, 1169 const struct compat_iovec __user *,vec,
1102 unsigned long, vlen, loff_t, pos) 1170 unsigned long, vlen, loff_t, pos)
1103{ 1171{
1104 return __compat_sys_preadv64(fd, vec, vlen, pos); 1172 return do_compat_preadv64(fd, vec, vlen, pos, 0);
1105} 1173}
1106#endif 1174#endif
1107 1175
@@ -1111,12 +1179,25 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1111{ 1179{
1112 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1180 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1113 1181
1114 return __compat_sys_preadv64(fd, vec, vlen, pos); 1182 return do_compat_preadv64(fd, vec, vlen, pos, 0);
1183}
1184
1185COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1186 const struct compat_iovec __user *,vec,
1187 compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1188 int, flags)
1189{
1190 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1191
1192 if (pos == -1)
1193 return do_compat_readv(fd, vec, vlen, flags);
1194
1195 return do_compat_preadv64(fd, vec, vlen, pos, flags);
1115} 1196}
1116 1197
1117static size_t compat_writev(struct file *file, 1198static size_t compat_writev(struct file *file,
1118 const struct compat_iovec __user *vec, 1199 const struct compat_iovec __user *vec,
1119 unsigned long vlen, loff_t *pos) 1200 unsigned long vlen, loff_t *pos, int flags)
1120{ 1201{
1121 ssize_t ret = -EBADF; 1202 ssize_t ret = -EBADF;
1122 1203
@@ -1127,7 +1208,7 @@ static size_t compat_writev(struct file *file,
1127 if (!(file->f_mode & FMODE_CAN_WRITE)) 1208 if (!(file->f_mode & FMODE_CAN_WRITE))
1128 goto out; 1209 goto out;
1129 1210
 1130 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); 1211 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags);
1131 1212
1132out: 1213out:
1133 if (ret > 0) 1214 if (ret > 0)
@@ -1136,9 +1217,9 @@ out:
1136 return ret; 1217 return ret;
1137} 1218}
1138 1219
1139COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, 1220static size_t do_compat_writev(compat_ulong_t fd,
 1140 const struct compat_iovec __user *, vec, 1221 const struct compat_iovec __user *vec,
1141 compat_ulong_t, vlen) 1222 compat_ulong_t vlen, int flags)
1142{ 1223{
1143 struct fd f = fdget_pos(fd); 1224 struct fd f = fdget_pos(fd);
1144 ssize_t ret; 1225 ssize_t ret;
@@ -1147,16 +1228,23 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1147 if (!f.file) 1228 if (!f.file)
1148 return -EBADF; 1229 return -EBADF;
1149 pos = f.file->f_pos; 1230 pos = f.file->f_pos;
1150 ret = compat_writev(f.file, vec, vlen, &pos); 1231 ret = compat_writev(f.file, vec, vlen, &pos, flags);
1151 if (ret >= 0) 1232 if (ret >= 0)
1152 f.file->f_pos = pos; 1233 f.file->f_pos = pos;
1153 fdput_pos(f); 1234 fdput_pos(f);
1154 return ret; 1235 return ret;
1155} 1236}
1156 1237
1157static long __compat_sys_pwritev64(unsigned long fd, 1238COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1239 const struct compat_iovec __user *, vec,
1240 compat_ulong_t, vlen)
1241{
1242 return do_compat_writev(fd, vec, vlen, 0);
1243}
1244
1245static long do_compat_pwritev64(unsigned long fd,
1158 const struct compat_iovec __user *vec, 1246 const struct compat_iovec __user *vec,
1159 unsigned long vlen, loff_t pos) 1247 unsigned long vlen, loff_t pos, int flags)
1160{ 1248{
1161 struct fd f; 1249 struct fd f;
1162 ssize_t ret; 1250 ssize_t ret;
@@ -1168,7 +1256,7 @@ static long __compat_sys_pwritev64(unsigned long fd,
1168 return -EBADF; 1256 return -EBADF;
1169 ret = -ESPIPE; 1257 ret = -ESPIPE;
1170 if (f.file->f_mode & FMODE_PWRITE) 1258 if (f.file->f_mode & FMODE_PWRITE)
1171 ret = compat_writev(f.file, vec, vlen, &pos); 1259 ret = compat_writev(f.file, vec, vlen, &pos, flags);
1172 fdput(f); 1260 fdput(f);
1173 return ret; 1261 return ret;
1174} 1262}
@@ -1178,7 +1266,7 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1178 const struct compat_iovec __user *,vec, 1266 const struct compat_iovec __user *,vec,
1179 unsigned long, vlen, loff_t, pos) 1267 unsigned long, vlen, loff_t, pos)
1180{ 1268{
1181 return __compat_sys_pwritev64(fd, vec, vlen, pos); 1269 return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1182} 1270}
1183#endif 1271#endif
1184 1272
@@ -1188,8 +1276,21 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1188{ 1276{
1189 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1277 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1190 1278
1191 return __compat_sys_pwritev64(fd, vec, vlen, pos); 1279 return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1280}
1281
1282COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1283 const struct compat_iovec __user *,vec,
1284 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
1285{
1286 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1287
1288 if (pos == -1)
1289 return do_compat_writev(fd, vec, vlen, flags);
1290
1291 return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1192} 1292}
1293
1193#endif 1294#endif
1194 1295
1195static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 1296static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
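Until libc grows wrappers, preadv2/pwritev2 are reachable through syscall(2). A sketch of issuing a polled read with RWF_HIPRI (assumes the architecture has wired up __NR_preadv2 and the headers expose it; the wrapper name is ours). Passing pos == -1 takes the readv-style path above that uses and updates the file position:

#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#define RWF_HIPRI 0x00000001	/* the only flag the hunks accept so far */

static ssize_t my_preadv2(int fd, const struct iovec *iov, int iovcnt,
			  long long pos, int flags)
{
	/* pos is split into low/high words, as pos_from_hilo() expects. */
	return syscall(__NR_preadv2, fd, iov, iovcnt,
		       (unsigned long)pos, (unsigned long)(pos >> 32), flags);
}

A read submitted with flags = RWF_HIPRI asks the block layer to poll for completion on devices that support it, trading CPU time for latency.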
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c0306ec8ed7b..b8f2d1e8c645 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -802,6 +802,7 @@ static const struct dquot_operations reiserfs_quota_operations = {
802 .write_info = reiserfs_write_info, 802 .write_info = reiserfs_write_info,
803 .alloc_dquot = dquot_alloc, 803 .alloc_dquot = dquot_alloc,
804 .destroy_dquot = dquot_destroy, 804 .destroy_dquot = dquot_destroy,
805 .get_next_id = dquot_get_next_id,
805}; 806};
806 807
807static const struct quotactl_ops reiserfs_qctl_operations = { 808static const struct quotactl_ops reiserfs_qctl_operations = {
diff --git a/fs/select.c b/fs/select.c
index 79d0d4953cad..869293988c2a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv)
70 return slack; 70 return slack;
71} 71}
72 72
73long select_estimate_accuracy(struct timespec *tv) 73u64 select_estimate_accuracy(struct timespec *tv)
74{ 74{
75 unsigned long ret; 75 u64 ret;
76 struct timespec now; 76 struct timespec now;
77 77
78 /* 78 /*
@@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
402 struct poll_wqueues table; 402 struct poll_wqueues table;
403 poll_table *wait; 403 poll_table *wait;
404 int retval, i, timed_out = 0; 404 int retval, i, timed_out = 0;
405 unsigned long slack = 0; 405 u64 slack = 0;
406 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; 406 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
407 unsigned long busy_end = 0; 407 unsigned long busy_end = 0;
408 408
@@ -784,7 +784,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
784 poll_table* pt = &wait->pt; 784 poll_table* pt = &wait->pt;
785 ktime_t expire, *to = NULL; 785 ktime_t expire, *to = NULL;
786 int timed_out = 0, count = 0; 786 int timed_out = 0, count = 0;
787 unsigned long slack = 0; 787 u64 slack = 0;
788 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; 788 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
789 unsigned long busy_end = 0; 789 unsigned long busy_end = 0;
790 790
diff --git a/fs/splice.c b/fs/splice.c
index 82bc0d64fc38..9947b5c69664 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
185 unsigned int spd_pages = spd->nr_pages; 185 unsigned int spd_pages = spd->nr_pages;
186 int ret, do_wakeup, page_nr; 186 int ret, do_wakeup, page_nr;
187 187
188 if (!spd_pages)
189 return 0;
190
188 ret = 0; 191 ret = 0;
189 do_wakeup = 0; 192 do_wakeup = 0;
190 page_nr = 0; 193 page_nr = 0;
@@ -577,7 +580,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
577 old_fs = get_fs(); 580 old_fs = get_fs();
578 set_fs(get_ds()); 581 set_fs(get_ds());
579 /* The cast to a user pointer is valid due to the set_fs() */ 582 /* The cast to a user pointer is valid due to the set_fs() */
580 res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); 583 res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
581 set_fs(old_fs); 584 set_fs(old_fs);
582 585
583 return res; 586 return res;
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 2c6f0cb816b4..c54a24360f85 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -4,3 +4,4 @@ ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
4ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o 4ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
5ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o 5ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
6ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o 6ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o
7ubifs-y += misc.o
diff --git a/fs/ubifs/misc.c b/fs/ubifs/misc.c
new file mode 100644
index 000000000000..486a2844949f
--- /dev/null
+++ b/fs/ubifs/misc.c
@@ -0,0 +1,57 @@
1#include <linux/kernel.h>
2#include "ubifs.h"
3
4/* Normal UBIFS messages */
5void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...)
6{
7 struct va_format vaf;
8 va_list args;
9
10 va_start(args, fmt);
11
12 vaf.fmt = fmt;
13 vaf.va = &args;
14
15 pr_notice("UBIFS (ubi%d:%d): %pV\n",
16 c->vi.ubi_num, c->vi.vol_id, &vaf);
17
18 va_end(args);
 19}
20
21/* UBIFS error messages */
22void ubifs_err(const struct ubifs_info *c, const char *fmt, ...)
23{
24 struct va_format vaf;
25 va_list args;
26
27 va_start(args, fmt);
28
29 vaf.fmt = fmt;
30 vaf.va = &args;
31
32 pr_err("UBIFS error (ubi%d:%d pid %d): %ps: %pV\n",
33 c->vi.ubi_num, c->vi.vol_id, current->pid,
34 __builtin_return_address(0),
35 &vaf);
36
37 va_end(args);
 38}
39
40/* UBIFS warning messages */
41void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...)
42{
43 struct va_format vaf;
44 va_list args;
45
46 va_start(args, fmt);
47
48 vaf.fmt = fmt;
49 vaf.va = &args;
50
51 pr_warn("UBIFS warning (ubi%d:%d pid %d): %ps: %pV\n",
52 c->vi.ubi_num, c->vi.vol_id, current->pid,
53 __builtin_return_address(0),
54 &vaf);
55
56 va_end(args);
57}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a5697de763f5..c2a57e193a81 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -42,30 +42,6 @@
42/* Version of this UBIFS implementation */ 42/* Version of this UBIFS implementation */
43#define UBIFS_VERSION 1 43#define UBIFS_VERSION 1
44 44
45/* Normal UBIFS messages */
46#define ubifs_msg(c, fmt, ...) \
47 pr_notice("UBIFS (ubi%d:%d): " fmt "\n", \
48 (c)->vi.ubi_num, (c)->vi.vol_id, ##__VA_ARGS__)
49/* UBIFS error messages */
50#define ubifs_err(c, fmt, ...) \
51 pr_err("UBIFS error (ubi%d:%d pid %d): %s: " fmt "\n", \
52 (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \
53 __func__, ##__VA_ARGS__)
54/* UBIFS warning messages */
55#define ubifs_warn(c, fmt, ...) \
56 pr_warn("UBIFS warning (ubi%d:%d pid %d): %s: " fmt "\n", \
57 (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \
58 __func__, ##__VA_ARGS__)
59/*
60 * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
61 * object as an argument.
62 */
63#define ubifs_errc(c, fmt, ...) \
64 do { \
65 if (!(c)->probing) \
66 ubifs_err(c, fmt, ##__VA_ARGS__); \
67 } while (0)
68
69/* UBIFS file system VFS magic number */ 45/* UBIFS file system VFS magic number */
70#define UBIFS_SUPER_MAGIC 0x24051905 46#define UBIFS_SUPER_MAGIC 0x24051905
71 47
@@ -1802,4 +1778,21 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
1802#include "misc.h" 1778#include "misc.h"
1803#include "key.h" 1779#include "key.h"
1804 1780
1781/* Normal UBIFS messages */
1782__printf(2, 3)
1783void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...);
1784__printf(2, 3)
1785void ubifs_err(const struct ubifs_info *c, const char *fmt, ...);
1786__printf(2, 3)
1787void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...);
1788/*
1789 * A variant of 'ubifs_err()' which takes the UBIFS file-system description
1790 * object as an argument.
1791 */
1792#define ubifs_errc(c, fmt, ...) \
1793do { \
1794 if (!(c)->probing) \
1795 ubifs_err(c, fmt, ##__VA_ARGS__); \
1796} while (0)
1797
1805#endif /* !__UBIFS_H__ */ 1798#endif /* !__UBIFS_H__ */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index c7f4d434d098..b043e044121d 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -59,7 +59,6 @@
59#include <linux/fs.h> 59#include <linux/fs.h>
60#include <linux/slab.h> 60#include <linux/slab.h>
61#include <linux/xattr.h> 61#include <linux/xattr.h>
62#include <linux/posix_acl_xattr.h>
63 62
64/* 63/*
65 * Limit the number of extended attributes per inode so that the total size 64 * Limit the number of extended attributes per inode so that the total size
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 541d9c65014d..b51b371b874a 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,7 +45,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
45 int block, iblock; 45 int block, iblock;
46 loff_t nf_pos; 46 loff_t nf_pos;
47 int flen; 47 int flen;
48 unsigned char *fname = NULL; 48 unsigned char *fname = NULL, *copy_name = NULL;
49 unsigned char *nameptr; 49 unsigned char *nameptr;
50 uint16_t liu; 50 uint16_t liu;
51 uint8_t lfi; 51 uint8_t lfi;
@@ -143,7 +143,15 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
143 if (poffset >= lfi) { 143 if (poffset >= lfi) {
144 nameptr = (char *)(fibh.ebh->b_data + poffset - lfi); 144 nameptr = (char *)(fibh.ebh->b_data + poffset - lfi);
145 } else { 145 } else {
146 nameptr = fname; 146 if (!copy_name) {
147 copy_name = kmalloc(UDF_NAME_LEN,
148 GFP_NOFS);
149 if (!copy_name) {
150 ret = -ENOMEM;
151 goto out;
152 }
153 }
154 nameptr = copy_name;
147 memcpy(nameptr, fi->fileIdent + liu, 155 memcpy(nameptr, fi->fileIdent + liu,
148 lfi - poffset); 156 lfi - poffset);
149 memcpy(nameptr + lfi - poffset, 157 memcpy(nameptr + lfi - poffset,
@@ -185,6 +193,7 @@ out:
185 brelse(fibh.sbh); 193 brelse(fibh.sbh);
186 brelse(epos.bh); 194 brelse(epos.bh);
187 kfree(fname); 195 kfree(fname);
196 kfree(copy_name);
188 197
189 return ret; 198 return ret;
190} 199}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 42eafb91f7ff..a2ba11eca995 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -165,7 +165,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
165 struct fileIdentDesc *fi = NULL; 165 struct fileIdentDesc *fi = NULL;
166 loff_t f_pos; 166 loff_t f_pos;
167 int block, flen; 167 int block, flen;
168 unsigned char *fname = NULL; 168 unsigned char *fname = NULL, *copy_name = NULL;
169 unsigned char *nameptr; 169 unsigned char *nameptr;
170 uint8_t lfi; 170 uint8_t lfi;
171 uint16_t liu; 171 uint16_t liu;
@@ -236,7 +236,15 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
236 nameptr = (uint8_t *)(fibh->ebh->b_data + 236 nameptr = (uint8_t *)(fibh->ebh->b_data +
237 poffset - lfi); 237 poffset - lfi);
238 else { 238 else {
239 nameptr = fname; 239 if (!copy_name) {
240 copy_name = kmalloc(UDF_NAME_LEN,
241 GFP_NOFS);
242 if (!copy_name) {
243 fi = ERR_PTR(-ENOMEM);
244 goto out_err;
245 }
246 }
247 nameptr = copy_name;
240 memcpy(nameptr, fi->fileIdent + liu, 248 memcpy(nameptr, fi->fileIdent + liu,
241 lfi - poffset); 249 lfi - poffset);
242 memcpy(nameptr + lfi - poffset, 250 memcpy(nameptr + lfi - poffset,
@@ -279,6 +287,7 @@ out_err:
279out_ok: 287out_ok:
280 brelse(epos.bh); 288 brelse(epos.bh);
281 kfree(fname); 289 kfree(fname);
290 kfree(copy_name);
282 291
283 return fi; 292 return fi;
284} 293}
@@ -291,7 +300,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
291 struct udf_fileident_bh fibh; 300 struct udf_fileident_bh fibh;
292 struct fileIdentDesc *fi; 301 struct fileIdentDesc *fi;
293 302
294 if (dentry->d_name.len > UDF_NAME_LEN - 2) 303 if (dentry->d_name.len > UDF_NAME_LEN)
295 return ERR_PTR(-ENAMETOOLONG); 304 return ERR_PTR(-ENAMETOOLONG);
296 305
297#ifdef UDF_RECOVERY 306#ifdef UDF_RECOVERY
@@ -351,7 +360,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
351 struct udf_inode_info *dinfo; 360 struct udf_inode_info *dinfo;
352 361
353 fibh->sbh = fibh->ebh = NULL; 362 fibh->sbh = fibh->ebh = NULL;
354 name = kmalloc(UDF_NAME_LEN, GFP_NOFS); 363 name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
355 if (!name) { 364 if (!name) {
356 *err = -ENOMEM; 365 *err = -ENOMEM;
357 goto out_err; 366 goto out_err;
@@ -362,8 +371,9 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
362 *err = -EINVAL; 371 *err = -EINVAL;
363 goto out_err; 372 goto out_err;
364 } 373 }
365 namelen = udf_put_filename(sb, dentry->d_name.name, name, 374 namelen = udf_put_filename(sb, dentry->d_name.name,
366 dentry->d_name.len); 375 dentry->d_name.len,
376 name, UDF_NAME_LEN_CS0);
367 if (!namelen) { 377 if (!namelen) {
368 *err = -ENAMETOOLONG; 378 *err = -ENAMETOOLONG;
369 goto out_err; 379 goto out_err;
@@ -914,7 +924,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
914 924
915 iinfo = UDF_I(inode); 925 iinfo = UDF_I(inode);
916 down_write(&iinfo->i_data_sem); 926 down_write(&iinfo->i_data_sem);
917 name = kmalloc(UDF_NAME_LEN, GFP_NOFS); 927 name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
918 if (!name) { 928 if (!name) {
919 err = -ENOMEM; 929 err = -ENOMEM;
920 goto out_no_entry; 930 goto out_no_entry;
@@ -997,8 +1007,9 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
997 } 1007 }
998 1008
999 if (pc->componentType == 5) { 1009 if (pc->componentType == 5) {
1000 namelen = udf_put_filename(sb, compstart, name, 1010 namelen = udf_put_filename(sb, compstart,
1001 symname - compstart); 1011 symname - compstart,
1012 name, UDF_NAME_LEN_CS0);
1002 if (!namelen) 1013 if (!namelen)
1003 goto out_no_entry; 1014 goto out_no_entry;
1004 1015
diff --git a/fs/udf/super.c b/fs/udf/super.c
index a522c15a0bfd..fa92fe839fda 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -887,18 +887,14 @@ static int udf_find_fileset(struct super_block *sb,
887static int udf_load_pvoldesc(struct super_block *sb, sector_t block) 887static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
888{ 888{
889 struct primaryVolDesc *pvoldesc; 889 struct primaryVolDesc *pvoldesc;
890 struct ustr *instr, *outstr; 890 uint8_t *outstr;
891 struct buffer_head *bh; 891 struct buffer_head *bh;
892 uint16_t ident; 892 uint16_t ident;
893 int ret = -ENOMEM; 893 int ret = -ENOMEM;
894 894
895 instr = kmalloc(sizeof(struct ustr), GFP_NOFS); 895 outstr = kmalloc(128, GFP_NOFS);
896 if (!instr)
897 return -ENOMEM;
898
899 outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
900 if (!outstr) 896 if (!outstr)
901 goto out1; 897 return -ENOMEM;
902 898
903 bh = udf_read_tagged(sb, block, block, &ident); 899 bh = udf_read_tagged(sb, block, block, &ident);
904 if (!bh) { 900 if (!bh) {
@@ -923,31 +919,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
923#endif 919#endif
924 } 920 }
925 921
926 if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) { 922 ret = udf_CS0toUTF8(outstr, 31, pvoldesc->volIdent, 32);
927 ret = udf_CS0toUTF8(outstr, instr); 923 if (ret < 0)
928 if (ret < 0) 924 goto out_bh;
929 goto out_bh;
930 925
931 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, 926 strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
932 outstr->u_len > 31 ? 31 : outstr->u_len); 927 udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
933 udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
934 }
935 928
936 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) { 929 ret = udf_CS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128);
937 ret = udf_CS0toUTF8(outstr, instr); 930 if (ret < 0)
938 if (ret < 0) 931 goto out_bh;
939 goto out_bh;
940 932
941 udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); 933 outstr[ret] = 0;
942 } 934 udf_debug("volSetIdent[] = '%s'\n", outstr);
943 935
944 ret = 0; 936 ret = 0;
945out_bh: 937out_bh:
946 brelse(bh); 938 brelse(bh);
947out2: 939out2:
948 kfree(outstr); 940 kfree(outstr);
949out1:
950 kfree(instr);
951 return ret; 941 return ret;
952} 942}
953 943
@@ -2358,7 +2348,7 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2358 le32_to_cpu(lvidiu->numDirs)) : 0) 2348 le32_to_cpu(lvidiu->numDirs)) : 0)
2359 + buf->f_bfree; 2349 + buf->f_bfree;
2360 buf->f_ffree = buf->f_bfree; 2350 buf->f_ffree = buf->f_bfree;
2361 buf->f_namelen = UDF_NAME_LEN - 2; 2351 buf->f_namelen = UDF_NAME_LEN;
2362 buf->f_fsid.val[0] = (u32)id; 2352 buf->f_fsid.val[0] = (u32)id;
2363 buf->f_fsid.val[1] = (u32)(id >> 32); 2353 buf->f_fsid.val[1] = (u32)(id >> 32);
2364 2354
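The udf_load_pvoldesc() rework above also shows udf_CS0toUTF8()'s new calling convention: a raw output buffer plus its capacity, then a raw CS0 byte string plus its length; it returns the number of UTF-8 bytes written or a negative errno, and the caller NUL-terminates. A sketch mirroring the volSetIdent path:

    uint8_t out[128];
    int len;

    len = udf_CS0toUTF8(out, 127, pvoldesc->volSetIdent, 128);
    if (len < 0)
            goto out_bh;            /* -EINVAL on a bad compression ID */
    out[len] = 0;                   /* caller terminates the string */

The udf_statfs() hunk is consistent with this: UDF_NAME_LEN itself now counts only name bytes (254), so the old "- 2" correction is dropped while the reported f_namelen stays 254.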
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index fa0044b6b81d..972b70625614 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -49,8 +49,8 @@ extern __printf(3, 4) void _udf_warn(struct super_block *sb,
49#define UDF_EXTENT_FLAG_MASK 0xC0000000 49#define UDF_EXTENT_FLAG_MASK 0xC0000000
50 50
51#define UDF_NAME_PAD 4 51#define UDF_NAME_PAD 4
52#define UDF_NAME_LEN 256 52#define UDF_NAME_LEN 254
53#define UDF_PATH_LEN 1023 53#define UDF_NAME_LEN_CS0 255
54 54
55static inline size_t udf_file_entry_alloc_offset(struct inode *inode) 55static inline size_t udf_file_entry_alloc_offset(struct inode *inode)
56{ 56{
@@ -106,12 +106,6 @@ struct generic_desc {
106 __le32 volDescSeqNum; 106 __le32 volDescSeqNum;
107}; 107};
108 108
109struct ustr {
110 uint8_t u_cmpID;
111 uint8_t u_name[UDF_NAME_LEN - 2];
112 uint8_t u_len;
113};
114
115 109
116/* super.c */ 110/* super.c */
117 111
@@ -214,12 +208,11 @@ udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
214} 208}
215 209
216/* unicode.c */ 210/* unicode.c */
217extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *, 211extern int udf_get_filename(struct super_block *, const uint8_t *, int,
218 int); 212 uint8_t *, int);
219extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, 213extern int udf_put_filename(struct super_block *, const uint8_t *, int,
220 int); 214 uint8_t *, int);
221extern int udf_build_ustr(struct ustr *, dstring *, int); 215extern int udf_CS0toUTF8(uint8_t *, int, const uint8_t *, int);
222extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
223 216
224/* ialloc.c */ 217/* ialloc.c */
225extern void udf_free_inode(struct inode *); 218extern void udf_free_inode(struct inode *);
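The split of UDF_NAME_LEN into two constants reflects the on-disk layout: a CS0 name buffer carries a one-byte compression ID ahead of up to 254 bytes of name data (255 bytes total), while 254 is the cap on the converted name the VFS sees. The compression ID also encodes the character width, as the new unicode.c code below exploits; an illustrative decoder:

    /* Illustrative only: the first CS0 byte selects the code-point width. */
    static int cs0_char_width(const uint8_t *cs0)
    {
            uint8_t cmp_id = cs0[0];        /* 8 => 1-byte chars, 16 => 2-byte */

            if (cmp_id != 8 && cmp_id != 16)
                    return -EINVAL;
            return cmp_id >> 3;             /* matches u_ch in udf_name_from_CS0() */
    }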
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index e788a05aab83..3ff42f4437f3 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,199 +28,72 @@
28 28
29#include "udf_sb.h" 29#include "udf_sb.h"
30 30
31static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *, 31static int udf_uni2char_utf8(wchar_t uni,
32 int); 32 unsigned char *out,
33 33 int boundlen)
34static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
35{
36 if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
37 return 0;
38
39 memset(dest, 0, sizeof(struct ustr));
40 memcpy(dest->u_name, src, strlen);
41 dest->u_cmpID = 0x08;
42 dest->u_len = strlen;
43
44 return strlen;
45}
46
47/*
48 * udf_build_ustr
49 */
50int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
51{
52 int usesize;
53
54 if (!dest || !ptr || !size)
55 return -1;
56 BUG_ON(size < 2);
57
58 usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
59 usesize = min(usesize, size - 2);
60 dest->u_cmpID = ptr[0];
61 dest->u_len = usesize;
62 memcpy(dest->u_name, ptr + 1, usesize);
63 memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
64
65 return 0;
66}
67
68/*
69 * udf_build_ustr_exact
70 */
71static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
72{
73 memset(dest, 0, sizeof(struct ustr));
74 dest->u_cmpID = ptr[0];
75 dest->u_len = exactsize - 1;
76 memcpy(dest->u_name, ptr + 1, exactsize - 1);
77}
78
79/*
80 * udf_CS0toUTF8
81 *
82 * PURPOSE
83 * Convert OSTA Compressed Unicode to the UTF-8 equivalent.
84 *
85 * PRE-CONDITIONS
86 * utf Pointer to UTF-8 output buffer.
87 * ocu Pointer to OSTA Compressed Unicode input buffer
88 * of size UDF_NAME_LEN bytes.
89 * both of type "struct ustr *"
90 *
91 * POST-CONDITIONS
92 * <return> >= 0 on success.
93 *
94 * HISTORY
95 * November 12, 1997 - Andrew E. Mileski
96 * Written, tested, and released.
97 */
98int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
99{ 34{
100 const uint8_t *ocu; 35 int u_len = 0;
101 uint8_t cmp_id, ocu_len; 36
102 int i; 37 if (boundlen <= 0)
103 38 return -ENAMETOOLONG;
104 ocu_len = ocu_i->u_len; 39
105 if (ocu_len == 0) { 40 if (uni < 0x80) {
106 memset(utf_o, 0, sizeof(struct ustr)); 41 out[u_len++] = (unsigned char)uni;
107 return 0; 42 } else if (uni < 0x800) {
108 } 43 if (boundlen < 2)
109 44 return -ENAMETOOLONG;
110 cmp_id = ocu_i->u_cmpID; 45 out[u_len++] = (unsigned char)(0xc0 | (uni >> 6));
111 if (cmp_id != 8 && cmp_id != 16) { 46 out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
112 memset(utf_o, 0, sizeof(struct ustr)); 47 } else {
113 pr_err("unknown compression code (%d) stri=%s\n", 48 if (boundlen < 3)
114 cmp_id, ocu_i->u_name); 49 return -ENAMETOOLONG;
115 return -EINVAL; 50 out[u_len++] = (unsigned char)(0xe0 | (uni >> 12));
116 } 51 out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
117 52 out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
118 ocu = ocu_i->u_name;
119 utf_o->u_len = 0;
120 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
121
122 /* Expand OSTA compressed Unicode to Unicode */
123 uint32_t c = ocu[i++];
124 if (cmp_id == 16)
125 c = (c << 8) | ocu[i++];
126
127 /* Compress Unicode to UTF-8 */
128 if (c < 0x80U)
129 utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
130 else if (c < 0x800U) {
131 if (utf_o->u_len > (UDF_NAME_LEN - 4))
132 break;
133 utf_o->u_name[utf_o->u_len++] =
134 (uint8_t)(0xc0 | (c >> 6));
135 utf_o->u_name[utf_o->u_len++] =
136 (uint8_t)(0x80 | (c & 0x3f));
137 } else {
138 if (utf_o->u_len > (UDF_NAME_LEN - 5))
139 break;
140 utf_o->u_name[utf_o->u_len++] =
141 (uint8_t)(0xe0 | (c >> 12));
142 utf_o->u_name[utf_o->u_len++] =
143 (uint8_t)(0x80 |
144 ((c >> 6) & 0x3f));
145 utf_o->u_name[utf_o->u_len++] =
146 (uint8_t)(0x80 | (c & 0x3f));
147 }
148 } 53 }
149 utf_o->u_cmpID = 8; 54 return u_len;
150
151 return utf_o->u_len;
152} 55}
153 56
154/* 57static int udf_char2uni_utf8(const unsigned char *in,
155 * 58 int boundlen,
156 * udf_UTF8toCS0 59 wchar_t *uni)
157 *
158 * PURPOSE
159 * Convert UTF-8 to the OSTA Compressed Unicode equivalent.
160 *
161 * DESCRIPTION
162 * This routine is only called by udf_lookup().
163 *
164 * PRE-CONDITIONS
165 * ocu Pointer to OSTA Compressed Unicode output
166 * buffer of size UDF_NAME_LEN bytes.
167 * utf Pointer to UTF-8 input buffer.
168 * utf_len Length of UTF-8 input buffer in bytes.
169 *
170 * POST-CONDITIONS
171 * <return> Zero on success.
172 *
173 * HISTORY
174 * November 12, 1997 - Andrew E. Mileski
175 * Written, tested, and released.
176 */
177static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
178{ 60{
179 unsigned c, i, max_val, utf_char; 61 unsigned int utf_char;
180 int utf_cnt, u_len, u_ch; 62 unsigned char c;
181 63 int utf_cnt, u_len;
182 memset(ocu, 0, sizeof(dstring) * length);
183 ocu[0] = 8;
184 max_val = 0xffU;
185 u_ch = 1;
186 64
187try_again: 65 utf_char = 0;
188 u_len = 0U; 66 utf_cnt = 0;
189 utf_char = 0U; 67 for (u_len = 0; u_len < boundlen;) {
190 utf_cnt = 0U; 68 c = in[u_len++];
191 for (i = 0U; i < utf->u_len; i++) {
192 /* Name didn't fit? */
193 if (u_len + 1 + u_ch >= length)
194 return 0;
195
196 c = (uint8_t)utf->u_name[i];
197 69
198 /* Complete a multi-byte UTF-8 character */ 70 /* Complete a multi-byte UTF-8 character */
199 if (utf_cnt) { 71 if (utf_cnt) {
200 utf_char = (utf_char << 6) | (c & 0x3fU); 72 utf_char = (utf_char << 6) | (c & 0x3f);
201 if (--utf_cnt) 73 if (--utf_cnt)
202 continue; 74 continue;
203 } else { 75 } else {
204 /* Check for a multi-byte UTF-8 character */ 76 /* Check for a multi-byte UTF-8 character */
205 if (c & 0x80U) { 77 if (c & 0x80) {
206 /* Start a multi-byte UTF-8 character */ 78 /* Start a multi-byte UTF-8 character */
207 if ((c & 0xe0U) == 0xc0U) { 79 if ((c & 0xe0) == 0xc0) {
208 utf_char = c & 0x1fU; 80 utf_char = c & 0x1f;
209 utf_cnt = 1; 81 utf_cnt = 1;
210 } else if ((c & 0xf0U) == 0xe0U) { 82 } else if ((c & 0xf0) == 0xe0) {
211 utf_char = c & 0x0fU; 83 utf_char = c & 0x0f;
212 utf_cnt = 2; 84 utf_cnt = 2;
213 } else if ((c & 0xf8U) == 0xf0U) { 85 } else if ((c & 0xf8) == 0xf0) {
214 utf_char = c & 0x07U; 86 utf_char = c & 0x07;
215 utf_cnt = 3; 87 utf_cnt = 3;
216 } else if ((c & 0xfcU) == 0xf8U) { 88 } else if ((c & 0xfc) == 0xf8) {
217 utf_char = c & 0x03U; 89 utf_char = c & 0x03;
218 utf_cnt = 4; 90 utf_cnt = 4;
219 } else if ((c & 0xfeU) == 0xfcU) { 91 } else if ((c & 0xfe) == 0xfc) {
220 utf_char = c & 0x01U; 92 utf_char = c & 0x01;
221 utf_cnt = 5; 93 utf_cnt = 5;
222 } else { 94 } else {
223 goto error_out; 95 utf_cnt = -1;
96 break;
224 } 97 }
225 continue; 98 continue;
226 } else { 99 } else {
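udf_uni2char_utf8() and udf_char2uni_utf8() above are open-coded converters with the same shape as the NLS table hooks (uni2char/char2uni), which lets UTF-8 mounts share one conversion path with NLS mounts further down in this file. The encoder covers code points needing up to three UTF-8 bytes. A hypothetical standalone check of the two-byte case, mirroring the logic above:

    #include <assert.h>

    static int uni2utf8(unsigned int uni, unsigned char *out)
    {
            if (uni < 0x80) {
                    out[0] = (unsigned char)uni;
                    return 1;
            }
            if (uni < 0x800) {
                    out[0] = 0xc0 | (uni >> 6);
                    out[1] = 0x80 | (uni & 0x3f);
                    return 2;
            }
            out[0] = 0xe0 | (uni >> 12);
            out[1] = 0x80 | ((uni >> 6) & 0x3f);
            out[2] = 0x80 | (uni & 0x3f);
            return 3;
    }

    int main(void)
    {
            unsigned char b[3];

            /* U+00E9 (e acute) encodes as 0xC3 0xA9 */
            assert(uni2utf8(0x00e9, b) == 2 && b[0] == 0xc3 && b[1] == 0xa9);
            return 0;
    }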
@@ -228,97 +101,216 @@ try_again:
228 utf_char = c; 101 utf_char = c;
229 } 102 }
230 } 103 }
231 104 *uni = utf_char;
232 /* Choose no compression if necessary */ 105 break;
233 if (utf_char > max_val) {
234 if (max_val == 0xffU) {
235 max_val = 0xffffU;
236 ocu[0] = (uint8_t)0x10U;
237 u_ch = 2;
238 goto try_again;
239 }
240 goto error_out;
241 }
242
243 if (max_val == 0xffffU)
244 ocu[++u_len] = (uint8_t)(utf_char >> 8);
245 ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
246 } 106 }
247
248 if (utf_cnt) { 107 if (utf_cnt) {
249error_out: 108 *uni = '?';
250 ocu[++u_len] = '?'; 109 return -EINVAL;
251 printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
252 } 110 }
111 return u_len;
112}
253 113
254 ocu[length - 1] = (uint8_t)u_len + 1; 114#define ILLEGAL_CHAR_MARK '_'
115#define EXT_MARK '.'
116#define CRC_MARK '#'
117#define EXT_SIZE 5
118/* Number of chars we need to store generated CRC to make filename unique */
119#define CRC_LEN 5
120
121static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
122 int *str_o_idx,
123 const uint8_t *str_i, int str_i_max_len,
124 int *str_i_idx,
125 int u_ch, int *needsCRC,
126 int (*conv_f)(wchar_t, unsigned char *, int),
127 int translate)
128{
129 uint32_t c;
130 int illChar = 0;
131 int len, gotch = 0;
132
133 for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) {
134 if (*str_o_idx >= str_o_max_len) {
135 *needsCRC = 1;
136 return gotch;
137 }
255 138
256 return u_len + 1; 139 /* Expand OSTA compressed Unicode to Unicode */
140 c = str_i[*str_i_idx];
141 if (u_ch > 1)
142 c = (c << 8) | str_i[*str_i_idx + 1];
143
144 if (translate && (c == '/' || c == 0))
145 illChar = 1;
146 else if (illChar)
147 break;
148 else
149 gotch = 1;
150 }
151 if (illChar) {
152 *needsCRC = 1;
153 c = ILLEGAL_CHAR_MARK;
154 gotch = 1;
155 }
156 if (gotch) {
157 len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
158 /* Valid character? */
159 if (len >= 0)
160 *str_o_idx += len;
161 else if (len == -ENAMETOOLONG) {
162 *needsCRC = 1;
163 gotch = 0;
164 } else {
165 str_o[(*str_o_idx)++] = '?';
166 *needsCRC = 1;
167 }
168 }
169 return gotch;
257} 170}
258 171
259static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, 172static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
260 const struct ustr *ocu_i) 173 const uint8_t *ocu, int ocu_len,
174 int (*conv_f)(wchar_t, unsigned char *, int),
175 int translate)
261{ 176{
262 const uint8_t *ocu; 177 uint32_t c;
263 uint8_t cmp_id, ocu_len; 178 uint8_t cmp_id;
264 int i, len; 179 int idx, len;
180 int u_ch;
181 int needsCRC = 0;
182 int ext_i_len, ext_max_len;
183 int str_o_len = 0; /* Length of resulting output */
184 int ext_o_len = 0; /* Extension output length */
185 int ext_crc_len = 0; /* Extension output length if used with CRC */
186 int i_ext = -1; /* Extension position in input buffer */
187 int o_crc = 0; /* Rightmost possible output pos for CRC+ext */
188 unsigned short valueCRC;
189 uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
190 uint8_t crc[CRC_LEN];
265 191
192 if (str_max_len <= 0)
193 return 0;
266 194
267 ocu_len = ocu_i->u_len;
268 if (ocu_len == 0) { 195 if (ocu_len == 0) {
269 memset(utf_o, 0, sizeof(struct ustr)); 196 memset(str_o, 0, str_max_len);
270 return 0; 197 return 0;
271 } 198 }
272 199
273 cmp_id = ocu_i->u_cmpID; 200 cmp_id = ocu[0];
274 if (cmp_id != 8 && cmp_id != 16) { 201 if (cmp_id != 8 && cmp_id != 16) {
275 memset(utf_o, 0, sizeof(struct ustr)); 202 memset(str_o, 0, str_max_len);
276 pr_err("unknown compression code (%d) stri=%s\n", 203 pr_err("unknown compression code (%d)\n", cmp_id);
277 cmp_id, ocu_i->u_name);
278 return -EINVAL; 204 return -EINVAL;
279 } 205 }
206 u_ch = cmp_id >> 3;
280 207
281 ocu = ocu_i->u_name; 208 ocu++;
282 utf_o->u_len = 0; 209 ocu_len--;
283 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
284 /* Expand OSTA compressed Unicode to Unicode */
285 uint32_t c = ocu[i++];
286 if (cmp_id == 16)
287 c = (c << 8) | ocu[i++];
288 210
289 len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 211 if (ocu_len % u_ch) {
290 UDF_NAME_LEN - 2 - utf_o->u_len); 212 pr_err("incorrect filename length (%d)\n", ocu_len + 1);
291 /* Valid character? */ 213 return -EINVAL;
292 if (len >= 0) 214 }
293 utf_o->u_len += len; 215
294 else 216 if (translate) {
295 utf_o->u_name[utf_o->u_len++] = '?'; 217 /* Look for extension */
218 for (idx = ocu_len - u_ch, ext_i_len = 0;
219 (idx >= 0) && (ext_i_len < EXT_SIZE);
220 idx -= u_ch, ext_i_len++) {
221 c = ocu[idx];
222 if (u_ch > 1)
223 c = (c << 8) | ocu[idx + 1];
224
225 if (c == EXT_MARK) {
226 if (ext_i_len)
227 i_ext = idx;
228 break;
229 }
230 }
231 if (i_ext >= 0) {
232 /* Convert extension */
233 ext_max_len = min_t(int, sizeof(ext), str_max_len);
234 ext[ext_o_len++] = EXT_MARK;
235 idx = i_ext + u_ch;
236 while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
237 ocu, ocu_len, &idx,
238 u_ch, &needsCRC,
239 conv_f, translate)) {
240 if ((ext_o_len + CRC_LEN) < str_max_len)
241 ext_crc_len = ext_o_len;
242 }
243 }
296 } 244 }
297 utf_o->u_cmpID = 8;
298 245
299 return utf_o->u_len; 246 idx = 0;
247 while (1) {
248 if (translate && (idx == i_ext)) {
249 if (str_o_len > (str_max_len - ext_o_len))
250 needsCRC = 1;
251 break;
252 }
253
254 if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
255 ocu, ocu_len, &idx,
256 u_ch, &needsCRC, conv_f, translate))
257 break;
258
259 if (translate &&
260 (str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
261 o_crc = str_o_len;
262 }
263
264 if (translate) {
265 if (str_o_len <= 2 && str_o[0] == '.' &&
266 (str_o_len == 1 || str_o[1] == '.'))
267 needsCRC = 1;
268 if (needsCRC) {
269 str_o_len = o_crc;
270 valueCRC = crc_itu_t(0, ocu, ocu_len);
271 crc[0] = CRC_MARK;
272 crc[1] = hex_asc_upper_hi(valueCRC >> 8);
273 crc[2] = hex_asc_upper_lo(valueCRC >> 8);
274 crc[3] = hex_asc_upper_hi(valueCRC);
275 crc[4] = hex_asc_upper_lo(valueCRC);
276 len = min_t(int, CRC_LEN, str_max_len - str_o_len);
277 memcpy(&str_o[str_o_len], crc, len);
278 str_o_len += len;
279 ext_o_len = ext_crc_len;
280 }
281 if (ext_o_len > 0) {
282 memcpy(&str_o[str_o_len], ext, ext_o_len);
283 str_o_len += ext_o_len;
284 }
285 }
286
287 return str_o_len;
300} 288}
301 289
302static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, 290static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
303 int length) 291 const uint8_t *str_i, int str_len,
292 int (*conv_f)(const unsigned char *, int, wchar_t *))
304{ 293{
305 int len; 294 int i, len;
306 unsigned i, max_val; 295 unsigned int max_val;
307 uint16_t uni_char; 296 wchar_t uni_char;
308 int u_len, u_ch; 297 int u_len, u_ch;
309 298
310 memset(ocu, 0, sizeof(dstring) * length); 299 if (ocu_max_len <= 0)
300 return 0;
301
302 memset(ocu, 0, ocu_max_len);
311 ocu[0] = 8; 303 ocu[0] = 8;
312 max_val = 0xffU; 304 max_val = 0xff;
313 u_ch = 1; 305 u_ch = 1;
314 306
315try_again: 307try_again:
316 u_len = 0U; 308 u_len = 1;
317 for (i = 0U; i < uni->u_len; i++) { 309 for (i = 0; i < str_len; i++) {
318 /* Name didn't fit? */ 310 /* Name didn't fit? */
319 if (u_len + 1 + u_ch >= length) 311 if (u_len + u_ch > ocu_max_len)
320 return 0; 312 return 0;
321 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); 313 len = conv_f(&str_i[i], str_len - i, &uni_char);
322 if (!len) 314 if (!len)
323 continue; 315 continue;
324 /* Invalid character, deal with it */ 316 /* Invalid character, deal with it */
@@ -328,187 +320,65 @@ try_again:
328 } 320 }
329 321
330 if (uni_char > max_val) { 322 if (uni_char > max_val) {
331 max_val = 0xffffU; 323 max_val = 0xffff;
332 ocu[0] = (uint8_t)0x10U; 324 ocu[0] = 0x10;
333 u_ch = 2; 325 u_ch = 2;
334 goto try_again; 326 goto try_again;
335 } 327 }
336 328
337 if (max_val == 0xffffU) 329 if (max_val == 0xffff)
338 ocu[++u_len] = (uint8_t)(uni_char >> 8); 330 ocu[u_len++] = (uint8_t)(uni_char >> 8);
339 ocu[++u_len] = (uint8_t)(uni_char & 0xffU); 331 ocu[u_len++] = (uint8_t)(uni_char & 0xff);
340 i += len - 1; 332 i += len - 1;
341 } 333 }
342 334
343 ocu[length - 1] = (uint8_t)u_len + 1; 335 return u_len;
344 return u_len + 1;
345} 336}
346 337
347int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen, 338int udf_CS0toUTF8(uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len)
339{
340 return udf_name_from_CS0(utf_o, o_len, ocu_i, i_len,
341 udf_uni2char_utf8, 0);
342}
343
344int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
348 uint8_t *dname, int dlen) 345 uint8_t *dname, int dlen)
349{ 346{
350 struct ustr *filename, *unifilename; 347 int (*conv_f)(wchar_t, unsigned char *, int);
351 int ret; 348 int ret;
352 349
353 if (!slen) 350 if (!slen)
354 return -EIO; 351 return -EIO;
355 352
356 filename = kmalloc(sizeof(struct ustr), GFP_NOFS); 353 if (dlen <= 0)
357 if (!filename) 354 return 0;
358 return -ENOMEM;
359
360 unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
361 if (!unifilename) {
362 ret = -ENOMEM;
363 goto out1;
364 }
365 355
366 udf_build_ustr_exact(unifilename, sname, slen);
367 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 356 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
368 ret = udf_CS0toUTF8(filename, unifilename); 357 conv_f = udf_uni2char_utf8;
369 if (ret < 0) {
370 udf_debug("Failed in udf_get_filename: sname = %s\n",
371 sname);
372 goto out2;
373 }
374 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 358 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
375 ret = udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, 359 conv_f = UDF_SB(sb)->s_nls_map->uni2char;
376 unifilename);
377 if (ret < 0) {
378 udf_debug("Failed in udf_get_filename: sname = %s\n",
379 sname);
380 goto out2;
381 }
382 } else 360 } else
383 BUG(); 361 BUG();
384 362
385 ret = udf_translate_to_linux(dname, dlen, 363 ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
386 filename->u_name, filename->u_len,
387 unifilename->u_name, unifilename->u_len);
388 /* Zero length filename isn't valid... */ 364 /* Zero length filename isn't valid... */
389 if (ret == 0) 365 if (ret == 0)
390 ret = -EINVAL; 366 ret = -EINVAL;
391out2:
392 kfree(unifilename);
393out1:
394 kfree(filename);
395 return ret; 367 return ret;
396} 368}
397 369
398int udf_put_filename(struct super_block *sb, const uint8_t *sname, 370int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
399 uint8_t *dname, int flen) 371 uint8_t *dname, int dlen)
400{ 372{
401 struct ustr unifilename; 373 int (*conv_f)(const unsigned char *, int, wchar_t *);
402 int namelen;
403
404 if (!udf_char_to_ustr(&unifilename, sname, flen))
405 return 0;
406 374
407 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 375 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
408 namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN); 376 conv_f = udf_char2uni_utf8;
409 if (!namelen)
410 return 0;
411 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 377 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
412 namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, 378 conv_f = UDF_SB(sb)->s_nls_map->char2uni;
413 &unifilename, UDF_NAME_LEN);
414 if (!namelen)
415 return 0;
416 } else 379 } else
417 return 0; 380 BUG();
418 381
419 return namelen; 382 return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
420} 383}
421 384
422#define ILLEGAL_CHAR_MARK '_'
423#define EXT_MARK '.'
424#define CRC_MARK '#'
425#define EXT_SIZE 5
426/* Number of chars we need to store generated CRC to make filename unique */
427#define CRC_LEN 5
428
429static int udf_translate_to_linux(uint8_t *newName, int newLen,
430 uint8_t *udfName, int udfLen,
431 uint8_t *fidName, int fidNameLen)
432{
433 int index, newIndex = 0, needsCRC = 0;
434 int extIndex = 0, newExtIndex = 0, hasExt = 0;
435 unsigned short valueCRC;
436 uint8_t curr;
437
438 if (udfName[0] == '.' &&
439 (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
440 needsCRC = 1;
441 newIndex = udfLen;
442 memcpy(newName, udfName, udfLen);
443 } else {
444 for (index = 0; index < udfLen; index++) {
445 curr = udfName[index];
446 if (curr == '/' || curr == 0) {
447 needsCRC = 1;
448 curr = ILLEGAL_CHAR_MARK;
449 while (index + 1 < udfLen &&
450 (udfName[index + 1] == '/' ||
451 udfName[index + 1] == 0))
452 index++;
453 }
454 if (curr == EXT_MARK &&
455 (udfLen - index - 1) <= EXT_SIZE) {
456 if (udfLen == index + 1)
457 hasExt = 0;
458 else {
459 hasExt = 1;
460 extIndex = index;
461 newExtIndex = newIndex;
462 }
463 }
464 if (newIndex < newLen)
465 newName[newIndex++] = curr;
466 else
467 needsCRC = 1;
468 }
469 }
470 if (needsCRC) {
471 uint8_t ext[EXT_SIZE];
472 int localExtIndex = 0;
473
474 if (hasExt) {
475 int maxFilenameLen;
476 for (index = 0;
477 index < EXT_SIZE && extIndex + index + 1 < udfLen;
478 index++) {
479 curr = udfName[extIndex + index + 1];
480
481 if (curr == '/' || curr == 0) {
482 needsCRC = 1;
483 curr = ILLEGAL_CHAR_MARK;
484 while (extIndex + index + 2 < udfLen &&
485 (index + 1 < EXT_SIZE &&
486 (udfName[extIndex + index + 2] == '/' ||
487 udfName[extIndex + index + 2] == 0)))
488 index++;
489 }
490 ext[localExtIndex++] = curr;
491 }
492 maxFilenameLen = newLen - CRC_LEN - localExtIndex;
493 if (newIndex > maxFilenameLen)
494 newIndex = maxFilenameLen;
495 else
496 newIndex = newExtIndex;
497 } else if (newIndex > newLen - CRC_LEN)
498 newIndex = newLen - CRC_LEN;
499 newName[newIndex++] = CRC_MARK;
500 valueCRC = crc_itu_t(0, fidName, fidNameLen);
501 newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
502 newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
503 newName[newIndex++] = hex_asc_upper_hi(valueCRC);
504 newName[newIndex++] = hex_asc_upper_lo(valueCRC);
505
506 if (hasExt) {
507 newName[newIndex++] = EXT_MARK;
508 for (index = 0; index < localExtIndex; index++)
509 newName[newIndex++] = ext[index];
510 }
511 }
512
513 return newIndex;
514}
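The translate path above replaces the old udf_translate_to_linux(): illegal characters ('/' or NUL) collapse to a single '_' and set needsCRC, and when mangling is needed the name is truncated at the last position (o_crc) that leaves room for '#' plus four uppercase hex digits of crc_itu_t() over the raw CS0 bytes, with as much of the extension as fits re-appended afterwards. So a name decoding to "bad/name.txt" first becomes "bad_name.txt" and then something like "bad_name#1A2B.txt" (CRC digits illustrative). A sketch of the suffix construction as done in udf_name_from_CS0():

    unsigned short v = crc_itu_t(0, ocu, ocu_len);  /* over the raw CS0 bytes */
    uint8_t crc[CRC_LEN];

    crc[0] = CRC_MARK;                      /* '#' */
    crc[1] = hex_asc_upper_hi(v >> 8);
    crc[2] = hex_asc_upper_lo(v >> 8);
    crc[3] = hex_asc_upper_hi(v);
    crc[4] = hex_asc_upper_lo(v);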
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index f64639176670..3542d94fddce 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,4 +121,5 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
121xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o 121xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
122xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o 122xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
123xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o 123xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
124xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o 124xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o
125xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 444626ddbd1b..d9b42425291e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -118,8 +118,6 @@ xfs_allocbt_free_block(
118 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, 118 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
119 XFS_EXTENT_BUSY_SKIP_DISCARD); 119 XFS_EXTENT_BUSY_SKIP_DISCARD);
120 xfs_trans_agbtree_delta(cur->bc_tp, -1); 120 xfs_trans_agbtree_delta(cur->bc_tp, -1);
121
122 xfs_trans_binval(cur->bc_tp, bp);
123 return 0; 121 return 0;
124} 122}
125 123
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 919756e3ba53..90928bbe693c 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -24,22 +24,6 @@
24 * Small attribute lists are packed as tightly as possible so as 24 * Small attribute lists are packed as tightly as possible so as
25 * to fit into the literal area of the inode. 25 * to fit into the literal area of the inode.
26 */ 26 */
27
28/*
29 * Entries are packed toward the top as tight as possible.
30 */
31typedef struct xfs_attr_shortform {
32 struct xfs_attr_sf_hdr { /* constant-structure header block */
33 __be16 totsize; /* total bytes in shortform list */
34 __u8 count; /* count of active entries */
35 } hdr;
36 struct xfs_attr_sf_entry {
37 __uint8_t namelen; /* actual length of name (no NULL) */
38 __uint8_t valuelen; /* actual length of value (no NULL) */
39 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
40 __uint8_t nameval[1]; /* name & value bytes concatenated */
41 } list[1]; /* variable sized array */
42} xfs_attr_shortform_t;
43typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; 27typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
44typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; 28typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
45 29
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index ef00156f4f96..041b6948aecc 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -477,10 +477,7 @@ xfs_bmap_check_leaf_extents(
477 } 477 }
478 block = XFS_BUF_TO_BLOCK(bp); 478 block = XFS_BUF_TO_BLOCK(bp);
479 } 479 }
480 if (bp_release) { 480
481 bp_release = 0;
482 xfs_trans_brelse(NULL, bp);
483 }
484 return; 481 return;
485 482
486error0: 483error0:
@@ -912,7 +909,7 @@ xfs_bmap_local_to_extents(
912 * We don't want to deal with the case of keeping inode data inline yet. 909 * We don't want to deal with the case of keeping inode data inline yet.
913 * So sending the data fork of a regular inode is invalid. 910 * So sending the data fork of a regular inode is invalid.
914 */ 911 */
915 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); 912 ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
916 ifp = XFS_IFORK_PTR(ip, whichfork); 913 ifp = XFS_IFORK_PTR(ip, whichfork);
917 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); 914 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
918 915
@@ -1079,7 +1076,7 @@ xfs_bmap_add_attrfork_local(
1079 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) 1076 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
1080 return 0; 1077 return 0;
1081 1078
1082 if (S_ISDIR(ip->i_d.di_mode)) { 1079 if (S_ISDIR(VFS_I(ip)->i_mode)) {
1083 memset(&dargs, 0, sizeof(dargs)); 1080 memset(&dargs, 0, sizeof(dargs));
1084 dargs.geo = ip->i_mount->m_dir_geo; 1081 dargs.geo = ip->i_mount->m_dir_geo;
1085 dargs.dp = ip; 1082 dargs.dp = ip;
@@ -1091,7 +1088,7 @@ xfs_bmap_add_attrfork_local(
1091 return xfs_dir2_sf_to_block(&dargs); 1088 return xfs_dir2_sf_to_block(&dargs);
1092 } 1089 }
1093 1090
1094 if (S_ISLNK(ip->i_d.di_mode)) 1091 if (S_ISLNK(VFS_I(ip)->i_mode))
1095 return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, 1092 return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
1096 flags, XFS_DATA_FORK, 1093 flags, XFS_DATA_FORK,
1097 xfs_symlink_local_to_remote); 1094 xfs_symlink_local_to_remote);
@@ -4721,6 +4718,66 @@ error0:
4721} 4718}
4722 4719
4723/* 4720/*
4721 * When a delalloc extent is split (e.g., due to a hole punch), the original
4722 * indlen reservation must be shared across the two new extents that are left
4723 * behind.
4724 *
4725 * Given the original reservation and the worst case indlen for the two new
4726 * extents (as calculated by xfs_bmap_worst_indlen()), split the original
4727 * reservation fairly across the two new extents. If necessary, steal available
4728 * blocks from a deleted extent to make up a reservation deficiency (e.g., if
4729 * ores == 1). The number of stolen blocks is returned. The availability and
4730 * subsequent accounting of stolen blocks is the responsibility of the caller.
4731 */
4732static xfs_filblks_t
4733xfs_bmap_split_indlen(
4734 xfs_filblks_t ores, /* original res. */
4735 xfs_filblks_t *indlen1, /* ext1 worst indlen */
4736 xfs_filblks_t *indlen2, /* ext2 worst indlen */
4737 xfs_filblks_t avail) /* stealable blocks */
4738{
4739 xfs_filblks_t len1 = *indlen1;
4740 xfs_filblks_t len2 = *indlen2;
4741 xfs_filblks_t nres = len1 + len2; /* new total res. */
4742 xfs_filblks_t stolen = 0;
4743
4744 /*
4745 * Steal as many blocks as we can to try and satisfy the worst case
4746 * indlen for both new extents.
4747 */
4748 while (nres > ores && avail) {
4749 nres--;
4750 avail--;
4751 stolen++;
4752 }
4753
4754 /*
4755 * The only blocks available are those reserved for the original
4756 * extent and what we can steal from the extent being removed.
4757 * If this still isn't enough to satisfy the combined
4758 * requirements for the two new extents, skim blocks off of each
4759 * of the new reservations until they match what is available.
4760 */
4761 while (nres > ores) {
4762 if (len1) {
4763 len1--;
4764 nres--;
4765 }
4766 if (nres == ores)
4767 break;
4768 if (len2) {
4769 len2--;
4770 nres--;
4771 }
4772 }
4773
4774 *indlen1 = len1;
4775 *indlen2 = len2;
4776
4777 return stolen;
4778}
4779
4780/*
4724 * Called by xfs_bmapi to update file extent records and the btree 4781 * Called by xfs_bmapi to update file extent records and the btree
4725 * after removing space (or undoing a delayed allocation). 4782 * after removing space (or undoing a delayed allocation).
4726 */ 4783 */
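xfs_bmap_split_indlen() is easiest to see with numbers. Suppose the original reservation was ores = 5 blocks, the two new extents each have a worst-case indlen of 4 (nres = 8), and avail = 2 blocks can be stolen from the deleted extent: stealing reduces the deficit to 6 vs 5, then one block is skimmed from the first extent, giving indlen1 = 3, indlen2 = 4, stolen = 2, and da_new = 3 + 4 - 2 = 5 = ores. A standalone replica of the logic with those numbers (not kernel code):

    #include <assert.h>

    typedef unsigned long long filblks_t;

    static filblks_t split_indlen(filblks_t ores, filblks_t *len1,
                                  filblks_t *len2, filblks_t avail)
    {
            filblks_t nres = *len1 + *len2;
            filblks_t stolen = 0;

            while (nres > ores && avail) {          /* steal first */
                    nres--; avail--; stolen++;
            }
            while (nres > ores) {                   /* then skim each extent */
                    if (*len1) { (*len1)--; nres--; }
                    if (nres == ores)
                            break;
                    if (*len2) { (*len2)--; nres--; }
            }
            return stolen;
    }

    int main(void)
    {
            filblks_t l1 = 4, l2 = 4;
            filblks_t stolen = split_indlen(5, &l1, &l2, 2);

            assert(stolen == 2 && l1 == 3 && l2 == 4);
            return 0;
    }

The caller in xfs_bmap_del_extent() then shrinks del->br_blockcount by the stolen blocks, which is why the xfs_mod_fdblocks() call in xfs_bunmapi() is deferred until after the delete: only what was genuinely freed gets returned.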
@@ -4984,28 +5041,29 @@ xfs_bmap_del_extent(
4984 XFS_IFORK_NEXT_SET(ip, whichfork, 5041 XFS_IFORK_NEXT_SET(ip, whichfork,
4985 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 5042 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
4986 } else { 5043 } else {
5044 xfs_filblks_t stolen;
4987 ASSERT(whichfork == XFS_DATA_FORK); 5045 ASSERT(whichfork == XFS_DATA_FORK);
4988 temp = xfs_bmap_worst_indlen(ip, temp); 5046
5047 /*
5048 * Distribute the original indlen reservation across the
5049 * two new extents. Steal blocks from the deleted extent
5050 * if necessary. Stealing blocks simply fudges the
5051 * fdblocks accounting in xfs_bunmapi().
5052 */
5053 temp = xfs_bmap_worst_indlen(ip, got.br_blockcount);
5054 temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount);
5055 stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2,
5056 del->br_blockcount);
5057 da_new = temp + temp2 - stolen;
5058 del->br_blockcount -= stolen;
5059
5060 /*
5061 * Set the reservation for each extent. Warn if either
5062 * is zero as this can lead to delalloc problems.
5063 */
5064 WARN_ON_ONCE(!temp || !temp2);
4989 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 5065 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
4990 temp2 = xfs_bmap_worst_indlen(ip, temp2);
4991 new.br_startblock = nullstartblock((int)temp2); 5066 new.br_startblock = nullstartblock((int)temp2);
4992 da_new = temp + temp2;
4993 while (da_new > da_old) {
4994 if (temp) {
4995 temp--;
4996 da_new--;
4997 xfs_bmbt_set_startblock(ep,
4998 nullstartblock((int)temp));
4999 }
5000 if (da_new == da_old)
5001 break;
5002 if (temp2) {
5003 temp2--;
5004 da_new--;
5005 new.br_startblock =
5006 nullstartblock((int)temp2);
5007 }
5008 }
5009 } 5067 }
5010 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 5068 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
5011 xfs_iext_insert(ip, *idx + 1, 1, &new, state); 5069 xfs_iext_insert(ip, *idx + 1, 1, &new, state);
@@ -5210,7 +5268,7 @@ xfs_bunmapi(
5210 * This is better than zeroing it. 5268 * This is better than zeroing it.
5211 */ 5269 */
5212 ASSERT(del.br_state == XFS_EXT_NORM); 5270 ASSERT(del.br_state == XFS_EXT_NORM);
5213 ASSERT(xfs_trans_get_block_res(tp) > 0); 5271 ASSERT(tp->t_blk_res > 0);
5214 /* 5272 /*
5215 * If this spans a realtime extent boundary, 5273 * If this spans a realtime extent boundary,
5216 * chop it back to the start of the one we end at. 5274 * chop it back to the start of the one we end at.
@@ -5241,7 +5299,7 @@ xfs_bunmapi(
5241 del.br_startblock += mod; 5299 del.br_startblock += mod;
5242 } else if ((del.br_startoff == start && 5300 } else if ((del.br_startoff == start &&
5243 (del.br_state == XFS_EXT_UNWRITTEN || 5301 (del.br_state == XFS_EXT_UNWRITTEN ||
5244 xfs_trans_get_block_res(tp) == 0)) || 5302 tp->t_blk_res == 0)) ||
5245 !xfs_sb_version_hasextflgbit(&mp->m_sb)) { 5303 !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
5246 /* 5304 /*
5247 * Can't make it unwritten. There isn't 5305 * Can't make it unwritten. There isn't
@@ -5296,9 +5354,37 @@ xfs_bunmapi(
5296 goto nodelete; 5354 goto nodelete;
5297 } 5355 }
5298 } 5356 }
5357
5358 /*
5359 * If it's the case where the directory code is running
5360 * with no block reservation, and the deleted block is in
5361 * the middle of its extent, and the resulting insert
5362 * of an extent would cause transformation to btree format,
5363 * then reject it. The calling code will then swap
5364 * blocks around instead.
5365 * We have to do this now, rather than waiting for the
5366 * conversion to btree format, since the transaction
5367 * will be dirty.
5368 */
5369 if (!wasdel && tp->t_blk_res == 0 &&
5370 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5371 XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
5372 XFS_IFORK_MAXEXT(ip, whichfork) &&
5373 del.br_startoff > got.br_startoff &&
5374 del.br_startoff + del.br_blockcount <
5375 got.br_startoff + got.br_blockcount) {
5376 error = -ENOSPC;
5377 goto error0;
5378 }
5379
5380 /*
5381 * Unreserve quota and update realtime free space, if
5382 * appropriate. If delayed allocation, update the inode delalloc
5383 * counter now and wait to update the sb counters as
5384 * xfs_bmap_del_extent() might need to borrow some blocks.
5385 */
5299 if (wasdel) { 5386 if (wasdel) {
5300 ASSERT(startblockval(del.br_startblock) > 0); 5387 ASSERT(startblockval(del.br_startblock) > 0);
5301 /* Update realtime/data freespace, unreserve quota */
5302 if (isrt) { 5388 if (isrt) {
5303 xfs_filblks_t rtexts; 5389 xfs_filblks_t rtexts;
5304 5390
@@ -5309,8 +5395,6 @@ xfs_bunmapi(
5309 ip, -((long)del.br_blockcount), 0, 5395 ip, -((long)del.br_blockcount), 0,
5310 XFS_QMOPT_RES_RTBLKS); 5396 XFS_QMOPT_RES_RTBLKS);
5311 } else { 5397 } else {
5312 xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount,
5313 false);
5314 (void)xfs_trans_reserve_quota_nblks(NULL, 5398 (void)xfs_trans_reserve_quota_nblks(NULL,
5315 ip, -((long)del.br_blockcount), 0, 5399 ip, -((long)del.br_blockcount), 0,
5316 XFS_QMOPT_RES_REGBLKS); 5400 XFS_QMOPT_RES_REGBLKS);
@@ -5321,32 +5405,16 @@ xfs_bunmapi(
5321 XFS_BTCUR_BPRV_WASDEL; 5405 XFS_BTCUR_BPRV_WASDEL;
5322 } else if (cur) 5406 } else if (cur)
5323 cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; 5407 cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
5324 /* 5408
5325 * If it's the case where the directory code is running
5326 * with no block reservation, and the deleted block is in
5327 * the middle of its extent, and the resulting insert
5328 * of an extent would cause transformation to btree format,
5329 * then reject it. The calling code will then swap
5330 * blocks around instead.
5331 * We have to do this now, rather than waiting for the
5332 * conversion to btree format, since the transaction
5333 * will be dirty.
5334 */
5335 if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
5336 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5337 XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
5338 XFS_IFORK_MAXEXT(ip, whichfork) &&
5339 del.br_startoff > got.br_startoff &&
5340 del.br_startoff + del.br_blockcount <
5341 got.br_startoff + got.br_blockcount) {
5342 error = -ENOSPC;
5343 goto error0;
5344 }
5345 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, 5409 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
5346 &tmp_logflags, whichfork); 5410 &tmp_logflags, whichfork);
5347 logflags |= tmp_logflags; 5411 logflags |= tmp_logflags;
5348 if (error) 5412 if (error)
5349 goto error0; 5413 goto error0;
5414
5415 if (!isrt && wasdel)
5416 xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
5417
5350 bno = del.br_startoff - 1; 5418 bno = del.br_startoff - 1;
5351nodelete: 5419nodelete:
5352 /* 5420 /*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 1637c37bfbaa..6282f6e708af 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -461,7 +461,7 @@ xfs_bmbt_alloc_block(
461 * reservation amount is insufficient then we may fail a 461 * reservation amount is insufficient then we may fail a
462 * block allocation here and corrupt the filesystem. 462 * block allocation here and corrupt the filesystem.
463 */ 463 */
464 args.minleft = xfs_trans_get_block_res(args.tp); 464 args.minleft = args.tp->t_blk_res;
465 } else if (cur->bc_private.b.flist->xbf_low) { 465 } else if (cur->bc_private.b.flist->xbf_low) {
466 args.type = XFS_ALLOCTYPE_START_BNO; 466 args.type = XFS_ALLOCTYPE_START_BNO;
467 } else { 467 } else {
@@ -470,7 +470,7 @@ xfs_bmbt_alloc_block(
470 470
471 args.minlen = args.maxlen = args.prod = 1; 471 args.minlen = args.maxlen = args.prod = 1;
472 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; 472 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
473 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { 473 if (!args.wasdel && args.tp->t_blk_res == 0) {
474 error = -ENOSPC; 474 error = -ENOSPC;
475 goto error0; 475 goto error0;
476 } 476 }
@@ -531,7 +531,6 @@ xfs_bmbt_free_block(
531 531
532 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 532 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
533 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); 533 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
534 xfs_trans_binval(tp, bp);
535 return 0; 534 return 0;
536} 535}
537 536
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index a0eb18ce3ad3..1f88e1ce770f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -294,6 +294,21 @@ xfs_btree_sblock_verify_crc(
294 return true; 294 return true;
295} 295}
296 296
297static int
298xfs_btree_free_block(
299 struct xfs_btree_cur *cur,
300 struct xfs_buf *bp)
301{
302 int error;
303
304 error = cur->bc_ops->free_block(cur, bp);
305 if (!error) {
306 xfs_trans_binval(cur->bc_tp, bp);
307 XFS_BTREE_STATS_INC(cur, free);
308 }
309 return error;
310}
311
297/* 312/*
298 * Delete the btree cursor. 313 * Delete the btree cursor.
299 */ 314 */
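The new xfs_btree_free_block() wrapper centralizes what every caller used to do by hand: invoke the per-btree ->free_block method and, only on success, invalidate the buffer and bump the free statistic. That is why the allocbt, bmbt, and inobt free_block implementations elsewhere in this diff drop their private xfs_trans_binval() calls. Callers shrink to this pattern (error handling as in xfs_btree_kill_root() below):

    error = xfs_btree_free_block(cur, bp);
    if (error)
            return error;   /* on success the buffer is already invalidated */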
@@ -3209,6 +3224,7 @@ xfs_btree_kill_iroot(
3209 int level; 3224 int level;
3210 int index; 3225 int index;
3211 int numrecs; 3226 int numrecs;
3227 int error;
3212#ifdef DEBUG 3228#ifdef DEBUG
3213 union xfs_btree_ptr ptr; 3229 union xfs_btree_ptr ptr;
3214 int i; 3230 int i;
@@ -3272,8 +3288,6 @@ xfs_btree_kill_iroot(
3272 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3288 cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3273#ifdef DEBUG 3289#ifdef DEBUG
3274 for (i = 0; i < numrecs; i++) { 3290 for (i = 0; i < numrecs; i++) {
3275 int error;
3276
3277 error = xfs_btree_check_ptr(cur, cpp, i, level - 1); 3291 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3278 if (error) { 3292 if (error) {
3279 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3293 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
@@ -3283,8 +3297,11 @@ xfs_btree_kill_iroot(
3283#endif 3297#endif
3284 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); 3298 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3285 3299
3286 cur->bc_ops->free_block(cur, cbp); 3300 error = xfs_btree_free_block(cur, cbp);
3287 XFS_BTREE_STATS_INC(cur, free); 3301 if (error) {
3302 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3303 return error;
3304 }
3288 3305
3289 cur->bc_bufs[level - 1] = NULL; 3306 cur->bc_bufs[level - 1] = NULL;
3290 be16_add_cpu(&block->bb_level, -1); 3307 be16_add_cpu(&block->bb_level, -1);
@@ -3317,14 +3334,12 @@ xfs_btree_kill_root(
3317 */ 3334 */
3318 cur->bc_ops->set_root(cur, newroot, -1); 3335 cur->bc_ops->set_root(cur, newroot, -1);
3319 3336
3320 error = cur->bc_ops->free_block(cur, bp); 3337 error = xfs_btree_free_block(cur, bp);
3321 if (error) { 3338 if (error) {
3322 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3339 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3323 return error; 3340 return error;
3324 } 3341 }
3325 3342
3326 XFS_BTREE_STATS_INC(cur, free);
3327
3328 cur->bc_bufs[level] = NULL; 3343 cur->bc_bufs[level] = NULL;
3329 cur->bc_ra[level] = 0; 3344 cur->bc_ra[level] = 0;
3330 cur->bc_nlevels--; 3345 cur->bc_nlevels--;
@@ -3830,10 +3845,9 @@ xfs_btree_delrec(
3830 } 3845 }
3831 3846
3832 /* Free the deleted block. */ 3847 /* Free the deleted block. */
3833 error = cur->bc_ops->free_block(cur, rbp); 3848 error = xfs_btree_free_block(cur, rbp);
3834 if (error) 3849 if (error)
3835 goto error0; 3850 goto error0;
3836 XFS_BTREE_STATS_INC(cur, free);
3837 3851
3838 /* 3852 /*
3839 * If we joined with the left neighbor, set the buffer in the 3853 * If we joined with the left neighbor, set the buffer in the
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index b14bbd6bb05f..8d4d8bce41bf 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -641,6 +641,22 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
641 */ 641 */
642#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ 642#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
643 643
644/*
645 * Entries are packed toward the top as tight as possible.
646 */
647typedef struct xfs_attr_shortform {
648 struct xfs_attr_sf_hdr { /* constant-structure header block */
649 __be16 totsize; /* total bytes in shortform list */
650 __u8 count; /* count of active entries */
651 } hdr;
652 struct xfs_attr_sf_entry {
653 __uint8_t namelen; /* actual length of name (no NULL) */
654 __uint8_t valuelen; /* actual length of value (no NULL) */
655 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
656 __uint8_t nameval[1]; /* name & value bytes concatenated */
657 } list[1]; /* variable sized array */
658} xfs_attr_shortform_t;
659
644typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ 660typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
645 __be16 base; /* base of free region */ 661 __be16 base; /* base of free region */
646 __be16 size; /* length of free region */ 662 __be16 size; /* length of free region */
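The xfs_attr_shortform definition moves verbatim from xfs_attr_sf.h (removed above) into xfs_da_format.h, keeping the on-disk directory/attribute format structures in one header; the typedefs left behind in xfs_attr_sf.h keep existing users compiling. For reference, one packed entry occupies a fixed 3-byte header plus the concatenated name and value bytes; an illustrative sizing helper (not part of the patch):

    /* Illustrative: bytes used by one shortform entry in the packed list. */
    static inline int attr_sf_entsize(int namelen, int valuelen)
    {
            return 3 + namelen + valuelen;  /* namelen, valuelen, flags + data */
    }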
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 2fb53a5c0a74..af0f9d171f8a 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -176,7 +176,7 @@ xfs_dir_isempty(
176{ 176{
177 xfs_dir2_sf_hdr_t *sfp; 177 xfs_dir2_sf_hdr_t *sfp;
178 178
179 ASSERT(S_ISDIR(dp->i_d.di_mode)); 179 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
180 if (dp->i_d.di_size == 0) /* might happen during shutdown. */ 180 if (dp->i_d.di_size == 0) /* might happen during shutdown. */
181 return 1; 181 return 1;
182 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) 182 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
@@ -231,7 +231,7 @@ xfs_dir_init(
231 struct xfs_da_args *args; 231 struct xfs_da_args *args;
232 int error; 232 int error;
233 233
234 ASSERT(S_ISDIR(dp->i_d.di_mode)); 234 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
235 error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); 235 error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
236 if (error) 236 if (error)
237 return error; 237 return error;
@@ -266,7 +266,7 @@ xfs_dir_createname(
266 int rval; 266 int rval;
267 int v; /* type-checking value */ 267 int v; /* type-checking value */
268 268
269 ASSERT(S_ISDIR(dp->i_d.di_mode)); 269 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
270 if (inum) { 270 if (inum) {
271 rval = xfs_dir_ino_validate(tp->t_mountp, inum); 271 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
272 if (rval) 272 if (rval)
@@ -364,7 +364,7 @@ xfs_dir_lookup(
364 int v; /* type-checking value */ 364 int v; /* type-checking value */
365 int lock_mode; 365 int lock_mode;
366 366
367 ASSERT(S_ISDIR(dp->i_d.di_mode)); 367 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
368 XFS_STATS_INC(dp->i_mount, xs_dir_lookup); 368 XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
369 369
370 /* 370 /*
@@ -443,7 +443,7 @@ xfs_dir_removename(
443 int rval; 443 int rval;
444 int v; /* type-checking value */ 444 int v; /* type-checking value */
445 445
446 ASSERT(S_ISDIR(dp->i_d.di_mode)); 446 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
447 XFS_STATS_INC(dp->i_mount, xs_dir_remove); 447 XFS_STATS_INC(dp->i_mount, xs_dir_remove);
448 448
449 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 449 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
@@ -505,7 +505,7 @@ xfs_dir_replace(
505 int rval; 505 int rval;
506 int v; /* type-checking value */ 506 int v; /* type-checking value */
507 507
508 ASSERT(S_ISDIR(dp->i_d.di_mode)); 508 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
509 509
510 rval = xfs_dir_ino_validate(tp->t_mountp, inum); 510 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
511 if (rval) 511 if (rval)
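These hunks are part of retiring the duplicated mode field in the XFS-private icdinode in favor of the VFS inode's i_mode (see the xfs_inode_buf.c changes below). The conversion is mechanical because VFS_I() already returns the struct inode embedded in struct xfs_inode; its definition, from fs/xfs/xfs_inode.h, is simply:

    static inline struct inode *VFS_I(struct xfs_inode *ip)
    {
            return &ip->i_vnode;
    }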
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 63ee03db796c..75a557432d0f 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -2235,6 +2235,9 @@ xfs_dir2_node_trim_free(
2235 2235
2236 dp = args->dp; 2236 dp = args->dp;
2237 tp = args->trans; 2237 tp = args->trans;
2238
2239 *rvalp = 0;
2240
2238 /* 2241 /*
2239 * Read the freespace block. 2242 * Read the freespace block.
2240 */ 2243 */
@@ -2255,7 +2258,6 @@ xfs_dir2_node_trim_free(
2255 */ 2258 */
2256 if (freehdr.nused > 0) { 2259 if (freehdr.nused > 0) {
2257 xfs_trans_brelse(tp, bp); 2260 xfs_trans_brelse(tp, bp);
2258 *rvalp = 0;
2259 return 0; 2261 return 0;
2260 } 2262 }
2261 /* 2263 /*
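Moving *rvalp = 0 to the top of xfs_dir2_node_trim_free() guarantees the out-parameter is defined on every path, not just the freehdr.nused > 0 one. The general pattern, sketched in isolation:

    /* Sketch: out-parameters get a defined value before any early return. */
    static int trim_free(int nused, int *rvalp)
    {
            *rvalp = 0;             /* defined on every path */
            if (nused > 0)
                    return 0;       /* early return: *rvalp already valid */
            *rvalp = 1;             /* the one path that reports a trim */
            return 0;
    }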
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 66d702e6b9ff..22297f9b0fd5 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2403,8 +2403,8 @@ xfs_ialloc_compute_maxlevels(
2403 2403
2404 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> 2404 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
2405 XFS_INODES_PER_CHUNK_LOG; 2405 XFS_INODES_PER_CHUNK_LOG;
2406 minleafrecs = mp->m_alloc_mnr[0]; 2406 minleafrecs = mp->m_inobt_mnr[0];
2407 minnoderecs = mp->m_alloc_mnr[1]; 2407 minnoderecs = mp->m_inobt_mnr[1];
2408 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; 2408 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
2409 for (level = 1; maxblocks > 1; level++) 2409 for (level = 1; maxblocks > 1; level++)
2410 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; 2410 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
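The one-line fix above matters because xfs_ialloc_compute_maxlevels() sizes the worst-case height of the inode btrees but was using the free-space btree's minimum record counts (m_alloc_mnr) rather than the inobt's (m_inobt_mnr); with the wrong fan-out the computed maximum can be off. The computation itself is a repeated ceiling division, sketched with made-up geometry:

    /* Sketch: tree height needed to index maxleafents records when every
     * block holds at least minrecs entries (geometry is illustrative). */
    static int compute_maxlevels(unsigned long long maxleafents, int minrecs)
    {
            unsigned long long maxblocks =
                    (maxleafents + minrecs - 1) / minrecs;
            int level;

            for (level = 1; maxblocks > 1; level++)
                    maxblocks = (maxblocks + minrecs - 1) / minrecs;
            return level;   /* e.g. 1,000,000 records at minrecs 16 -> 5 */
    }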
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index c679f3c05b63..89c21d771e35 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -125,16 +125,8 @@ xfs_inobt_free_block(
125 struct xfs_btree_cur *cur, 125 struct xfs_btree_cur *cur,
126 struct xfs_buf *bp) 126 struct xfs_buf *bp)
127{ 127{
128 xfs_fsblock_t fsbno; 128 return xfs_free_extent(cur->bc_tp,
129 int error; 129 XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1);
130
131 fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
132 error = xfs_free_extent(cur->bc_tp, fsbno, 1);
133 if (error)
134 return error;
135
136 xfs_trans_binval(cur->bc_tp, bp);
137 return error;
138} 130}
139 131
140STATIC int 132STATIC int
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 1aabfda669b0..9d9559eb2835 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -195,28 +195,50 @@ xfs_imap_to_bp(
195} 195}
196 196
197void 197void
198xfs_dinode_from_disk( 198xfs_inode_from_disk(
199 xfs_icdinode_t *to, 199 struct xfs_inode *ip,
200 xfs_dinode_t *from) 200 struct xfs_dinode *from)
201{ 201{
202 to->di_magic = be16_to_cpu(from->di_magic); 202 struct xfs_icdinode *to = &ip->i_d;
203 to->di_mode = be16_to_cpu(from->di_mode); 203 struct inode *inode = VFS_I(ip);
204 to->di_version = from ->di_version; 204
205
206 /*
207 * Convert v1 inodes immediately to v2 inode format as this is the
208 * minimum inode version format we support in the rest of the code.
209 */
210 to->di_version = from->di_version;
211 if (to->di_version == 1) {
212 set_nlink(inode, be16_to_cpu(from->di_onlink));
213 to->di_projid_lo = 0;
214 to->di_projid_hi = 0;
215 to->di_version = 2;
216 } else {
217 set_nlink(inode, be32_to_cpu(from->di_nlink));
218 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
219 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
220 }
221
205 to->di_format = from->di_format; 222 to->di_format = from->di_format;
206 to->di_onlink = be16_to_cpu(from->di_onlink);
207 to->di_uid = be32_to_cpu(from->di_uid); 223 to->di_uid = be32_to_cpu(from->di_uid);
208 to->di_gid = be32_to_cpu(from->di_gid); 224 to->di_gid = be32_to_cpu(from->di_gid);
209 to->di_nlink = be32_to_cpu(from->di_nlink);
210 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
211 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
212 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
213 to->di_flushiter = be16_to_cpu(from->di_flushiter); 225 to->di_flushiter = be16_to_cpu(from->di_flushiter);
214 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); 226
215 to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); 227 /*
216 to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); 228 * Time is signed, so need to convert to signed 32 bit before
217 to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); 229 * storing in inode timestamp which may be 64 bit. Otherwise
218 to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); 230 * a time before epoch is converted to a time long after epoch
219 to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); 231 * on 64 bit systems.
232 */
233 inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
234 inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
235 inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
236 inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
237 inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
238 inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
239 inode->i_generation = be32_to_cpu(from->di_gen);
240 inode->i_mode = be16_to_cpu(from->di_mode);
241
220 to->di_size = be64_to_cpu(from->di_size); 242 to->di_size = be64_to_cpu(from->di_size);
221 to->di_nblocks = be64_to_cpu(from->di_nblocks); 243 to->di_nblocks = be64_to_cpu(from->di_nblocks);
222 to->di_extsize = be32_to_cpu(from->di_extsize); 244 to->di_extsize = be32_to_cpu(from->di_extsize);
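The (int) casts in the timestamp conversions above are load-bearing: the on-disk fields are signed 32-bit, while tv_sec may be 64-bit in core, so the value must be sign-extended rather than zero-extended or a pre-1970 time turns into one far in the future. A hypothetical illustration:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t on_disk = 0xfffffffeU;         /* two seconds before the epoch */
            int64_t zero_ext = on_disk;             /* 4294967294: far future */
            int64_t sign_ext = (int32_t)on_disk;    /* -2: pre-1970, as intended */

            assert(zero_ext != sign_ext && sign_ext == -2);
            return 0;
    }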
@@ -227,42 +249,96 @@ xfs_dinode_from_disk(
227 to->di_dmevmask = be32_to_cpu(from->di_dmevmask); 249 to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
228 to->di_dmstate = be16_to_cpu(from->di_dmstate); 250 to->di_dmstate = be16_to_cpu(from->di_dmstate);
229 to->di_flags = be16_to_cpu(from->di_flags); 251 to->di_flags = be16_to_cpu(from->di_flags);
230 to->di_gen = be32_to_cpu(from->di_gen);
231 252
232 if (to->di_version == 3) { 253 if (to->di_version == 3) {
233 to->di_changecount = be64_to_cpu(from->di_changecount); 254 inode->i_version = be64_to_cpu(from->di_changecount);
234 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); 255 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
235 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); 256 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
236 to->di_flags2 = be64_to_cpu(from->di_flags2); 257 to->di_flags2 = be64_to_cpu(from->di_flags2);
237 to->di_ino = be64_to_cpu(from->di_ino);
238 to->di_lsn = be64_to_cpu(from->di_lsn);
239 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
240 uuid_copy(&to->di_uuid, &from->di_uuid);
241 } 258 }
242} 259}
243 260
244void 261void
245xfs_dinode_to_disk( 262xfs_inode_to_disk(
246 xfs_dinode_t *to, 263 struct xfs_inode *ip,
247 xfs_icdinode_t *from) 264 struct xfs_dinode *to,
265 xfs_lsn_t lsn)
266{
267 struct xfs_icdinode *from = &ip->i_d;
268 struct inode *inode = VFS_I(ip);
269
270 to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
271 to->di_onlink = 0;
272
273 to->di_version = from->di_version;
274 to->di_format = from->di_format;
275 to->di_uid = cpu_to_be32(from->di_uid);
276 to->di_gid = cpu_to_be32(from->di_gid);
277 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
278 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
279
280 memset(to->di_pad, 0, sizeof(to->di_pad));
281 to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
282 to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
283 to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
284 to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
285 to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
286 to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
287 to->di_nlink = cpu_to_be32(inode->i_nlink);
288 to->di_gen = cpu_to_be32(inode->i_generation);
289 to->di_mode = cpu_to_be16(inode->i_mode);
290
291 to->di_size = cpu_to_be64(from->di_size);
292 to->di_nblocks = cpu_to_be64(from->di_nblocks);
293 to->di_extsize = cpu_to_be32(from->di_extsize);
294 to->di_nextents = cpu_to_be32(from->di_nextents);
295 to->di_anextents = cpu_to_be16(from->di_anextents);
296 to->di_forkoff = from->di_forkoff;
297 to->di_aformat = from->di_aformat;
298 to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
299 to->di_dmstate = cpu_to_be16(from->di_dmstate);
300 to->di_flags = cpu_to_be16(from->di_flags);
301
302 if (from->di_version == 3) {
303 to->di_changecount = cpu_to_be64(inode->i_version);
304 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
305 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
306 to->di_flags2 = cpu_to_be64(from->di_flags2);
307
308 to->di_ino = cpu_to_be64(ip->i_ino);
309 to->di_lsn = cpu_to_be64(lsn);
310 memset(to->di_pad2, 0, sizeof(to->di_pad2));
311 uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
312 to->di_flushiter = 0;
313 } else {
314 to->di_flushiter = cpu_to_be16(from->di_flushiter);
315 }
316}
317
318void
319xfs_log_dinode_to_disk(
320 struct xfs_log_dinode *from,
321 struct xfs_dinode *to)
248{ 322{
249 to->di_magic = cpu_to_be16(from->di_magic); 323 to->di_magic = cpu_to_be16(from->di_magic);
250 to->di_mode = cpu_to_be16(from->di_mode); 324 to->di_mode = cpu_to_be16(from->di_mode);
251 to->di_version = from ->di_version; 325 to->di_version = from->di_version;
252 to->di_format = from->di_format; 326 to->di_format = from->di_format;
253 to->di_onlink = cpu_to_be16(from->di_onlink); 327 to->di_onlink = 0;
254 to->di_uid = cpu_to_be32(from->di_uid); 328 to->di_uid = cpu_to_be32(from->di_uid);
255 to->di_gid = cpu_to_be32(from->di_gid); 329 to->di_gid = cpu_to_be32(from->di_gid);
256 to->di_nlink = cpu_to_be32(from->di_nlink); 330 to->di_nlink = cpu_to_be32(from->di_nlink);
257 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 331 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
258 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 332 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
259 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 333 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
334
260 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 335 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
261 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); 336 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
262 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); 337 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
263 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); 338 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
264 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); 339 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
265 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); 340 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
341
266 to->di_size = cpu_to_be64(from->di_size); 342 to->di_size = cpu_to_be64(from->di_size);
267 to->di_nblocks = cpu_to_be64(from->di_nblocks); 343 to->di_nblocks = cpu_to_be64(from->di_nblocks);
268 to->di_extsize = cpu_to_be32(from->di_extsize); 344 to->di_extsize = cpu_to_be32(from->di_extsize);
@@ -367,13 +443,10 @@ xfs_iread(
367 !(mp->m_flags & XFS_MOUNT_IKEEP)) { 443 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
368 /* initialise the on-disk inode core */ 444 /* initialise the on-disk inode core */
369 memset(&ip->i_d, 0, sizeof(ip->i_d)); 445 memset(&ip->i_d, 0, sizeof(ip->i_d));
370 ip->i_d.di_magic = XFS_DINODE_MAGIC; 446 VFS_I(ip)->i_generation = prandom_u32();
371 ip->i_d.di_gen = prandom_u32(); 447 if (xfs_sb_version_hascrc(&mp->m_sb))
372 if (xfs_sb_version_hascrc(&mp->m_sb)) {
373 ip->i_d.di_version = 3; 448 ip->i_d.di_version = 3;
374 ip->i_d.di_ino = ip->i_ino; 449 else
375 uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid);
376 } else
377 ip->i_d.di_version = 2; 450 ip->i_d.di_version = 2;
378 return 0; 451 return 0;
379 } 452 }
@@ -403,7 +476,7 @@ xfs_iread(
403 * Otherwise, just get the truly permanent information. 476 * Otherwise, just get the truly permanent information.
404 */ 477 */
405 if (dip->di_mode) { 478 if (dip->di_mode) {
406 xfs_dinode_from_disk(&ip->i_d, dip); 479 xfs_inode_from_disk(ip, dip);
407 error = xfs_iformat_fork(ip, dip); 480 error = xfs_iformat_fork(ip, dip);
408 if (error) { 481 if (error) {
409#ifdef DEBUG 482#ifdef DEBUG
@@ -417,16 +490,10 @@ xfs_iread(
417 * Partial initialisation of the in-core inode. Just the bits 490 * Partial initialisation of the in-core inode. Just the bits
418 * that xfs_ialloc won't overwrite or relies on being correct. 491 * that xfs_ialloc won't overwrite or relies on being correct.
419 */ 492 */
420 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
421 ip->i_d.di_version = dip->di_version; 493 ip->i_d.di_version = dip->di_version;
422 ip->i_d.di_gen = be32_to_cpu(dip->di_gen); 494 VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen);
423 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); 495 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
424 496
425 if (dip->di_version == 3) {
426 ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
427 uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
428 }
429
430 /* 497 /*
431 * Make sure to pull in the mode here as well in 498 * Make sure to pull in the mode here as well in
432 * case the inode is released without being used. 499 * case the inode is released without being used.
@@ -434,25 +501,10 @@ xfs_iread(
434 * the inode is already free and not try to mess 501 * the inode is already free and not try to mess
435 * with the uninitialized part of it. 502 * with the uninitialized part of it.
436 */ 503 */
437 ip->i_d.di_mode = 0; 504 VFS_I(ip)->i_mode = 0;
438 }
439
440 /*
441 * Automatically convert version 1 inode formats in memory to version 2
442 * inode format. If the inode is modified, it will get logged and
443 * rewritten as a version 2 inode. We can do this because we set the
444 * superblock feature bit for v2 inodes unconditionally during mount
445 * and it means the rest of the code can assume the inode version is 2
446 * or higher.
447 */
448 if (ip->i_d.di_version == 1) {
449 ip->i_d.di_version = 2;
450 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
451 ip->i_d.di_nlink = ip->i_d.di_onlink;
452 ip->i_d.di_onlink = 0;
453 xfs_set_projid(ip, 0);
454 } 505 }
455 506
507 ASSERT(ip->i_d.di_version >= 2);
456 ip->i_delayed_blks = 0; 508 ip->i_delayed_blks = 0;
457 509
458 /* 510 /*
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 9308c47f2a52..7c4dd321b215 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -20,7 +20,36 @@
20 20
21struct xfs_inode; 21struct xfs_inode;
22struct xfs_dinode; 22struct xfs_dinode;
23struct xfs_icdinode; 23
24/*
25 * In-memory representation of the XFS inode. This is held in the in-core struct
26 * xfs_inode and represents the current on-disk values but the structure is not
27 * in on-disk format. That is, this structure is always translated to on-disk
28 * format specific structures at the appropriate time.
29 */
30struct xfs_icdinode {
31 __int8_t di_version; /* inode version */
32 __int8_t di_format; /* format of di_c data */
33 __uint16_t di_flushiter; /* incremented on flush */
34 __uint32_t di_uid; /* owner's user id */
35 __uint32_t di_gid; /* owner's group id */
36 __uint16_t di_projid_lo; /* lower part of owner's project id */
37 __uint16_t di_projid_hi; /* higher part of owner's project id */
38 xfs_fsize_t di_size; /* number of bytes in file */
39 xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
40 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
41 xfs_extnum_t di_nextents; /* number of extents in data fork */
42 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
43 __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
44 __int8_t di_aformat; /* format of attr fork's data */
45 __uint32_t di_dmevmask; /* DMIG event mask */
46 __uint16_t di_dmstate; /* DMIG state info */
47 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
48
49 __uint64_t di_flags2; /* more random flags */
50
51 xfs_ictimestamp_t di_crtime; /* time created */
52};
24 53
25/* 54/*
26 * Inode location information. Stored in the inode and passed to 55 * Inode location information. Stored in the inode and passed to
@@ -38,8 +67,11 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
38int xfs_iread(struct xfs_mount *, struct xfs_trans *, 67int xfs_iread(struct xfs_mount *, struct xfs_trans *,
39 struct xfs_inode *, uint); 68 struct xfs_inode *, uint);
40void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); 69void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
41void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from); 70void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
42void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from); 71 xfs_lsn_t lsn);
72void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
73void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
74 struct xfs_dinode *to);
43 75
44#if defined(DEBUG) 76#if defined(DEBUG)
45void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); 77void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
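
The header change above completes the split: struct xfs_icdinode is now a purely in-core, native-endian structure, and xfs_inode_to_disk()/xfs_inode_from_disk() translate it to and from the big-endian on-disk dinode only at the boundary. A self-contained sketch of that translate-at-the-boundary pattern, with invented names and a hand-rolled byte swap (assuming a little-endian host; the kernel uses cpu_to_be32() and friends):

#include <stdint.h>
#include <stdio.h>

struct icore {		/* in-core: native endianness */
	uint32_t uid;
	uint64_t size;
};

struct odisk {		/* on-disk: fixed big-endian layout */
	uint32_t uid;
	uint64_t size;
};

static uint32_t swap32(uint32_t x)
{
	return ((x & 0xffu) << 24) | ((x & 0xff00u) << 8) |
	       ((x >> 8) & 0xff00u) | (x >> 24);
}

static uint64_t swap64(uint64_t x)
{
	return ((uint64_t)swap32((uint32_t)x) << 32) | swap32((uint32_t)(x >> 32));
}

/* Translate only at the flush boundary, as xfs_inode_to_disk() does. */
static void icore_to_disk(const struct icore *from, struct odisk *to)
{
	to->uid = swap32(from->uid);
	to->size = swap64(from->size);
}

int main(void)
{
	struct icore ic = { .uid = 1000, .size = 4096 };
	struct odisk od;
	const unsigned char *p = (const unsigned char *)&od.uid;

	icore_to_disk(&ic, &od);
	/* 1000 == 0x3e8, so the on-disk bytes come out as 00 00 03 e8 */
	printf("on-disk uid bytes: %02x %02x %02x %02x\n", p[0], p[1], p[2], p[3]);
	return 0;
}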
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 0defbd02f62d..11faf7df14c8 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -31,6 +31,7 @@
31#include "xfs_error.h" 31#include "xfs_error.h"
32#include "xfs_trace.h" 32#include "xfs_trace.h"
33#include "xfs_attr_sf.h" 33#include "xfs_attr_sf.h"
34#include "xfs_da_format.h"
34 35
35kmem_zone_t *xfs_ifork_zone; 36kmem_zone_t *xfs_ifork_zone;
36 37
@@ -120,7 +121,7 @@ xfs_iformat_fork(
120 return -EFSCORRUPTED; 121 return -EFSCORRUPTED;
121 } 122 }
122 123
123 switch (ip->i_d.di_mode & S_IFMT) { 124 switch (VFS_I(ip)->i_mode & S_IFMT) {
124 case S_IFIFO: 125 case S_IFIFO:
125 case S_IFCHR: 126 case S_IFCHR:
126 case S_IFBLK: 127 case S_IFBLK:
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 265314690415..d54a8018b079 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -290,6 +290,7 @@ typedef struct xfs_inode_log_format_64 {
290 __int32_t ilf_boffset; /* off of inode in buffer */ 290 __int32_t ilf_boffset; /* off of inode in buffer */
291} xfs_inode_log_format_64_t; 291} xfs_inode_log_format_64_t;
292 292
293
293/* 294/*
294 * Flags for xfs_trans_log_inode flags field. 295 * Flags for xfs_trans_log_inode flags field.
295 */ 296 */
@@ -360,15 +361,15 @@ typedef struct xfs_ictimestamp {
360} xfs_ictimestamp_t; 361} xfs_ictimestamp_t;
361 362
362/* 363/*
363 * NOTE: This structure must be kept identical to struct xfs_dinode 364 * Define the format of the inode core that is logged. This structure must be
364 * except for the endianness annotations. 365 * kept identical to struct xfs_dinode except for the endianness annotations.
365 */ 366 */
366typedef struct xfs_icdinode { 367struct xfs_log_dinode {
367 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 368 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
368 __uint16_t di_mode; /* mode and type of file */ 369 __uint16_t di_mode; /* mode and type of file */
369 __int8_t di_version; /* inode version */ 370 __int8_t di_version; /* inode version */
370 __int8_t di_format; /* format of di_c data */ 371 __int8_t di_format; /* format of di_c data */
371 __uint16_t di_onlink; /* old number of links to file */ 372 __uint8_t di_pad3[2]; /* unused in v2/3 inodes */
372 __uint32_t di_uid; /* owner's user id */ 373 __uint32_t di_uid; /* owner's user id */
373 __uint32_t di_gid; /* owner's group id */ 374 __uint32_t di_gid; /* owner's group id */
374 __uint32_t di_nlink; /* number of links to file */ 375 __uint32_t di_nlink; /* number of links to file */
@@ -407,13 +408,13 @@ typedef struct xfs_icdinode {
407 uuid_t di_uuid; /* UUID of the filesystem */ 408 uuid_t di_uuid; /* UUID of the filesystem */
408 409
409 /* structure must be padded to 64 bit alignment */ 410 /* structure must be padded to 64 bit alignment */
410} xfs_icdinode_t; 411};
411 412
412static inline uint xfs_icdinode_size(int version) 413static inline uint xfs_log_dinode_size(int version)
413{ 414{
414 if (version == 3) 415 if (version == 3)
415 return sizeof(struct xfs_icdinode); 416 return sizeof(struct xfs_log_dinode);
416 return offsetof(struct xfs_icdinode, di_next_unlinked); 417 return offsetof(struct xfs_log_dinode, di_next_unlinked);
417} 418}
418 419
419/* 420/*
@@ -495,6 +496,8 @@ enum xfs_blft {
495 XFS_BLFT_ATTR_LEAF_BUF, 496 XFS_BLFT_ATTR_LEAF_BUF,
496 XFS_BLFT_ATTR_RMT_BUF, 497 XFS_BLFT_ATTR_RMT_BUF,
497 XFS_BLFT_SB_BUF, 498 XFS_BLFT_SB_BUF,
499 XFS_BLFT_RTBITMAP_BUF,
500 XFS_BLFT_RTSUMMARY_BUF,
498 XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), 501 XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
499}; 502};
500 503
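
The xfs_log_dinode_size() helper above leans on a layout property: every v3-only field sits after di_next_unlinked, so a v2 inode can log just the prefix of the structure via offsetof(). A compilable miniature of the same trick, with the field set abbreviated and names illustrative:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct log_dinode_sketch {
	uint16_t di_magic;
	uint16_t di_mode;
	int8_t   di_version;
	/* ... remaining v2 core fields elided ... */
	uint32_t di_next_unlinked;	/* first field a v2 inode does not log */
	uint32_t di_crc;		/* v3-only tail: CRC, change count, LSN, ... */
	uint64_t di_lsn;
};

static size_t log_dinode_size(int version)
{
	if (version == 3)
		return sizeof(struct log_dinode_sketch);
	return offsetof(struct log_dinode_sketch, di_next_unlinked);
}

int main(void)
{
	printf("v2 logs %zu bytes, v3 logs %zu bytes\n",
	       log_dinode_size(2), log_dinode_size(3));
	return 0;
}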
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index f51078f1e92a..8eed51275bb3 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -37,7 +37,7 @@ typedef __uint16_t xfs_qwarncnt_t;
37#define XFS_DQ_PROJ 0x0002 /* project quota */ 37#define XFS_DQ_PROJ 0x0002 /* project quota */
38#define XFS_DQ_GROUP 0x0004 /* a group quota */ 38#define XFS_DQ_GROUP 0x0004 /* a group quota */
39#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ 39#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
40#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ 40#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */
41 41
42#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) 42#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
43 43
@@ -116,6 +116,7 @@ typedef __uint16_t xfs_qwarncnt_t;
116#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 116#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
117#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 117#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
118#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ 118#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
119#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */
119 120
120/* 121/*
121 * flags to xfs_trans_mod_dquot to indicate which field needs to be 122 * flags to xfs_trans_mod_dquot to indicate which field needs to be
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 9b59ffa1fc19..951c044e24e4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -42,6 +42,31 @@
42 */ 42 */
43 43
44/* 44/*
45 * Real time buffers need verifiers to avoid runtime warnings during IO.
46 * We don't have anything to verify, however, so these are just dummy
47 * operations.
48 */
49static void
50xfs_rtbuf_verify_read(
51 struct xfs_buf *bp)
52{
53 return;
54}
55
56static void
57xfs_rtbuf_verify_write(
58 struct xfs_buf *bp)
59{
60 return;
61}
62
63const struct xfs_buf_ops xfs_rtbuf_ops = {
64 .name = "rtbuf",
65 .verify_read = xfs_rtbuf_verify_read,
66 .verify_write = xfs_rtbuf_verify_write,
67};
68
69/*
45 * Get a buffer for the bitmap or summary file block specified. 70 * Get a buffer for the bitmap or summary file block specified.
46 * The buffer is returned read and locked. 71 * The buffer is returned read and locked.
47 */ 72 */
@@ -68,9 +93,12 @@ xfs_rtbuf_get(
68 ASSERT(map.br_startblock != NULLFSBLOCK); 93 ASSERT(map.br_startblock != NULLFSBLOCK);
69 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 94 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
70 XFS_FSB_TO_DADDR(mp, map.br_startblock), 95 XFS_FSB_TO_DADDR(mp, map.br_startblock),
71 mp->m_bsize, 0, &bp, NULL); 96 mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
72 if (error) 97 if (error)
73 return error; 98 return error;
99
100 xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
101 : XFS_BLFT_RTBITMAP_BUF);
74 *bpp = bp; 102 *bpp = bp;
75 return 0; 103 return 0;
76} 104}
@@ -983,7 +1011,7 @@ xfs_rtfree_extent(
983 mp->m_sb.sb_rextents) { 1011 mp->m_sb.sb_rextents) {
984 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) 1012 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
985 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 1013 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
986 *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; 1014 *(__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
987 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); 1015 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
988 } 1016 }
989 return 0; 1017 return 0;
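
The rtbitmap hunk above wires the new (deliberately empty) verifiers into the buffer cache through an ops table, then tags each buffer so log recovery can tell bitmap blocks from summary blocks. The shape of that ops-table pattern, reduced to standalone C with stand-in types:

#include <stdio.h>

struct buf {
	const char *what;
};

struct buf_ops {
	const char *name;
	void (*verify_read)(struct buf *bp);
	void (*verify_write)(struct buf *bp);
};

/* Nothing to check in rt bitmap/summary blocks, so the hooks are no-ops. */
static void rtbuf_verify_read(struct buf *bp)  { (void)bp; }
static void rtbuf_verify_write(struct buf *bp) { (void)bp; }

static const struct buf_ops rtbuf_ops = {
	.name = "rtbuf",
	.verify_read = rtbuf_verify_read,
	.verify_write = rtbuf_verify_write,
};

int main(void)
{
	struct buf bp = { .what = "rt bitmap block" };

	/* The I/O path always calls through the attached ops table. */
	rtbuf_ops.verify_read(&bp);
	rtbuf_ops.verify_write(&bp);
	printf("%s verified via '%s' ops\n", bp.what, rtbuf_ops.name);
	return 0;
}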
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index b25bb9a343f3..961e6475a309 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -27,7 +27,6 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
27extern void xfs_perag_put(struct xfs_perag *pag); 27extern void xfs_perag_put(struct xfs_perag *pag);
28extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); 28extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
29 29
30extern void xfs_sb_calc_crc(struct xfs_buf *bp);
31extern void xfs_log_sb(struct xfs_trans *tp); 30extern void xfs_log_sb(struct xfs_trans *tp);
32extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); 31extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
33extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); 32extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 15c3ceb845b9..81ac870834da 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -53,6 +53,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
53extern const struct xfs_buf_ops xfs_sb_buf_ops; 53extern const struct xfs_buf_ops xfs_sb_buf_ops;
54extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; 54extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
55extern const struct xfs_buf_ops xfs_symlink_buf_ops; 55extern const struct xfs_buf_ops xfs_symlink_buf_ops;
56extern const struct xfs_buf_ops xfs_rtbuf_ops;
56 57
57/* 58/*
58 * Transaction types. Used to distinguish types of buffers. These never reach 59 * Transaction types. Used to distinguish types of buffers. These never reach
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a9ebabfe7587..d445a64b979e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,21 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/writeback.h> 37#include <linux/writeback.h>
38 38
39/* flags for direct write completions */
40#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
41#define XFS_DIO_FLAG_APPEND (1 << 1)
42
43/*
44 * structure owned by writepages passed to individual writepage calls
45 */
46struct xfs_writepage_ctx {
47 struct xfs_bmbt_irec imap;
48 bool imap_valid;
49 unsigned int io_type;
50 struct xfs_ioend *ioend;
51 sector_t last_block;
52};
53
39void 54void
40xfs_count_page_state( 55xfs_count_page_state(
41 struct page *page, 56 struct page *page,
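
The XFS_DIO_FLAG_* values defined above are stored directly in the pointer-sized b_private field of the buffer head, which lets direct writes avoid allocating an ioend at all. A minimal sketch of packing flags into an opaque pointer slot; the struct name here is invented:

#include <assert.h>
#include <stdint.h>

#define DIO_FLAG_UNWRITTEN	(1u << 0)
#define DIO_FLAG_APPEND		(1u << 1)

struct bh_sketch {
	void *b_private;	/* pointer-sized scratch space */
};

/* Treat the private pointer as a small bit field, as xfs_map_direct() does. */
static void set_dio_flag(struct bh_sketch *bh, uintptr_t flag)
{
	uintptr_t *flags = (uintptr_t *)&bh->b_private;

	*flags |= flag;
}

int main(void)
{
	struct bh_sketch bh = { .b_private = 0 };

	set_dio_flag(&bh, DIO_FLAG_UNWRITTEN);

	/* Completion later recovers the bits from the opaque value. */
	uintptr_t flags = (uintptr_t)bh.b_private;
	assert(flags & DIO_FLAG_UNWRITTEN);
	assert(!(flags & DIO_FLAG_APPEND));
	return 0;
}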
@@ -214,10 +229,12 @@ xfs_end_io(
214 struct xfs_inode *ip = XFS_I(ioend->io_inode); 229 struct xfs_inode *ip = XFS_I(ioend->io_inode);
215 int error = 0; 230 int error = 0;
216 231
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 232 /*
233 * Set an error if the mount has shut down and proceed with end I/O
234 * processing so it can perform whatever cleanups are necessary.
235 */
236 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 ioend->io_error = -EIO; 237 ioend->io_error = -EIO;
219 goto done;
220 }
221 238
222 /* 239 /*
223 * For unwritten extents we need to issue transactions to convert a 240 * For unwritten extents we need to issue transactions to convert a
@@ -265,7 +282,7 @@ xfs_alloc_ioend(
265 */ 282 */
266 atomic_set(&ioend->io_remaining, 1); 283 atomic_set(&ioend->io_remaining, 1);
267 ioend->io_error = 0; 284 ioend->io_error = 0;
268 ioend->io_list = NULL; 285 INIT_LIST_HEAD(&ioend->io_list);
269 ioend->io_type = type; 286 ioend->io_type = type;
270 ioend->io_inode = inode; 287 ioend->io_inode = inode;
271 ioend->io_buffer_head = NULL; 288 ioend->io_buffer_head = NULL;
@@ -283,8 +300,7 @@ xfs_map_blocks(
283 struct inode *inode, 300 struct inode *inode,
284 loff_t offset, 301 loff_t offset,
285 struct xfs_bmbt_irec *imap, 302 struct xfs_bmbt_irec *imap,
286 int type, 303 int type)
287 int nonblocking)
288{ 304{
289 struct xfs_inode *ip = XFS_I(inode); 305 struct xfs_inode *ip = XFS_I(inode);
290 struct xfs_mount *mp = ip->i_mount; 306 struct xfs_mount *mp = ip->i_mount;
@@ -300,12 +316,7 @@ xfs_map_blocks(
300 if (type == XFS_IO_UNWRITTEN) 316 if (type == XFS_IO_UNWRITTEN)
301 bmapi_flags |= XFS_BMAPI_IGSTATE; 317 bmapi_flags |= XFS_BMAPI_IGSTATE;
302 318
303 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 319 xfs_ilock(ip, XFS_ILOCK_SHARED);
304 if (nonblocking)
305 return -EAGAIN;
306 xfs_ilock(ip, XFS_ILOCK_SHARED);
307 }
308
309 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 320 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
310 (ip->i_df.if_flags & XFS_IFEXTENTS)); 321 (ip->i_df.if_flags & XFS_IFEXTENTS));
311 ASSERT(offset <= mp->m_super->s_maxbytes); 322 ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -341,7 +352,7 @@ xfs_map_blocks(
341 return 0; 352 return 0;
342} 353}
343 354
344STATIC int 355STATIC bool
345xfs_imap_valid( 356xfs_imap_valid(
346 struct inode *inode, 357 struct inode *inode,
347 struct xfs_bmbt_irec *imap, 358 struct xfs_bmbt_irec *imap,
@@ -414,8 +425,7 @@ xfs_start_buffer_writeback(
414STATIC void 425STATIC void
415xfs_start_page_writeback( 426xfs_start_page_writeback(
416 struct page *page, 427 struct page *page,
417 int clear_dirty, 428 int clear_dirty)
418 int buffers)
419{ 429{
420 ASSERT(PageLocked(page)); 430 ASSERT(PageLocked(page));
421 ASSERT(!PageWriteback(page)); 431 ASSERT(!PageWriteback(page));
@@ -434,10 +444,6 @@ xfs_start_page_writeback(
434 set_page_writeback_keepwrite(page); 444 set_page_writeback_keepwrite(page);
435 445
436 unlock_page(page); 446 unlock_page(page);
437
438 /* If no buffers on the page are to be written, finish it here */
439 if (!buffers)
440 end_page_writeback(page);
441} 447}
442 448
443static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) 449static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -446,153 +452,101 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
446} 452}
447 453
448/* 454/*
449 * Submit all of the bios for all of the ioends we have saved up, covering the 455 * Submit all of the bios for an ioend. We are only passed a single ioend at a
450 * initial writepage page and also any probed pages. 456 * time; the caller is responsible for chaining prior to submission.
451 *
452 * Because we may have multiple ioends spanning a page, we need to start
453 * writeback on all the buffers before we submit them for I/O. If we mark the
454 * buffers as we got, then we can end up with a page that only has buffers
455 * marked async write and I/O complete on can occur before we mark the other
456 * buffers async write.
457 *
458 * The end result of this is that we trip a bug in end_page_writeback() because
459 * we call it twice for the one page as the code in end_buffer_async_write()
460 * assumes that all buffers on the page are started at the same time.
461 *
462 * The fix is two passes across the ioend list - one to start writeback on the
463 * buffer_heads, and then submit them for I/O on the second pass.
464 * 457 *
465 * If @fail is non-zero, it means that we have a situation where some part of 458 * If @fail is non-zero, it means that we have a situation where some part of
466 * the submission process has failed after we have marked pages for writeback 459 * the submission process has failed after we have marked pages for writeback
467 * and unlocked them. In this situation, we need to fail the ioend chain rather 460 * and unlocked them. In this situation, we need to fail the ioend chain rather
468 * than submit it to IO. This typically only happens on a filesystem shutdown. 461 * than submit it to IO. This typically only happens on a filesystem shutdown.
469 */ 462 */
470STATIC void 463STATIC int
471xfs_submit_ioend( 464xfs_submit_ioend(
472 struct writeback_control *wbc, 465 struct writeback_control *wbc,
473 xfs_ioend_t *ioend, 466 xfs_ioend_t *ioend,
474 int fail) 467 int status)
475{ 468{
476 xfs_ioend_t *head = ioend;
477 xfs_ioend_t *next;
478 struct buffer_head *bh; 469 struct buffer_head *bh;
479 struct bio *bio; 470 struct bio *bio;
480 sector_t lastblock = 0; 471 sector_t lastblock = 0;
481 472
482 /* Pass 1 - start writeback */ 473 /* Reserve log space if we might write beyond the on-disk inode size. */
483 do { 474 if (!status &&
484 next = ioend->io_list; 475 ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
485 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) 476 status = xfs_setfilesize_trans_alloc(ioend);
486 xfs_start_buffer_writeback(bh); 477 /*
487 } while ((ioend = next) != NULL); 478 * If we are failing the IO now, just mark the ioend with an
479 * error and finish it. This will run IO completion immediately
480 * as there is only one reference to the ioend at this point in
481 * time.
482 */
483 if (status) {
484 ioend->io_error = status;
485 xfs_finish_ioend(ioend);
486 return status;
487 }
488 488
489 /* Pass 2 - submit I/O */ 489 bio = NULL;
490 ioend = head; 490 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
491 do {
492 next = ioend->io_list;
493 bio = NULL;
494 491
495 /* 492 if (!bio) {
496 * If we are failing the IO now, just mark the ioend with an 493retry:
497 * error and finish it. This will run IO completion immediately 494 bio = xfs_alloc_ioend_bio(bh);
498 * as there is only one reference to the ioend at this point in 495 } else if (bh->b_blocknr != lastblock + 1) {
499 * time. 496 xfs_submit_ioend_bio(wbc, ioend, bio);
500 */ 497 goto retry;
501 if (fail) {
502 ioend->io_error = fail;
503 xfs_finish_ioend(ioend);
504 continue;
505 } 498 }
506 499
507 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
508
509 if (!bio) {
510 retry:
511 bio = xfs_alloc_ioend_bio(bh);
512 } else if (bh->b_blocknr != lastblock + 1) {
513 xfs_submit_ioend_bio(wbc, ioend, bio);
514 goto retry;
515 }
516
517 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
518 xfs_submit_ioend_bio(wbc, ioend, bio);
519 goto retry;
520 }
521
522 lastblock = bh->b_blocknr;
523 }
524 if (bio)
525 xfs_submit_ioend_bio(wbc, ioend, bio); 501 xfs_submit_ioend_bio(wbc, ioend, bio);
526 xfs_finish_ioend(ioend); 502 goto retry;
527 } while ((ioend = next) != NULL); 503 }
528}
529
530/*
531 * Cancel submission of all buffer_heads so far in this endio.
532 * Toss the endio too. Only ever called for the initial page
533 * in a writepage request, so only ever one page.
534 */
535STATIC void
536xfs_cancel_ioend(
537 xfs_ioend_t *ioend)
538{
539 xfs_ioend_t *next;
540 struct buffer_head *bh, *next_bh;
541
542 do {
543 next = ioend->io_list;
544 bh = ioend->io_buffer_head;
545 do {
546 next_bh = bh->b_private;
547 clear_buffer_async_write(bh);
548 /*
549 * The unwritten flag is cleared when added to the
550 * ioend. We're not submitting for I/O so mark the
551 * buffer unwritten again for next time around.
552 */
553 if (ioend->io_type == XFS_IO_UNWRITTEN)
554 set_buffer_unwritten(bh);
555 unlock_buffer(bh);
556 } while ((bh = next_bh) != NULL);
557 504
558 mempool_free(ioend, xfs_ioend_pool); 505 lastblock = bh->b_blocknr;
559 } while ((ioend = next) != NULL); 506 }
507 if (bio)
508 xfs_submit_ioend_bio(wbc, ioend, bio);
509 xfs_finish_ioend(ioend);
510 return 0;
560} 511}
561 512
562/* 513/*
563 * Test to see if we've been building up a completion structure for 514 * Test to see if we've been building up a completion structure for
564 * earlier buffers -- if so, we try to append to this ioend if we 515 * earlier buffers -- if so, we try to append to this ioend if we
565 * can, otherwise we finish off any current ioend and start another. 516 * can, otherwise we finish off any current ioend and start another.
566 * Return true if we've finished the given ioend. 517 * Return the ioend we finished off so that the caller can submit it
518 * once it has finished processing the dirty page.
567 */ 519 */
568STATIC void 520STATIC void
569xfs_add_to_ioend( 521xfs_add_to_ioend(
570 struct inode *inode, 522 struct inode *inode,
571 struct buffer_head *bh, 523 struct buffer_head *bh,
572 xfs_off_t offset, 524 xfs_off_t offset,
573 unsigned int type, 525 struct xfs_writepage_ctx *wpc,
574 xfs_ioend_t **result, 526 struct list_head *iolist)
575 int need_ioend)
576{ 527{
577 xfs_ioend_t *ioend = *result; 528 if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
578 529 bh->b_blocknr != wpc->last_block + 1 ||
579 if (!ioend || need_ioend || type != ioend->io_type) { 530 offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
580 xfs_ioend_t *previous = *result; 531 struct xfs_ioend *new;
581 532
582 ioend = xfs_alloc_ioend(inode, type); 533 if (wpc->ioend)
583 ioend->io_offset = offset; 534 list_add(&wpc->ioend->io_list, iolist);
584 ioend->io_buffer_head = bh; 535
585 ioend->io_buffer_tail = bh; 536 new = xfs_alloc_ioend(inode, wpc->io_type);
586 if (previous) 537 new->io_offset = offset;
587 previous->io_list = ioend; 538 new->io_buffer_head = bh;
588 *result = ioend; 539 new->io_buffer_tail = bh;
540 wpc->ioend = new;
589 } else { 541 } else {
590 ioend->io_buffer_tail->b_private = bh; 542 wpc->ioend->io_buffer_tail->b_private = bh;
591 ioend->io_buffer_tail = bh; 543 wpc->ioend->io_buffer_tail = bh;
592 } 544 }
593 545
594 bh->b_private = NULL; 546 bh->b_private = NULL;
595 ioend->io_size += bh->b_size; 547 wpc->ioend->io_size += bh->b_size;
548 wpc->last_block = bh->b_blocknr;
549 xfs_start_buffer_writeback(bh);
596} 550}
597 551
598STATIC void 552STATIC void
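
The rewritten xfs_add_to_ioend() above appends a buffer to the cached ioend only when three things line up: the mapping type, the disk block, and the file offset. Otherwise the cached ioend is pushed onto the caller's local list and a fresh one is started. That decision, isolated as a standalone predicate (types and names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct wpc_sketch {		/* cf. struct xfs_writepage_ctx */
	int io_type;
	uint64_t last_block;
};

struct ioend_sketch {
	int io_type;
	uint64_t io_offset;
	uint64_t io_size;
};

/* True if a buffer at (blocknr, offset) can extend the cached ioend. */
static bool can_append(const struct wpc_sketch *wpc,
		       const struct ioend_sketch *ioend,
		       uint64_t blocknr, uint64_t offset)
{
	if (!ioend)
		return false;
	return wpc->io_type == ioend->io_type &&
	       blocknr == wpc->last_block + 1 &&
	       offset == ioend->io_offset + ioend->io_size;
}

int main(void)
{
	struct wpc_sketch wpc = { .io_type = 1, .last_block = 99 };
	struct ioend_sketch io = { .io_type = 1, .io_offset = 0, .io_size = 4096 };

	printf("contiguous: %d\n", can_append(&wpc, &io, 100, 4096));
	printf("disk gap:   %d\n", can_append(&wpc, &io, 102, 4096));
	return 0;
}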
@@ -678,183 +632,6 @@ xfs_check_page_type(
678 return false; 632 return false;
679} 633}
680 634
681/*
682 * Allocate & map buffers for page given the extent map. Write it out.
683 * Except for the original page of a writepage, this is called on
684 * delalloc/unwritten pages only; for the original page it is possible
685 * that the page has no mapping at all.
686 */
687STATIC int
688xfs_convert_page(
689 struct inode *inode,
690 struct page *page,
691 loff_t tindex,
692 struct xfs_bmbt_irec *imap,
693 xfs_ioend_t **ioendp,
694 struct writeback_control *wbc)
695{
696 struct buffer_head *bh, *head;
697 xfs_off_t end_offset;
698 unsigned long p_offset;
699 unsigned int type;
700 int len, page_dirty;
701 int count = 0, done = 0, uptodate = 1;
702 xfs_off_t offset = page_offset(page);
703
704 if (page->index != tindex)
705 goto fail;
706 if (!trylock_page(page))
707 goto fail;
708 if (PageWriteback(page))
709 goto fail_unlock_page;
710 if (page->mapping != inode->i_mapping)
711 goto fail_unlock_page;
712 if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
713 goto fail_unlock_page;
714
715 /*
716 * page_dirty is initially a count of buffers on the page before
717 * EOF and is decremented as we move each into a cleanable state.
718 *
719 * Derivation:
720 *
721 * End offset is the highest offset that this page should represent.
722 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
723 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
724 * hence give us the correct page_dirty count. On any other page,
725 * it will be zero and in that case we need page_dirty to be the
726 * count of buffers on the page.
727 */
728 end_offset = min_t(unsigned long long,
729 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
730 i_size_read(inode));
731
732 /*
733 * If the current map does not span the entire page we are about to try
734 * to write, then give up. The only way we can write a page that spans
735 * multiple mappings in a single writeback iteration is via the
736 * xfs_vm_writepage() function. Data integrity writeback requires the
737 * entire page to be written in a single attempt, otherwise the part of
738 * the page we don't write here doesn't get written as part of the data
739 * integrity sync.
740 *
741 * For normal writeback, we also don't attempt to write partial pages
742 * here as it simply means that write_cache_pages() will see it under
743 * writeback and ignore the page until some point in the future, at
744 * which time this will be the only page in the file that needs
745 * writeback. Hence for more optimal IO patterns, we should always
746 * avoid partial page writeback due to multiple mappings on a page here.
747 */
748 if (!xfs_imap_valid(inode, imap, end_offset))
749 goto fail_unlock_page;
750
751 len = 1 << inode->i_blkbits;
752 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
753 PAGE_CACHE_SIZE);
754 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
755 page_dirty = p_offset / len;
756
757 /*
758 * The moment we find a buffer that doesn't match our current type
759 * specification or can't be written, abort the loop and start
760 * writeback. As per the above xfs_imap_valid() check, only
761 * xfs_vm_writepage() can handle partial page writeback fully - we are
762 * limited here to the buffers that are contiguous with the current
763 * ioend, and hence a buffer we can't write breaks that contiguity and
764 * we have to defer the rest of the IO to xfs_vm_writepage().
765 */
766 bh = head = page_buffers(page);
767 do {
768 if (offset >= end_offset)
769 break;
770 if (!buffer_uptodate(bh))
771 uptodate = 0;
772 if (!(PageUptodate(page) || buffer_uptodate(bh))) {
773 done = 1;
774 break;
775 }
776
777 if (buffer_unwritten(bh) || buffer_delay(bh) ||
778 buffer_mapped(bh)) {
779 if (buffer_unwritten(bh))
780 type = XFS_IO_UNWRITTEN;
781 else if (buffer_delay(bh))
782 type = XFS_IO_DELALLOC;
783 else
784 type = XFS_IO_OVERWRITE;
785
786 /*
787 * imap should always be valid because of the above
788 * partial page end_offset check on the imap.
789 */
790 ASSERT(xfs_imap_valid(inode, imap, offset));
791
792 lock_buffer(bh);
793 if (type != XFS_IO_OVERWRITE)
794 xfs_map_at_offset(inode, bh, imap, offset);
795 xfs_add_to_ioend(inode, bh, offset, type,
796 ioendp, done);
797
798 page_dirty--;
799 count++;
800 } else {
801 done = 1;
802 break;
803 }
804 } while (offset += len, (bh = bh->b_this_page) != head);
805
806 if (uptodate && bh == head)
807 SetPageUptodate(page);
808
809 if (count) {
810 if (--wbc->nr_to_write <= 0 &&
811 wbc->sync_mode == WB_SYNC_NONE)
812 done = 1;
813 }
814 xfs_start_page_writeback(page, !page_dirty, count);
815
816 return done;
817 fail_unlock_page:
818 unlock_page(page);
819 fail:
820 return 1;
821}
822
823/*
824 * Convert & write out a cluster of pages in the same extent as defined
825 * by mp and following the start page.
826 */
827STATIC void
828xfs_cluster_write(
829 struct inode *inode,
830 pgoff_t tindex,
831 struct xfs_bmbt_irec *imap,
832 xfs_ioend_t **ioendp,
833 struct writeback_control *wbc,
834 pgoff_t tlast)
835{
836 struct pagevec pvec;
837 int done = 0, i;
838
839 pagevec_init(&pvec, 0);
840 while (!done && tindex <= tlast) {
841 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
842
843 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
844 break;
845
846 for (i = 0; i < pagevec_count(&pvec); i++) {
847 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
848 imap, ioendp, wbc);
849 if (done)
850 break;
851 }
852
853 pagevec_release(&pvec);
854 cond_resched();
855 }
856}
857
858STATIC void 635STATIC void
859xfs_vm_invalidatepage( 636xfs_vm_invalidatepage(
860 struct page *page, 637 struct page *page,
@@ -932,6 +709,164 @@ out_invalidate:
932} 709}
933 710
934/* 711/*
712 * We implement an immediate ioend submission policy here to avoid needing to
713 * chain multiple ioends and hence nest mempool allocations which can violate
714 * forward progress guarantees we need to provide. The current ioend we are
715 * adding buffers to is cached on the writepage context, and if the new buffer
716 * does not append to the cached ioend it will create a new ioend and cache that
717 * instead.
718 *
719 * If a new ioend is created and cached, the old ioend is returned and queued
720 * locally for submission once the entire page is processed or an error has been
721 * detected. While ioends are submitted immediately after they are completed,
722 * batching optimisations are provided by higher level block plugging.
723 *
724 * At the end of a writeback pass, there will be a cached ioend remaining on the
725 * writepage context that the caller will need to submit.
726 */
727static int
728xfs_writepage_map(
729 struct xfs_writepage_ctx *wpc,
730 struct writeback_control *wbc,
731 struct inode *inode,
732 struct page *page,
733 loff_t offset,
734 __uint64_t end_offset)
735{
736 LIST_HEAD(submit_list);
737 struct xfs_ioend *ioend, *next;
738 struct buffer_head *bh, *head;
739 ssize_t len = 1 << inode->i_blkbits;
740 int error = 0;
741 int count = 0;
742 int uptodate = 1;
743
744 bh = head = page_buffers(page);
745 offset = page_offset(page);
746 do {
747 if (offset >= end_offset)
748 break;
749 if (!buffer_uptodate(bh))
750 uptodate = 0;
751
752 /*
753 * set_page_dirty dirties all buffers in a page, independent
754 * of their state. The dirty state however is entirely
755 * meaningless for holes (!mapped && uptodate), so skip
756 * buffers covering holes here.
757 */
758 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
759 wpc->imap_valid = false;
760 continue;
761 }
762
763 if (buffer_unwritten(bh)) {
764 if (wpc->io_type != XFS_IO_UNWRITTEN) {
765 wpc->io_type = XFS_IO_UNWRITTEN;
766 wpc->imap_valid = false;
767 }
768 } else if (buffer_delay(bh)) {
769 if (wpc->io_type != XFS_IO_DELALLOC) {
770 wpc->io_type = XFS_IO_DELALLOC;
771 wpc->imap_valid = false;
772 }
773 } else if (buffer_uptodate(bh)) {
774 if (wpc->io_type != XFS_IO_OVERWRITE) {
775 wpc->io_type = XFS_IO_OVERWRITE;
776 wpc->imap_valid = false;
777 }
778 } else {
779 if (PageUptodate(page))
780 ASSERT(buffer_mapped(bh));
781 /*
782 * This buffer is not uptodate and will not be
783 * written to disk. Ensure that we will put any
784 * subsequent writeable buffers into a new
785 * ioend.
786 */
787 wpc->imap_valid = false;
788 continue;
789 }
790
791 if (wpc->imap_valid)
792 wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
793 offset);
794 if (!wpc->imap_valid) {
795 error = xfs_map_blocks(inode, offset, &wpc->imap,
796 wpc->io_type);
797 if (error)
798 goto out;
799 wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
800 offset);
801 }
802 if (wpc->imap_valid) {
803 lock_buffer(bh);
804 if (wpc->io_type != XFS_IO_OVERWRITE)
805 xfs_map_at_offset(inode, bh, &wpc->imap, offset);
806 xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
807 count++;
808 }
809
810 } while (offset += len, ((bh = bh->b_this_page) != head));
811
812 if (uptodate && bh == head)
813 SetPageUptodate(page);
814
815 ASSERT(wpc->ioend || list_empty(&submit_list));
816
817out:
818 /*
819 * On error, we have to fail the ioend here because we have locked
820 * buffers in the ioend. If we don't do this, we'll deadlock
821 * invalidating the page as that tries to lock the buffers on the page.
822 * Also, because we may have set pages under writeback, we have to make
823 * sure we run IO completion to mark the error state of the IO
824 * appropriately, so we can't cancel the ioend directly here. That means
825 * we have to mark this page as under writeback if we included any
826 * buffers from it in the ioend chain so that completion treats it
827 * correctly.
828 *
829 * If we didn't include the page in the ioend, then on error we can
830 * simply discard and unlock it as there are no other users of the page
831 * or its buffers right now. The caller will still need to trigger
832 * submission of outstanding ioends on the writepage context so they are
833 * treated correctly on error.
834 */
835 if (count) {
836 xfs_start_page_writeback(page, !error);
837
838 /*
839 * Preserve the original error if there was one, otherwise catch
840 * submission errors here and propagate into subsequent ioend
841 * submissions.
842 */
843 list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
844 int error2;
845
846 list_del_init(&ioend->io_list);
847 error2 = xfs_submit_ioend(wbc, ioend, error);
848 if (error2 && !error)
849 error = error2;
850 }
851 } else if (error) {
852 xfs_aops_discard_page(page);
853 ClearPageUptodate(page);
854 unlock_page(page);
855 } else {
856 /*
857 * We can end up here with no error and nothing to write if we
858 * race with a partial page truncate on a sub-page block sized
859 * filesystem. In that case we need to mark the page clean.
860 */
861 xfs_start_page_writeback(page, 1);
862 end_page_writeback(page);
863 }
864
865 mapping_set_error(page->mapping, error);
866 return error;
867}
868
869/*
935 * Write out a dirty page. 870 * Write out a dirty page.
936 * 871 *
937 * For delalloc space on the page we need to allocate space and flush it. 872 * For delalloc space on the page we need to allocate space and flush it.
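
One detail of xfs_writepage_map() worth calling out: after an error, every ioend already queued on submit_list is still passed to xfs_submit_ioend(), which fails it through the normal completion path, and only the first error is reported. The control-flow skeleton of that "preserve the first error, drain the rest" loop, as plain C with submit() standing in for xfs_submit_ioend():

#include <stdio.h>

/* Stand-in for xfs_submit_ioend(): a prior error fails the ioend instead. */
static int submit(int id, int status)
{
	if (status) {
		printf("ioend %d: failed through completion with %d\n", id, status);
		return status;
	}
	printf("ioend %d: submitted\n", id);
	return id == 2 ? -5 : 0;	/* pretend ioend 2 hits -EIO */
}

int main(void)
{
	int error = 0;

	for (int id = 0; id < 4; id++) {
		int error2 = submit(id, error);

		if (error2 && !error)
			error = error2;
	}
	printf("first error preserved: %d\n", error);
	return 0;
}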
@@ -940,22 +875,16 @@ out_invalidate:
940 * For any other dirty buffer heads on the page we should flush them. 875 * For any other dirty buffer heads on the page we should flush them.
941 */ 876 */
942STATIC int 877STATIC int
943xfs_vm_writepage( 878xfs_do_writepage(
944 struct page *page, 879 struct page *page,
945 struct writeback_control *wbc) 880 struct writeback_control *wbc,
881 void *data)
946{ 882{
883 struct xfs_writepage_ctx *wpc = data;
947 struct inode *inode = page->mapping->host; 884 struct inode *inode = page->mapping->host;
948 struct buffer_head *bh, *head;
949 struct xfs_bmbt_irec imap;
950 xfs_ioend_t *ioend = NULL, *iohead = NULL;
951 loff_t offset; 885 loff_t offset;
952 unsigned int type;
953 __uint64_t end_offset; 886 __uint64_t end_offset;
954 pgoff_t end_index, last_index; 887 pgoff_t end_index;
955 ssize_t len;
956 int err, imap_valid = 0, uptodate = 1;
957 int count = 0;
958 int nonblocking = 0;
959 888
960 trace_xfs_writepage(inode, page, 0, 0); 889 trace_xfs_writepage(inode, page, 0, 0);
961 890
@@ -982,12 +911,9 @@ xfs_vm_writepage(
982 if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) 911 if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
983 goto redirty; 912 goto redirty;
984 913
985 /* Is this page beyond the end of the file? */
986 offset = i_size_read(inode);
987 end_index = offset >> PAGE_CACHE_SHIFT;
988 last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
989
990 /* 914 /*
915 * Is this page beyond the end of the file?
916 *
991 * The page index is less than the end_index, adjust the end_offset 917 * The page index is less than the end_index, adjust the end_offset
992 * to the highest offset that this page should represent. 918 * to the highest offset that this page should represent.
993 * ----------------------------------------------------- 919 * -----------------------------------------------------
@@ -998,6 +924,8 @@ xfs_vm_writepage(
998 * | desired writeback range | see else | 924 * | desired writeback range | see else |
999 * ---------------------------------^------------------| 925 * ---------------------------------^------------------|
1000 */ 926 */
927 offset = i_size_read(inode);
928 end_index = offset >> PAGE_CACHE_SHIFT;
1001 if (page->index < end_index) 929 if (page->index < end_index)
1002 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; 930 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
1003 else { 931 else {
@@ -1049,152 +977,7 @@ xfs_vm_writepage(
1049 end_offset = offset; 977 end_offset = offset;
1050 } 978 }
1051 979
1052 len = 1 << inode->i_blkbits; 980 return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
1053
1054 bh = head = page_buffers(page);
1055 offset = page_offset(page);
1056 type = XFS_IO_OVERWRITE;
1057
1058 if (wbc->sync_mode == WB_SYNC_NONE)
1059 nonblocking = 1;
1060
1061 do {
1062 int new_ioend = 0;
1063
1064 if (offset >= end_offset)
1065 break;
1066 if (!buffer_uptodate(bh))
1067 uptodate = 0;
1068
1069 /*
1070 * set_page_dirty dirties all buffers in a page, independent
1071 * of their state. The dirty state however is entirely
1072 * meaningless for holes (!mapped && uptodate), so skip
1073 * buffers covering holes here.
1074 */
1075 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1076 imap_valid = 0;
1077 continue;
1078 }
1079
1080 if (buffer_unwritten(bh)) {
1081 if (type != XFS_IO_UNWRITTEN) {
1082 type = XFS_IO_UNWRITTEN;
1083 imap_valid = 0;
1084 }
1085 } else if (buffer_delay(bh)) {
1086 if (type != XFS_IO_DELALLOC) {
1087 type = XFS_IO_DELALLOC;
1088 imap_valid = 0;
1089 }
1090 } else if (buffer_uptodate(bh)) {
1091 if (type != XFS_IO_OVERWRITE) {
1092 type = XFS_IO_OVERWRITE;
1093 imap_valid = 0;
1094 }
1095 } else {
1096 if (PageUptodate(page))
1097 ASSERT(buffer_mapped(bh));
1098 /*
1099 * This buffer is not uptodate and will not be
1100 * written to disk. Ensure that we will put any
1101 * subsequent writeable buffers into a new
1102 * ioend.
1103 */
1104 imap_valid = 0;
1105 continue;
1106 }
1107
1108 if (imap_valid)
1109 imap_valid = xfs_imap_valid(inode, &imap, offset);
1110 if (!imap_valid) {
1111 /*
1112 * If we didn't have a valid mapping then we need to
1113 * put the new mapping into a separate ioend structure.
1114 * This ensures non-contiguous extents always have
1115 * separate ioends, which is particularly important
1116 * for unwritten extent conversion at I/O completion
1117 * time.
1118 */
1119 new_ioend = 1;
1120 err = xfs_map_blocks(inode, offset, &imap, type,
1121 nonblocking);
1122 if (err)
1123 goto error;
1124 imap_valid = xfs_imap_valid(inode, &imap, offset);
1125 }
1126 if (imap_valid) {
1127 lock_buffer(bh);
1128 if (type != XFS_IO_OVERWRITE)
1129 xfs_map_at_offset(inode, bh, &imap, offset);
1130 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1131 new_ioend);
1132 count++;
1133 }
1134
1135 if (!iohead)
1136 iohead = ioend;
1137
1138 } while (offset += len, ((bh = bh->b_this_page) != head));
1139
1140 if (uptodate && bh == head)
1141 SetPageUptodate(page);
1142
1143 xfs_start_page_writeback(page, 1, count);
1144
1145 /* if there is no IO to be submitted for this page, we are done */
1146 if (!ioend)
1147 return 0;
1148
1149 ASSERT(iohead);
1150
1151 /*
1152 * Any errors from this point onwards need to be reported through the IO
1153 * completion path as we have marked the initial page as under writeback
1154 * and unlocked it.
1155 */
1156 if (imap_valid) {
1157 xfs_off_t end_index;
1158
1159 end_index = imap.br_startoff + imap.br_blockcount;
1160
1161 /* to bytes */
1162 end_index <<= inode->i_blkbits;
1163
1164 /* to pages */
1165 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1166
1167 /* check against file size */
1168 if (end_index > last_index)
1169 end_index = last_index;
1170
1171 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1172 wbc, end_index);
1173 }
1174
1175
1176 /*
1177 * Reserve log space if we might write beyond the on-disk inode size.
1178 */
1179 err = 0;
1180 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
1181 err = xfs_setfilesize_trans_alloc(ioend);
1182
1183 xfs_submit_ioend(wbc, iohead, err);
1184
1185 return 0;
1186
1187error:
1188 if (iohead)
1189 xfs_cancel_ioend(iohead);
1190
1191 if (err == -EAGAIN)
1192 goto redirty;
1193
1194 xfs_aops_discard_page(page);
1195 ClearPageUptodate(page);
1196 unlock_page(page);
1197 return err;
1198 981
1199redirty: 982redirty:
1200 redirty_page_for_writepage(wbc, page); 983 redirty_page_for_writepage(wbc, page);
@@ -1203,16 +986,40 @@ redirty:
1203} 986}
1204 987
1205STATIC int 988STATIC int
989xfs_vm_writepage(
990 struct page *page,
991 struct writeback_control *wbc)
992{
993 struct xfs_writepage_ctx wpc = {
994 .io_type = XFS_IO_INVALID,
995 };
996 int ret;
997
998 ret = xfs_do_writepage(page, wbc, &wpc);
999 if (wpc.ioend)
1000 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1001 return ret;
1002}
1003
1004STATIC int
1206xfs_vm_writepages( 1005xfs_vm_writepages(
1207 struct address_space *mapping, 1006 struct address_space *mapping,
1208 struct writeback_control *wbc) 1007 struct writeback_control *wbc)
1209{ 1008{
1009 struct xfs_writepage_ctx wpc = {
1010 .io_type = XFS_IO_INVALID,
1011 };
1012 int ret;
1013
1210 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); 1014 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1211 if (dax_mapping(mapping)) 1015 if (dax_mapping(mapping))
1212 return dax_writeback_mapping_range(mapping, 1016 return dax_writeback_mapping_range(mapping,
1213 xfs_find_bdev_for_inode(mapping->host), wbc); 1017 xfs_find_bdev_for_inode(mapping->host), wbc);
1214 1018
1215 return generic_writepages(mapping, wbc); 1019 ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
1020 if (wpc.ioend)
1021 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1022 return ret;
1216} 1023}
1217 1024
1218/* 1025/*
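
Switching from generic_writepages() to write_cache_pages() is what lets the writepage context survive across pages: the core page walk invokes the callback with an opaque data pointer, so one cached ioend can keep growing across page boundaries. A toy version of that callback-plus-context contract, with all names invented:

#include <stdio.h>

struct ctx_sketch {
	int pages_seen;	/* state shared across every callback invocation */
};

typedef int (*writepage_fn)(int page_index, void *data);

/* Stand-in for write_cache_pages(): walk pages, threading the context. */
static int walk_dirty_pages(int npages, writepage_fn fn, void *data)
{
	int ret = 0;

	for (int i = 0; i < npages && !ret; i++)
		ret = fn(i, data);
	return ret;
}

static int do_writepage_sketch(int page_index, void *data)
{
	struct ctx_sketch *ctx = data;

	ctx->pages_seen++;
	printf("page %d handled with shared context\n", page_index);
	return 0;
}

int main(void)
{
	struct ctx_sketch ctx = { 0 };
	int ret = walk_dirty_pages(3, do_writepage_sketch, &ctx);

	/* The caller submits whatever the context still caches, as above. */
	printf("ret=%d, pages_seen=%d\n", ret, ctx.pages_seen);
	return 0;
}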
@@ -1242,27 +1049,8 @@ xfs_vm_releasepage(
1242} 1049}
1243 1050
1244/* 1051/*
1245 * When we map a DIO buffer, we may need to attach an ioend that describes the 1052 * When we map a DIO buffer, we may need to pass flags to
1246 * type of write IO we are doing. This passes to the completion function the 1053 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
1247 * operations it needs to perform. If the mapping is for an overwrite wholly
1248 * within the EOF then we don't need an ioend and so we don't allocate one.
1249 * This avoids the unnecessary overhead of allocating and freeing ioends for
1250 * workloads that don't require transactions on IO completion.
1251 *
1252 * If we get multiple mappings in a single IO, we might be mapping different
1253 * types. But because the direct IO can only have a single private pointer, we
1254 * need to ensure that:
1255 *
1256 * a) i) the ioend spans the entire region of unwritten mappings; or
1257 * ii) the ioend spans all the mappings that cross or are beyond EOF; and
1258 * b) if it contains unwritten extents, it is *permanently* marked as such
1259 *
1260 * We could do this by chaining ioends like buffered IO does, but we only
1261 * actually get one IO completion callback from the direct IO, and that spans
1262 * the entire IO regardless of how many mappings and IOs are needed to complete
1263 * the DIO. There is only going to be one reference to the ioend and its life
1264 * cycle is constrained by the DIO completion code. Hence we don't need
1265 * reference counting here.
1266 * 1054 *
1267 * Note that for DIO, an IO to the highest supported file block offset (i.e. 1055 * Note that for DIO, an IO to the highest supported file block offset (i.e.
1268 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 1056 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
@@ -1270,68 +1058,26 @@ xfs_vm_releasepage(
1270 * extending the file size. We won't know for sure until IO completion is run 1058 * extending the file size. We won't know for sure until IO completion is run
1271 * and the actual max write offset is communicated to the IO completion 1059 * and the actual max write offset is communicated to the IO completion
1272 * routine. 1060 * routine.
1273 *
1274 * For DAX page faults, we are preparing to never see unwritten extents here,
1275 * nor should we ever extend the inode size. Hence we will soon have nothing to
1276 * do here for this case, ensuring we don't have to provide an IO completion
1277 * callback to free an ioend that we don't actually need for a fault into the
1278 * page at offset (2^63 - 1FSB) bytes.
1279 */ 1061 */
1280
1281static void 1062static void
1282xfs_map_direct( 1063xfs_map_direct(
1283 struct inode *inode, 1064 struct inode *inode,
1284 struct buffer_head *bh_result, 1065 struct buffer_head *bh_result,
1285 struct xfs_bmbt_irec *imap, 1066 struct xfs_bmbt_irec *imap,
1286 xfs_off_t offset, 1067 xfs_off_t offset)
1287 bool dax_fault)
1288{ 1068{
1289 struct xfs_ioend *ioend; 1069 uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
1290 xfs_off_t size = bh_result->b_size; 1070 xfs_off_t size = bh_result->b_size;
1291 int type;
1292
1293 if (ISUNWRITTEN(imap))
1294 type = XFS_IO_UNWRITTEN;
1295 else
1296 type = XFS_IO_OVERWRITE;
1297 1071
1298 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); 1072 trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
1299 1073 ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
1300 if (dax_fault) {
1301 ASSERT(type == XFS_IO_OVERWRITE);
1302 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1303 imap);
1304 return;
1305 }
1306 1074
1307 if (bh_result->b_private) { 1075 if (ISUNWRITTEN(imap)) {
1308 ioend = bh_result->b_private; 1076 *flags |= XFS_DIO_FLAG_UNWRITTEN;
1309 ASSERT(ioend->io_size > 0); 1077 set_buffer_defer_completion(bh_result);
1310 ASSERT(offset >= ioend->io_offset); 1078 } else if (offset + size > i_size_read(inode) || offset + size < 0) {
1311 if (offset + size > ioend->io_offset + ioend->io_size) 1079 *flags |= XFS_DIO_FLAG_APPEND;
1312 ioend->io_size = offset - ioend->io_offset + size;
1313
1314 if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
1315 ioend->io_type = XFS_IO_UNWRITTEN;
1316
1317 trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
1318 ioend->io_size, ioend->io_type,
1319 imap);
1320 } else if (type == XFS_IO_UNWRITTEN ||
1321 offset + size > i_size_read(inode) ||
1322 offset + size < 0) {
1323 ioend = xfs_alloc_ioend(inode, type);
1324 ioend->io_offset = offset;
1325 ioend->io_size = size;
1326
1327 bh_result->b_private = ioend;
1328 set_buffer_defer_completion(bh_result); 1080 set_buffer_defer_completion(bh_result);
1329
1330 trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
1331 imap);
1332 } else {
1333 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1334 imap);
1335 } 1081 }
1336} 1082}
1337 1083
@@ -1502,9 +1248,12 @@ __xfs_get_blocks(
1502 if (ISUNWRITTEN(&imap)) 1248 if (ISUNWRITTEN(&imap))
1503 set_buffer_unwritten(bh_result); 1249 set_buffer_unwritten(bh_result);
1504 /* direct IO needs special help */ 1250 /* direct IO needs special help */
1505 if (create && direct) 1251 if (create && direct) {
1506 xfs_map_direct(inode, bh_result, &imap, offset, 1252 if (dax_fault)
1507 dax_fault); 1253 ASSERT(!ISUNWRITTEN(&imap));
1254 else
1255 xfs_map_direct(inode, bh_result, &imap, offset);
1256 }
1508 } 1257 }
1509 1258
1510 /* 1259 /*
@@ -1574,42 +1323,50 @@ xfs_get_blocks_dax_fault(
1574 return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); 1323 return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
1575} 1324}
1576 1325
1577static void 1326/*
1578__xfs_end_io_direct_write( 1327 * Complete a direct I/O write request.
1579 struct inode *inode, 1328 *
1580 struct xfs_ioend *ioend, 1329 * xfs_map_direct passes us some flags in the private data to tell us what to
1330 * do. If no flags are set, then the write IO is an overwrite wholly within
1331 * the existing allocated file size and so there is nothing for us to do.
1332 *
1333 * Note that in this case the completion can be called in interrupt context,
1334 * whereas if we have flags set we will always be called in task context
1335 * (i.e. from a workqueue).
1336 */
1337STATIC int
1338xfs_end_io_direct_write(
1339 struct kiocb *iocb,
1581 loff_t offset, 1340 loff_t offset,
1582 ssize_t size) 1341 ssize_t size,
1342 void *private)
1583{ 1343{
1584 struct xfs_mount *mp = XFS_I(inode)->i_mount; 1344 struct inode *inode = file_inode(iocb->ki_filp);
1345 struct xfs_inode *ip = XFS_I(inode);
1346 struct xfs_mount *mp = ip->i_mount;
1347 uintptr_t flags = (uintptr_t)private;
1348 int error = 0;
1585 1349
1586 if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) 1350 trace_xfs_end_io_direct_write(ip, offset, size);
1587 goto out_end_io;
1588 1351
1589 /* 1352 if (XFS_FORCED_SHUTDOWN(mp))
1590 * dio completion end_io functions are only called on writes if more 1353 return -EIO;
1591 * than 0 bytes was written.
1592 */
1593 ASSERT(size > 0);
1594 1354
1595 /* 1355 if (size <= 0)
1596 * The ioend only maps whole blocks, while the IO may be sector aligned. 1356 return size;
1597 * Hence the ioend offset/size may not match the IO offset/size exactly.
1598 * Because we don't map overwrites within EOF into the ioend, the offset
1599 * may not match, but only if the endio spans EOF. Either way, write
1600 * the IO sizes into the ioend so that completion processing does the
1601 * right thing.
1602 */
1603 ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
1604 ioend->io_size = size;
1605 ioend->io_offset = offset;
1606 1357
1607 /* 1358 /*
1608 * The ioend tells us whether we are doing unwritten extent conversion 1359 * The flags tell us whether we are doing unwritten extent conversions
1609 * or an append transaction that updates the on-disk file size. These 1360 * or an append transaction that updates the on-disk file size. These
1610 * cases are the only cases where we should *potentially* be needing 1361 * cases are the only cases where we should *potentially* be needing
1611 * to update the VFS inode size. 1362 * to update the VFS inode size.
1612 * 1363 */
1364 if (flags == 0) {
1365 ASSERT(offset + size <= i_size_read(inode));
1366 return 0;
1367 }
1368
1369 /*
1613 * We need to update the in-core inode size here so that we don't end up 1370 * We need to update the in-core inode size here so that we don't end up
1614 * with the on-disk inode size being outside the in-core inode size. We 1371 * with the on-disk inode size being outside the in-core inode size. We
1615 * have no other method of updating EOF for AIO, so always do it here 1372 * have no other method of updating EOF for AIO, so always do it here
@@ -1620,91 +1377,56 @@ __xfs_end_io_direct_write(
1620 * here can result in EOF moving backwards and Bad Things Happen when 1377 * here can result in EOF moving backwards and Bad Things Happen when
1621 * that occurs. 1378 * that occurs.
1622 */ 1379 */
1623 spin_lock(&XFS_I(inode)->i_flags_lock); 1380 spin_lock(&ip->i_flags_lock);
1624 if (offset + size > i_size_read(inode)) 1381 if (offset + size > i_size_read(inode))
1625 i_size_write(inode, offset + size); 1382 i_size_write(inode, offset + size);
1626 spin_unlock(&XFS_I(inode)->i_flags_lock); 1383 spin_unlock(&ip->i_flags_lock);
1627 1384
1628 /* 1385 if (flags & XFS_DIO_FLAG_UNWRITTEN) {
1629 * If we are doing an append IO that needs to update the EOF on disk, 1386 trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
1630 * do the transaction reserve now so we can use common end io
1631 * processing. Stashing the error (if there is one) in the ioend will
1632 * result in the ioend processing passing on the error if it is
1633 * possible as we can't return it from here.
1634 */
1635 if (ioend->io_type == XFS_IO_OVERWRITE)
1636 ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
1637 1387
1638out_end_io: 1388 error = xfs_iomap_write_unwritten(ip, offset, size);
1639 xfs_end_io(&ioend->io_work); 1389 } else if (flags & XFS_DIO_FLAG_APPEND) {
1640 return; 1390 struct xfs_trans *tp;
1641}
1642 1391
1643/* 1392 trace_xfs_end_io_direct_write_append(ip, offset, size);
1644 * Complete a direct I/O write request.
1645 *
1646 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
1647 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1648 * wholly within the EOF and so there is nothing for us to do. Note that in this
1649 * case the completion can be called in interrupt context, whereas if we have an
1650 * ioend we will always be called in task context (i.e. from a workqueue).
1651 */
1652STATIC void
1653xfs_end_io_direct_write(
1654 struct kiocb *iocb,
1655 loff_t offset,
1656 ssize_t size,
1657 void *private)
1658{
1659 struct inode *inode = file_inode(iocb->ki_filp);
1660 struct xfs_ioend *ioend = private;
1661
1662 trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
1663 ioend ? ioend->io_type : 0, NULL);
1664 1393
1665 if (!ioend) { 1394 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1666 ASSERT(offset + size <= i_size_read(inode)); 1395 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
1667 return; 1396 if (error) {
1397 xfs_trans_cancel(tp);
1398 return error;
1399 }
1400 error = xfs_setfilesize(ip, tp, offset, size);
1668 } 1401 }
1669 1402
1670 __xfs_end_io_direct_write(inode, ioend, offset, size); 1403 return error;
1671} 1404}
1672 1405
1673static inline ssize_t 1406STATIC ssize_t
1674xfs_vm_do_dio( 1407xfs_vm_direct_IO(
1675 struct inode *inode,
1676 struct kiocb *iocb, 1408 struct kiocb *iocb,
1677 struct iov_iter *iter, 1409 struct iov_iter *iter,
1678 loff_t offset, 1410 loff_t offset)
1679 void (*endio)(struct kiocb *iocb,
1680 loff_t offset,
1681 ssize_t size,
1682 void *private),
1683 int flags)
1684{ 1411{
1412 struct inode *inode = iocb->ki_filp->f_mapping->host;
1413 dio_iodone_t *endio = NULL;
1414 int flags = 0;
1685 struct block_device *bdev; 1415 struct block_device *bdev;
1686 1416
1687 if (IS_DAX(inode)) 1417 if (iov_iter_rw(iter) == WRITE) {
1418 endio = xfs_end_io_direct_write;
1419 flags = DIO_ASYNC_EXTEND;
1420 }
1421
1422 if (IS_DAX(inode)) {
1688 return dax_do_io(iocb, inode, iter, offset, 1423 return dax_do_io(iocb, inode, iter, offset,
1689 xfs_get_blocks_direct, endio, 0); 1424 xfs_get_blocks_direct, endio, 0);
1425 }
1690 1426
1691 bdev = xfs_find_bdev_for_inode(inode); 1427 bdev = xfs_find_bdev_for_inode(inode);
1692 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1428 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
1693 xfs_get_blocks_direct, endio, NULL, flags); 1429 xfs_get_blocks_direct, endio, NULL, flags);
1694}
1695
1696STATIC ssize_t
1697xfs_vm_direct_IO(
1698 struct kiocb *iocb,
1699 struct iov_iter *iter,
1700 loff_t offset)
1701{
1702 struct inode *inode = iocb->ki_filp->f_mapping->host;
1703
1704 if (iov_iter_rw(iter) == WRITE)
1705 return xfs_vm_do_dio(inode, iocb, iter, offset,
1706 xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
1707 return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
1708} 1430}
1709 1431
1710/* 1432/*
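
The rewritten completion path above no longer threads an ioend through the direct I/O code: xfs_get_blocks_direct encodes at most two bits (XFS_DIO_FLAG_UNWRITTEN, XFS_DIO_FLAG_APPEND) into the dio private pointer, and xfs_end_io_direct_write dispatches on them. A self-contained userspace sketch of that dispatch, with invented flag values and a plain long long standing in for i_size:

#include <stdio.h>

#define DIO_FLAG_UNWRITTEN	(1 << 0)	/* illustrative values */
#define DIO_FLAG_APPEND		(1 << 1)

static int end_io_direct_write(unsigned int flags, long long offset,
			       long long size, long long *i_size)
{
	if (flags == 0)				/* overwrite inside EOF */
		return 0;

	/* Never let the on-disk size pass the in-core size. */
	if (offset + size > *i_size)
		*i_size = offset + size;

	if (flags & DIO_FLAG_UNWRITTEN)
		printf("convert unwritten extent [%lld,%lld)\n",
		       offset, offset + size);
	else if (flags & DIO_FLAG_APPEND)
		printf("log on-disk size %lld\n", offset + size);
	return 0;
}

int main(void)
{
	long long i_size = 4096;

	end_io_direct_write(DIO_FLAG_APPEND, 4096, 4096, &i_size);
	printf("in-core size now %lld\n", i_size);
	return 0;
}

Because the flag word replaces the ioend, the no-op case (flags == 0) can complete in interrupt context, exactly as the deleted comment described for the NULL-private case.
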
@@ -1756,6 +1478,7 @@ xfs_vm_write_failed(
1756 loff_t from = pos & (PAGE_CACHE_SIZE - 1); 1478 loff_t from = pos & (PAGE_CACHE_SIZE - 1);
1757 loff_t to = from + len; 1479 loff_t to = from + len;
1758 struct buffer_head *bh, *head; 1480 struct buffer_head *bh, *head;
1481 struct xfs_mount *mp = XFS_I(inode)->i_mount;
1759 1482
1760 /* 1483 /*
1761 * The request pos offset might be 32 or 64 bit, this is all fine 1484 * The request pos offset might be 32 or 64 bit, this is all fine
@@ -1787,14 +1510,23 @@ xfs_vm_write_failed(
1787 if (block_start >= to) 1510 if (block_start >= to)
1788 break; 1511 break;
1789 1512
1790 if (!buffer_delay(bh)) 1513 /*
1514 * Process delalloc and unwritten buffers beyond EOF. We can
1515 * encounter unwritten buffers in the event that a file has
1516 * post-EOF unwritten extents and an extending write happens to
1517 * fail (e.g., an unaligned write that also involves a delalloc
1518 * to the same page).
1519 */
1520 if (!buffer_delay(bh) && !buffer_unwritten(bh))
1791 continue; 1521 continue;
1792 1522
1793 if (!buffer_new(bh) && block_offset < i_size_read(inode)) 1523 if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
1524 block_offset < i_size_read(inode))
1794 continue; 1525 continue;
1795 1526
1796 xfs_vm_kill_delalloc_range(inode, block_offset, 1527 if (buffer_delay(bh))
1797 block_offset + bh->b_size); 1528 xfs_vm_kill_delalloc_range(inode, block_offset,
1529 block_offset + bh->b_size);
1798 1530
1799 /* 1531 /*
1800 * This buffer does not contain data anymore. Make sure anyone 1532 * This buffer does not contain data anymore. Make sure anyone
@@ -1805,6 +1537,7 @@ xfs_vm_write_failed(
1805 clear_buffer_mapped(bh); 1537 clear_buffer_mapped(bh);
1806 clear_buffer_new(bh); 1538 clear_buffer_new(bh);
1807 clear_buffer_dirty(bh); 1539 clear_buffer_dirty(bh);
1540 clear_buffer_unwritten(bh);
1808 } 1541 }
1809 1542
1810} 1543}
@@ -1828,6 +1561,7 @@ xfs_vm_write_begin(
1828 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1561 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1829 struct page *page; 1562 struct page *page;
1830 int status; 1563 int status;
1564 struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
1831 1565
1832 ASSERT(len <= PAGE_CACHE_SIZE); 1566 ASSERT(len <= PAGE_CACHE_SIZE);
1833 1567
@@ -1836,6 +1570,8 @@ xfs_vm_write_begin(
1836 return -ENOMEM; 1570 return -ENOMEM;
1837 1571
1838 status = __block_write_begin(page, pos, len, xfs_get_blocks); 1572 status = __block_write_begin(page, pos, len, xfs_get_blocks);
1573 if (xfs_mp_fail_writes(mp))
1574 status = -EIO;
1839 if (unlikely(status)) { 1575 if (unlikely(status)) {
1840 struct inode *inode = mapping->host; 1576 struct inode *inode = mapping->host;
1841 size_t isize = i_size_read(inode); 1577 size_t isize = i_size_read(inode);
@@ -1848,6 +1584,8 @@ xfs_vm_write_begin(
1848 * allocated in this write, not blocks that were previously 1584 * allocated in this write, not blocks that were previously
1849 * written successfully. 1585 * written successfully.
1850 */ 1586 */
1587 if (xfs_mp_fail_writes(mp))
1588 isize = 0;
1851 if (pos + len > isize) { 1589 if (pos + len > isize) {
1852 ssize_t start = max_t(ssize_t, pos, isize); 1590 ssize_t start = max_t(ssize_t, pos, isize);
1853 1591
@@ -1957,7 +1695,6 @@ xfs_vm_set_page_dirty(
1957 loff_t end_offset; 1695 loff_t end_offset;
1958 loff_t offset; 1696 loff_t offset;
1959 int newly_dirty; 1697 int newly_dirty;
1960 struct mem_cgroup *memcg;
1961 1698
1962 if (unlikely(!mapping)) 1699 if (unlikely(!mapping))
1963 return !TestSetPageDirty(page); 1700 return !TestSetPageDirty(page);
@@ -1978,10 +1715,10 @@ xfs_vm_set_page_dirty(
1978 } while (bh != head); 1715 } while (bh != head);
1979 } 1716 }
1980 /* 1717 /*
1981 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with 1718 * Lock out page->mem_cgroup migration to keep PageDirty
1982 * per-memcg dirty page counters. 1719 * synchronized with per-memcg dirty page counters.
1983 */ 1720 */
1984 memcg = mem_cgroup_begin_page_stat(page); 1721 lock_page_memcg(page);
1985 newly_dirty = !TestSetPageDirty(page); 1722 newly_dirty = !TestSetPageDirty(page);
1986 spin_unlock(&mapping->private_lock); 1723 spin_unlock(&mapping->private_lock);
1987 1724
@@ -1992,13 +1729,13 @@ xfs_vm_set_page_dirty(
1992 spin_lock_irqsave(&mapping->tree_lock, flags); 1729 spin_lock_irqsave(&mapping->tree_lock, flags);
1993 if (page->mapping) { /* Race with truncate? */ 1730 if (page->mapping) { /* Race with truncate? */
1994 WARN_ON_ONCE(!PageUptodate(page)); 1731 WARN_ON_ONCE(!PageUptodate(page));
1995 account_page_dirtied(page, mapping, memcg); 1732 account_page_dirtied(page, mapping);
1996 radix_tree_tag_set(&mapping->page_tree, 1733 radix_tree_tag_set(&mapping->page_tree,
1997 page_index(page), PAGECACHE_TAG_DIRTY); 1734 page_index(page), PAGECACHE_TAG_DIRTY);
1998 } 1735 }
1999 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1736 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2000 } 1737 }
2001 mem_cgroup_end_page_stat(memcg); 1738 unlock_page_memcg(page);
2002 if (newly_dirty) 1739 if (newly_dirty)
2003 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1740 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
2004 return newly_dirty; 1741 return newly_dirty;
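
The hunk above is a straight conversion to the 4.6 memcg API: mem_cgroup_begin_page_stat()/mem_cgroup_end_page_stat() returned a token that had to be passed to account_page_dirtied(), while lock_page_memcg()/unlock_page_memcg() pin page->mem_cgroup in place so no token is needed. A runnable mock of the new shape, with stub functions standing in for the kernel calls:

#include <stdio.h>

struct page { int dirty; };

static void lock_page_memcg(struct page *p)   { (void)p; /* pin memcg */ }
static void unlock_page_memcg(struct page *p) { (void)p; }

static int test_set_page_dirty(struct page *p)
{
	int was = p->dirty;

	p->dirty = 1;
	return was;
}

int main(void)
{
	struct page pg = { 0 };

	/* 4.6 pattern: no memcg token to carry around any more. */
	lock_page_memcg(&pg);
	if (!test_set_page_dirty(&pg))
		printf("account page as dirty under memcg lock\n");
	unlock_page_memcg(&pg);
	return 0;
}
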
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index a4343c63fb38..b4421177b68d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,12 +24,14 @@ extern mempool_t *xfs_ioend_pool;
24 * Types of I/O for bmap clustering and I/O completion tracking. 24 * Types of I/O for bmap clustering and I/O completion tracking.
25 */ 25 */
26enum { 26enum {
27 XFS_IO_INVALID, /* initial state */
27 XFS_IO_DELALLOC, /* covers delalloc region */ 28 XFS_IO_DELALLOC, /* covers delalloc region */
28 XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ 29 XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
29 XFS_IO_OVERWRITE, /* covers already allocated extent */ 30 XFS_IO_OVERWRITE, /* covers already allocated extent */
30}; 31};
31 32
32#define XFS_IO_TYPES \ 33#define XFS_IO_TYPES \
34 { XFS_IO_INVALID, "invalid" }, \
33 { XFS_IO_DELALLOC, "delalloc" }, \ 35 { XFS_IO_DELALLOC, "delalloc" }, \
34 { XFS_IO_UNWRITTEN, "unwritten" }, \ 36 { XFS_IO_UNWRITTEN, "unwritten" }, \
35 { XFS_IO_OVERWRITE, "overwrite" } 37 { XFS_IO_OVERWRITE, "overwrite" }
@@ -39,7 +41,7 @@ enum {
39 * It can manage several multi-page bio's at once. 41 * It can manage several multi-page bio's at once.
40 */ 42 */
41typedef struct xfs_ioend { 43typedef struct xfs_ioend {
42 struct xfs_ioend *io_list; /* next ioend in chain */ 44 struct list_head io_list; /* next ioend in chain */
43 unsigned int io_type; /* delalloc / unwritten */ 45 unsigned int io_type; /* delalloc / unwritten */
44 int io_error; /* I/O error code */ 46 int io_error; /* I/O error code */
45 atomic_t io_remaining; /* hold count */ 47 atomic_t io_remaining; /* hold count */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 0ef7c2ed3f8a..4fa14820e2e2 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -202,8 +202,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
202 sbp->namelen, 202 sbp->namelen,
203 sbp->valuelen, 203 sbp->valuelen,
204 &sbp->name[sbp->namelen]); 204 &sbp->name[sbp->namelen]);
205 if (error) 205 if (error) {
206 kmem_free(sbuf);
206 return error; 207 return error;
208 }
207 if (context->seen_enough) 209 if (context->seen_enough)
208 break; 210 break;
209 cursor->offset++; 211 cursor->offset++;
@@ -454,14 +456,13 @@ xfs_attr3_leaf_list_int(
454 args.rmtblkcnt = xfs_attr3_rmt_blocks( 456 args.rmtblkcnt = xfs_attr3_rmt_blocks(
455 args.dp->i_mount, valuelen); 457 args.dp->i_mount, valuelen);
456 retval = xfs_attr_rmtval_get(&args); 458 retval = xfs_attr_rmtval_get(&args);
457 if (retval) 459 if (!retval)
458 return retval; 460 retval = context->put_listent(context,
459 retval = context->put_listent(context, 461 entry->flags,
460 entry->flags, 462 name_rmt->name,
461 name_rmt->name, 463 (int)name_rmt->namelen,
462 (int)name_rmt->namelen, 464 valuelen,
463 valuelen, 465 args.value);
464 args.value);
465 kmem_free(args.value); 466 kmem_free(args.value);
466 } else { 467 } else {
467 retval = context->put_listent(context, 468 retval = context->put_listent(context,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 6c876012b2e5..a32c1dcae2ff 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -203,10 +203,12 @@ xfs_bmap_rtalloc(
203 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; 203 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
204 204
205 /* 205 /*
206 * Lock out other modifications to the RT bitmap inode. 206 * Lock out modifications to both the RT bitmap and summary inodes
207 */ 207 */
208 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); 208 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
209 xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); 209 xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
210 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
211 xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
210 212
211 /* 213 /*
212 * If it's an allocation to an empty file at offset 0, 214 * If it's an allocation to an empty file at offset 0,
@@ -822,7 +824,7 @@ bool
822xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) 824xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
823{ 825{
824 /* prealloc/delalloc exists only on regular files */ 826 /* prealloc/delalloc exists only on regular files */
825 if (!S_ISREG(ip->i_d.di_mode)) 827 if (!S_ISREG(VFS_I(ip)->i_mode))
826 return false; 828 return false;
827 829
828 /* 830 /*
@@ -1727,7 +1729,7 @@ xfs_swap_extents(
1727 xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); 1729 xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
1728 1730
1729 /* Verify that both files have the same format */ 1731 /* Verify that both files have the same format */
1730 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 1732 if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
1731 error = -EINVAL; 1733 error = -EINVAL;
1732 goto out_unlock; 1734 goto out_unlock;
1733 } 1735 }
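
The widened critical section above exists because the RT allocator consults the summary inode as well as the bitmap inode, so both must be locked, always in the same order, before the transaction joins them. A pthread stand-in for the ordering rule (xfs_ilock() itself is kernel-internal and not shown):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rbm_lock = PTHREAD_MUTEX_INITIALIZER;  /* rt bitmap */
static pthread_mutex_t rsum_lock = PTHREAD_MUTEX_INITIALIZER; /* rt summary */

int main(void)
{
	/* Fixed order: bitmap, then summary -- every path must agree. */
	pthread_mutex_lock(&rbm_lock);
	pthread_mutex_lock(&rsum_lock);
	printf("both RT inodes locked; safe to allocate\n");
	pthread_mutex_unlock(&rsum_lock);
	pthread_mutex_unlock(&rbm_lock);
	return 0;
}
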
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 435c7de42e5f..9a2191b91137 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -650,7 +650,7 @@ xfs_buf_read_map(
650 if (bp) { 650 if (bp) {
651 trace_xfs_buf_read(bp, flags, _RET_IP_); 651 trace_xfs_buf_read(bp, flags, _RET_IP_);
652 652
653 if (!XFS_BUF_ISDONE(bp)) { 653 if (!(bp->b_flags & XBF_DONE)) {
654 XFS_STATS_INC(target->bt_mount, xb_get_read); 654 XFS_STATS_INC(target->bt_mount, xb_get_read);
655 bp->b_ops = ops; 655 bp->b_ops = ops;
656 _xfs_buf_read(bp, flags); 656 _xfs_buf_read(bp, flags);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c75721acd867..4eb89bd4ee73 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -302,6 +302,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
302 302
303/* Buffer Utility Routines */ 303/* Buffer Utility Routines */
304extern void *xfs_buf_offset(struct xfs_buf *, size_t); 304extern void *xfs_buf_offset(struct xfs_buf *, size_t);
305extern void xfs_buf_stale(struct xfs_buf *bp);
305 306
306/* Delayed Write Buffer Routines */ 307/* Delayed Write Buffer Routines */
307extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); 308extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
@@ -312,31 +313,6 @@ extern int xfs_buf_delwri_submit_nowait(struct list_head *);
312extern int xfs_buf_init(void); 313extern int xfs_buf_init(void);
313extern void xfs_buf_terminate(void); 314extern void xfs_buf_terminate(void);
314 315
315#define XFS_BUF_ZEROFLAGS(bp) \
316 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
317 XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
318 XBF_WRITE_FAIL))
319
320void xfs_buf_stale(struct xfs_buf *bp);
321#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
322#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
323
324#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
325#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
326#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
327
328#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
329#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
330#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
331
332#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
333#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
334#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
335
336#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE)
337#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
338#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
339
340/* 316/*
341 * These macros use the IO block map rather than b_bn. b_bn is now really 317 * These macros use the IO block map rather than b_bn. b_bn is now really
342 * just for the buffer cache index for cached buffers. As IO does not use b_bn 318 * just for the buffer cache index for cached buffers. As IO does not use b_bn
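
With the wrapper macros removed, callers manipulate b_flags directly, as the xfs_buf_item.c hunks below show. A tiny self-contained demo of the open-coded style, using illustrative bit positions rather than the kernel's:

#include <assert.h>

#define XBF_DONE	(1 << 3)	/* illustrative values */
#define XBF_STALE	(1 << 6)

struct buf { unsigned int b_flags; };

int main(void)
{
	struct buf bp = { .b_flags = 0 };

	bp.b_flags |= XBF_DONE;			/* was XFS_BUF_DONE(bp) */
	assert(bp.b_flags & XBF_DONE);		/* was XFS_BUF_ISDONE(bp) */
	bp.b_flags &= ~XBF_DONE;		/* was XFS_BUF_UNDONE(bp) */
	assert(!(bp.b_flags & (XBF_DONE | XBF_STALE)));
	return 0;
}

The open-coded form says exactly which bit is set or cleared at each site, which is the point of dropping the macro layer.
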
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7e986da34f6c..99e91a0e554e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -431,7 +431,7 @@ xfs_buf_item_unpin(
431 if (freed && stale) { 431 if (freed && stale) {
432 ASSERT(bip->bli_flags & XFS_BLI_STALE); 432 ASSERT(bip->bli_flags & XFS_BLI_STALE);
433 ASSERT(xfs_buf_islocked(bp)); 433 ASSERT(xfs_buf_islocked(bp));
434 ASSERT(XFS_BUF_ISSTALE(bp)); 434 ASSERT(bp->b_flags & XBF_STALE);
435 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); 435 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
436 436
437 trace_xfs_buf_item_unpin_stale(bip); 437 trace_xfs_buf_item_unpin_stale(bip);
@@ -493,7 +493,7 @@ xfs_buf_item_unpin(
493 xfs_buf_hold(bp); 493 xfs_buf_hold(bp);
494 bp->b_flags |= XBF_ASYNC; 494 bp->b_flags |= XBF_ASYNC;
495 xfs_buf_ioerror(bp, -EIO); 495 xfs_buf_ioerror(bp, -EIO);
496 XFS_BUF_UNDONE(bp); 496 bp->b_flags &= ~XBF_DONE;
497 xfs_buf_stale(bp); 497 xfs_buf_stale(bp);
498 xfs_buf_ioend(bp); 498 xfs_buf_ioend(bp);
499 } 499 }
@@ -1067,7 +1067,7 @@ xfs_buf_iodone_callbacks(
1067 */ 1067 */
1068 if (XFS_FORCED_SHUTDOWN(mp)) { 1068 if (XFS_FORCED_SHUTDOWN(mp)) {
1069 xfs_buf_stale(bp); 1069 xfs_buf_stale(bp);
1070 XFS_BUF_DONE(bp); 1070 bp->b_flags |= XBF_DONE;
1071 trace_xfs_buf_item_iodone(bp, _RET_IP_); 1071 trace_xfs_buf_item_iodone(bp, _RET_IP_);
1072 goto do_callbacks; 1072 goto do_callbacks;
1073 } 1073 }
@@ -1090,7 +1090,7 @@ xfs_buf_iodone_callbacks(
1090 * errors tend to affect the whole device and a failing log write 1090 * errors tend to affect the whole device and a failing log write
1091 * will make us give up. But we really ought to do better here. 1091 * will make us give up. But we really ought to do better here.
1092 */ 1092 */
1093 if (XFS_BUF_ISASYNC(bp)) { 1093 if (bp->b_flags & XBF_ASYNC) {
1094 ASSERT(bp->b_iodone != NULL); 1094 ASSERT(bp->b_iodone != NULL);
1095 1095
1096 trace_xfs_buf_item_iodone_async(bp, _RET_IP_); 1096 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
@@ -1113,7 +1113,7 @@ xfs_buf_iodone_callbacks(
1113 * sure to return the error to the caller of xfs_bwrite(). 1113 * sure to return the error to the caller of xfs_bwrite().
1114 */ 1114 */
1115 xfs_buf_stale(bp); 1115 xfs_buf_stale(bp);
1116 XFS_BUF_DONE(bp); 1116 bp->b_flags |= XBF_DONE;
1117 1117
1118 trace_xfs_buf_error_relse(bp, _RET_IP_); 1118 trace_xfs_buf_error_relse(bp, _RET_IP_);
1119 1119
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 642d55d10075..93b3ab0c5435 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -665,7 +665,7 @@ xfs_readdir(
665 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 665 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
666 return -EIO; 666 return -EIO;
667 667
668 ASSERT(S_ISDIR(dp->i_d.di_mode)); 668 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
669 XFS_STATS_INC(dp->i_mount, xs_dir_getdents); 669 XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
670 670
671 args.dp = dp; 671 args.dp = dp;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index e85a9519a5ae..272c3f8b6f7d 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -227,7 +227,7 @@ xfs_discard_extents(
227 GFP_NOFS, 0); 227 GFP_NOFS, 0);
228 if (error && error != -EOPNOTSUPP) { 228 if (error && error != -EOPNOTSUPP) {
229 xfs_info(mp, 229 xfs_info(mp,
230 "discard failed for extent [0x%llu,%u], error %d", 230 "discard failed for extent [0x%llx,%u], error %d",
231 (unsigned long long)busyp->bno, 231 (unsigned long long)busyp->bno,
232 busyp->length, 232 busyp->length,
233 error); 233 error);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 9c44d38dcd1f..316b2a1bdba5 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -92,26 +92,28 @@ xfs_qm_adjust_dqlimits(
92{ 92{
93 struct xfs_quotainfo *q = mp->m_quotainfo; 93 struct xfs_quotainfo *q = mp->m_quotainfo;
94 struct xfs_disk_dquot *d = &dq->q_core; 94 struct xfs_disk_dquot *d = &dq->q_core;
95 struct xfs_def_quota *defq;
95 int prealloc = 0; 96 int prealloc = 0;
96 97
97 ASSERT(d->d_id); 98 ASSERT(d->d_id);
99 defq = xfs_get_defquota(dq, q);
98 100
99 if (q->qi_bsoftlimit && !d->d_blk_softlimit) { 101 if (defq->bsoftlimit && !d->d_blk_softlimit) {
100 d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); 102 d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit);
101 prealloc = 1; 103 prealloc = 1;
102 } 104 }
103 if (q->qi_bhardlimit && !d->d_blk_hardlimit) { 105 if (defq->bhardlimit && !d->d_blk_hardlimit) {
104 d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); 106 d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit);
105 prealloc = 1; 107 prealloc = 1;
106 } 108 }
107 if (q->qi_isoftlimit && !d->d_ino_softlimit) 109 if (defq->isoftlimit && !d->d_ino_softlimit)
108 d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); 110 d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit);
109 if (q->qi_ihardlimit && !d->d_ino_hardlimit) 111 if (defq->ihardlimit && !d->d_ino_hardlimit)
110 d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); 112 d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit);
111 if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) 113 if (defq->rtbsoftlimit && !d->d_rtb_softlimit)
112 d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); 114 d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit);
113 if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) 115 if (defq->rtbhardlimit && !d->d_rtb_hardlimit)
114 d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); 116 d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit);
115 117
116 if (prealloc) 118 if (prealloc)
117 xfs_dquot_set_prealloc_limits(dq); 119 xfs_dquot_set_prealloc_limits(dq);
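
The limit-defaulting hunk above swaps the single global qi_* limits for a per-quota-type xfs_def_quota looked up via xfs_get_defquota(). A minimal userspace mock of that lookup, with invented types and numbers:

#include <stdio.h>

enum qtype { Q_USER, Q_GROUP, Q_PROJ, Q_NTYPES };

struct def_quota { unsigned long long bsoftlimit, bhardlimit; };

static struct def_quota defaults[Q_NTYPES] = {
	[Q_USER]  = { 1000, 2000 },		/* invented numbers */
	[Q_GROUP] = { 5000, 8000 },
};

static struct def_quota *get_defquota(enum qtype type)
{
	return &defaults[type];	/* the real lookup keys off dq_flags */
}

int main(void)
{
	struct def_quota *dq = get_defquota(Q_GROUP);

	printf("group defaults: soft=%llu hard=%llu\n",
	       dq->bsoftlimit, dq->bhardlimit);
	return 0;
}
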
@@ -232,7 +234,8 @@ xfs_qm_init_dquot_blk(
232{ 234{
233 struct xfs_quotainfo *q = mp->m_quotainfo; 235 struct xfs_quotainfo *q = mp->m_quotainfo;
234 xfs_dqblk_t *d; 236 xfs_dqblk_t *d;
235 int curid, i; 237 xfs_dqid_t curid;
238 int i;
236 239
237 ASSERT(tp); 240 ASSERT(tp);
238 ASSERT(xfs_buf_islocked(bp)); 241 ASSERT(xfs_buf_islocked(bp));
@@ -243,7 +246,6 @@ xfs_qm_init_dquot_blk(
243 * ID of the first dquot in the block - id's are zero based. 246 * ID of the first dquot in the block - id's are zero based.
244 */ 247 */
245 curid = id - (id % q->qi_dqperchunk); 248 curid = id - (id % q->qi_dqperchunk);
246 ASSERT(curid >= 0);
247 memset(d, 0, BBTOB(q->qi_dqchunklen)); 249 memset(d, 0, BBTOB(q->qi_dqchunklen));
248 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { 250 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
249 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); 251 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
@@ -464,12 +466,13 @@ xfs_qm_dqtobp(
464 struct xfs_bmbt_irec map; 466 struct xfs_bmbt_irec map;
465 int nmaps = 1, error; 467 int nmaps = 1, error;
466 struct xfs_buf *bp; 468 struct xfs_buf *bp;
467 struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp); 469 struct xfs_inode *quotip;
468 struct xfs_mount *mp = dqp->q_mount; 470 struct xfs_mount *mp = dqp->q_mount;
469 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 471 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
470 struct xfs_trans *tp = (tpp ? *tpp : NULL); 472 struct xfs_trans *tp = (tpp ? *tpp : NULL);
471 uint lock_mode; 473 uint lock_mode;
472 474
475 quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags);
473 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; 476 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
474 477
475 lock_mode = xfs_ilock_data_map_shared(quotip); 478 lock_mode = xfs_ilock_data_map_shared(quotip);
@@ -685,6 +688,56 @@ error0:
685} 688}
686 689
687/* 690/*
 691 * Advance to the next id in the current chunk, or if at the
 692 * end of the chunk, skip ahead to the first id in the next allocated
 693 * chunk using the SEEK_DATA interface.
694 */
695int
696xfs_dq_get_next_id(
697 xfs_mount_t *mp,
698 uint type,
699 xfs_dqid_t *id,
700 loff_t eof)
701{
702 struct xfs_inode *quotip;
703 xfs_fsblock_t start;
704 loff_t offset;
705 uint lock;
706 xfs_dqid_t next_id;
707 int error = 0;
708
709 /* Simple advance */
710 next_id = *id + 1;
711
712 /* If new ID is within the current chunk, advancing it sufficed */
713 if (next_id % mp->m_quotainfo->qi_dqperchunk) {
714 *id = next_id;
715 return 0;
716 }
717
718 /* Nope, next_id is now past the current chunk, so find the next one */
719 start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk;
720
721 quotip = xfs_quota_inode(mp, type);
722 lock = xfs_ilock_data_map_shared(quotip);
723
724 offset = __xfs_seek_hole_data(VFS_I(quotip), XFS_FSB_TO_B(mp, start),
725 eof, SEEK_DATA);
726 if (offset < 0)
727 error = offset;
728
729 xfs_iunlock(quotip, lock);
730
731 /* -ENXIO is essentially "no more data" */
732 if (error)
 733 return (error == -ENXIO ? -ENOENT : error);
734
735 /* Convert next data offset back to a quota id */
736 *id = XFS_B_TO_FSB(mp, offset) * mp->m_quotainfo->qi_dqperchunk;
737 return 0;
738}
739
740/*
 688 * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return 741 * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return
 689 * a locked dquot, doing an allocation (if requested) as needed. 742 * a locked dquot, doing an allocation (if requested) as needed.
690 * When both an inode and an id are given, the inode's id takes precedence. 743 * When both an inode and an id are given, the inode's id takes precedence.
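
The arithmetic in xfs_dq_get_next_id() relies on qi_dqperchunk dquots packing into each filesystem block: an id that does not land on a chunk boundary stays in the current block, anything else maps to a byte offset that SEEK_DATA can skip forward from, and the found offset maps back to an id. A runnable sketch of that mapping, with invented sizes:

#include <stdio.h>

#define DQPERCHUNK	30	/* dquots per FS block; real value computed */
#define FSB_SIZE	4096

int main(void)
{
	unsigned int id = 59;		/* last id of chunk 1 */
	unsigned int next_id = id + 1;

	if (next_id % DQPERCHUNK) {
		printf("still in chunk %u\n", next_id / DQPERCHUNK);
	} else {
		/* Would SEEK_DATA from this byte offset in the quota
		 * file, then map the found offset back to an id. */
		long long start = (long long)(next_id / DQPERCHUNK) * FSB_SIZE;
		long long found = start + 2LL * FSB_SIZE; /* pretend hole */
		unsigned int new_id = (unsigned int)(found / FSB_SIZE)
					* DQPERCHUNK;

		printf("next allocated chunk starts at id %u\n", new_id);
	}
	return 0;
}
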
@@ -704,6 +757,7 @@ xfs_qm_dqget(
704 struct xfs_quotainfo *qi = mp->m_quotainfo; 757 struct xfs_quotainfo *qi = mp->m_quotainfo;
705 struct radix_tree_root *tree = xfs_dquot_tree(qi, type); 758 struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
706 struct xfs_dquot *dqp; 759 struct xfs_dquot *dqp;
760 loff_t eof = 0;
707 int error; 761 int error;
708 762
709 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 763 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -731,6 +785,21 @@ xfs_qm_dqget(
731 } 785 }
732#endif 786#endif
733 787
788 /* Get the end of the quota file if we need it */
789 if (flags & XFS_QMOPT_DQNEXT) {
790 struct xfs_inode *quotip;
791 xfs_fileoff_t last;
792 uint lock_mode;
793
794 quotip = xfs_quota_inode(mp, type);
795 lock_mode = xfs_ilock_data_map_shared(quotip);
796 error = xfs_bmap_last_offset(quotip, &last, XFS_DATA_FORK);
797 xfs_iunlock(quotip, lock_mode);
798 if (error)
799 return error;
800 eof = XFS_FSB_TO_B(mp, last);
801 }
802
734restart: 803restart:
735 mutex_lock(&qi->qi_tree_lock); 804 mutex_lock(&qi->qi_tree_lock);
736 dqp = radix_tree_lookup(tree, id); 805 dqp = radix_tree_lookup(tree, id);
@@ -744,6 +813,18 @@ restart:
744 goto restart; 813 goto restart;
745 } 814 }
746 815
816 /* uninit / unused quota found in radix tree, keep looking */
817 if (flags & XFS_QMOPT_DQNEXT) {
818 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
819 xfs_dqunlock(dqp);
820 mutex_unlock(&qi->qi_tree_lock);
821 error = xfs_dq_get_next_id(mp, type, &id, eof);
822 if (error)
823 return error;
824 goto restart;
825 }
826 }
827
747 dqp->q_nrefs++; 828 dqp->q_nrefs++;
748 mutex_unlock(&qi->qi_tree_lock); 829 mutex_unlock(&qi->qi_tree_lock);
749 830
@@ -770,6 +851,13 @@ restart:
770 if (ip) 851 if (ip)
771 xfs_ilock(ip, XFS_ILOCK_EXCL); 852 xfs_ilock(ip, XFS_ILOCK_EXCL);
772 853
854 /* If we are asked to find next active id, keep looking */
855 if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) {
856 error = xfs_dq_get_next_id(mp, type, &id, eof);
857 if (!error)
858 goto restart;
859 }
860
773 if (error) 861 if (error)
774 return error; 862 return error;
775 863
@@ -820,6 +908,17 @@ restart:
820 qi->qi_dquots++; 908 qi->qi_dquots++;
821 mutex_unlock(&qi->qi_tree_lock); 909 mutex_unlock(&qi->qi_tree_lock);
822 910
911 /* If we are asked to find next active id, keep looking */
912 if (flags & XFS_QMOPT_DQNEXT) {
913 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
914 xfs_qm_dqput(dqp);
915 error = xfs_dq_get_next_id(mp, type, &id, eof);
916 if (error)
917 return error;
918 goto restart;
919 }
920 }
921
823 dqret: 922 dqret:
824 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 923 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
825 trace_xfs_dqget_miss(dqp); 924 trace_xfs_dqget_miss(dqp);
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 652cd3c5b58c..a1b2dd828b9d 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -152,7 +152,7 @@ xfs_nfs_get_inode(
152 return ERR_PTR(error); 152 return ERR_PTR(error);
153 } 153 }
154 154
155 if (ip->i_d.di_gen != generation) { 155 if (VFS_I(ip)->i_generation != generation) {
156 IRELE(ip); 156 IRELE(ip);
157 return ERR_PTR(-ESTALE); 157 return ERR_PTR(-ESTALE);
158 } 158 }
@@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = {
246 .fh_to_parent = xfs_fs_fh_to_parent, 246 .fh_to_parent = xfs_fs_fh_to_parent,
247 .get_parent = xfs_fs_get_parent, 247 .get_parent = xfs_fs_get_parent,
248 .commit_metadata = xfs_fs_nfs_commit_metadata, 248 .commit_metadata = xfs_fs_nfs_commit_metadata,
249#ifdef CONFIG_NFSD_PNFS 249#ifdef CONFIG_NFSD_BLOCKLAYOUT
250 .get_uuid = xfs_fs_get_uuid, 250 .get_uuid = xfs_fs_get_uuid,
251 .map_blocks = xfs_fs_map_blocks, 251 .map_blocks = xfs_fs_map_blocks,
252 .commit_blocks = xfs_fs_commit_blocks, 252 .commit_blocks = xfs_fs_commit_blocks,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 52883ac3cf84..ac0fd32de31e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -156,9 +156,9 @@ xfs_update_prealloc_flags(
156 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 156 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
157 157
158 if (!(flags & XFS_PREALLOC_INVISIBLE)) { 158 if (!(flags & XFS_PREALLOC_INVISIBLE)) {
159 ip->i_d.di_mode &= ~S_ISUID; 159 VFS_I(ip)->i_mode &= ~S_ISUID;
160 if (ip->i_d.di_mode & S_IXGRP) 160 if (VFS_I(ip)->i_mode & S_IXGRP)
161 ip->i_d.di_mode &= ~S_ISGID; 161 VFS_I(ip)->i_mode &= ~S_ISGID;
162 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 162 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
163 } 163 }
164 164
@@ -1337,31 +1337,31 @@ out:
1337 return found; 1337 return found;
1338} 1338}
1339 1339
1340STATIC loff_t 1340/*
1341xfs_seek_hole_data( 1341 * The caller must lock the inode with xfs_ilock_data_map_shared();
1342 struct file *file, 1342 * can we craft an appropriate ASSERT?
1343 *
1344 * @end exists because the VFS-level lseek interface is defined such that any
1345 * offset past i_size shall return -ENXIO, but we use this for quota code
1346 * which does not maintain i_size, and we want to SEEK_DATA past i_size.
1347 */
1348loff_t
1349__xfs_seek_hole_data(
1350 struct inode *inode,
1343 loff_t start, 1351 loff_t start,
1352 loff_t end,
1344 int whence) 1353 int whence)
1345{ 1354{
1346 struct inode *inode = file->f_mapping->host;
1347 struct xfs_inode *ip = XFS_I(inode); 1355 struct xfs_inode *ip = XFS_I(inode);
1348 struct xfs_mount *mp = ip->i_mount; 1356 struct xfs_mount *mp = ip->i_mount;
1349 loff_t uninitialized_var(offset); 1357 loff_t uninitialized_var(offset);
1350 xfs_fsize_t isize;
1351 xfs_fileoff_t fsbno; 1358 xfs_fileoff_t fsbno;
1352 xfs_filblks_t end; 1359 xfs_filblks_t lastbno;
1353 uint lock;
1354 int error; 1360 int error;
1355 1361
1356 if (XFS_FORCED_SHUTDOWN(mp)) 1362 if (start >= end) {
1357 return -EIO;
1358
1359 lock = xfs_ilock_data_map_shared(ip);
1360
1361 isize = i_size_read(inode);
1362 if (start >= isize) {
1363 error = -ENXIO; 1363 error = -ENXIO;
1364 goto out_unlock; 1364 goto out_error;
1365 } 1365 }
1366 1366
1367 /* 1367 /*
@@ -1369,22 +1369,22 @@ xfs_seek_hole_data(
1369 * by fsbno to the end block of the file. 1369 * by fsbno to the end block of the file.
1370 */ 1370 */
1371 fsbno = XFS_B_TO_FSBT(mp, start); 1371 fsbno = XFS_B_TO_FSBT(mp, start);
1372 end = XFS_B_TO_FSB(mp, isize); 1372 lastbno = XFS_B_TO_FSB(mp, end);
1373 1373
1374 for (;;) { 1374 for (;;) {
1375 struct xfs_bmbt_irec map[2]; 1375 struct xfs_bmbt_irec map[2];
1376 int nmap = 2; 1376 int nmap = 2;
1377 unsigned int i; 1377 unsigned int i;
1378 1378
1379 error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, 1379 error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap,
1380 XFS_BMAPI_ENTIRE); 1380 XFS_BMAPI_ENTIRE);
1381 if (error) 1381 if (error)
1382 goto out_unlock; 1382 goto out_error;
1383 1383
1384 /* No extents at given offset, must be beyond EOF */ 1384 /* No extents at given offset, must be beyond EOF */
1385 if (nmap == 0) { 1385 if (nmap == 0) {
1386 error = -ENXIO; 1386 error = -ENXIO;
1387 goto out_unlock; 1387 goto out_error;
1388 } 1388 }
1389 1389
1390 for (i = 0; i < nmap; i++) { 1390 for (i = 0; i < nmap; i++) {
@@ -1426,7 +1426,7 @@ xfs_seek_hole_data(
1426 * hole at the end of any file). 1426 * hole at the end of any file).
1427 */ 1427 */
1428 if (whence == SEEK_HOLE) { 1428 if (whence == SEEK_HOLE) {
1429 offset = isize; 1429 offset = end;
1430 break; 1430 break;
1431 } 1431 }
1432 /* 1432 /*
@@ -1434,7 +1434,7 @@ xfs_seek_hole_data(
1434 */ 1434 */
1435 ASSERT(whence == SEEK_DATA); 1435 ASSERT(whence == SEEK_DATA);
1436 error = -ENXIO; 1436 error = -ENXIO;
1437 goto out_unlock; 1437 goto out_error;
1438 } 1438 }
1439 1439
1440 ASSERT(i > 1); 1440 ASSERT(i > 1);
@@ -1445,14 +1445,14 @@ xfs_seek_hole_data(
1445 */ 1445 */
1446 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; 1446 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
1447 start = XFS_FSB_TO_B(mp, fsbno); 1447 start = XFS_FSB_TO_B(mp, fsbno);
1448 if (start >= isize) { 1448 if (start >= end) {
1449 if (whence == SEEK_HOLE) { 1449 if (whence == SEEK_HOLE) {
1450 offset = isize; 1450 offset = end;
1451 break; 1451 break;
1452 } 1452 }
1453 ASSERT(whence == SEEK_DATA); 1453 ASSERT(whence == SEEK_DATA);
1454 error = -ENXIO; 1454 error = -ENXIO;
1455 goto out_unlock; 1455 goto out_error;
1456 } 1456 }
1457 } 1457 }
1458 1458
@@ -1464,7 +1464,39 @@ out:
1464 * situation in particular. 1464 * situation in particular.
1465 */ 1465 */
1466 if (whence == SEEK_HOLE) 1466 if (whence == SEEK_HOLE)
1467 offset = min_t(loff_t, offset, isize); 1467 offset = min_t(loff_t, offset, end);
1468
1469 return offset;
1470
1471out_error:
1472 return error;
1473}
1474
1475STATIC loff_t
1476xfs_seek_hole_data(
1477 struct file *file,
1478 loff_t start,
1479 int whence)
1480{
1481 struct inode *inode = file->f_mapping->host;
1482 struct xfs_inode *ip = XFS_I(inode);
1483 struct xfs_mount *mp = ip->i_mount;
1484 uint lock;
1485 loff_t offset, end;
1486 int error = 0;
1487
1488 if (XFS_FORCED_SHUTDOWN(mp))
1489 return -EIO;
1490
1491 lock = xfs_ilock_data_map_shared(ip);
1492
1493 end = i_size_read(inode);
1494 offset = __xfs_seek_hole_data(inode, start, end, whence);
1495 if (offset < 0) {
1496 error = offset;
1497 goto out_unlock;
1498 }
1499
1468 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 1500 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1469 1501
1470out_unlock: 1502out_unlock:
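
__xfs_seek_hole_data() now takes an explicit end so the quota code can probe past i_size, while the file-level wrapper keeps the VFS rule that offsets at or beyond i_size yield -ENXIO. The same contract is visible from userspace; this demo assumes a filesystem that implements SEEK_DATA/SEEK_HOLE (lseek returns -1/ENXIO otherwise or past EOF):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY);
	off_t data, hole;

	if (fd < 0)
		return 1;
	data = lseek(fd, 0, SEEK_DATA);		/* first data at/after 0 */
	hole = lseek(fd, 0, SEEK_HOLE);		/* EOF counts as a hole */
	printf("data at %lld, hole at %lld\n",
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}
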
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index c4c130f9bfb6..a51353a1f87f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -151,7 +151,7 @@ xfs_filestream_pick_ag(
151 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 151 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
152 int err, trylock, nscan; 152 int err, trylock, nscan;
153 153
154 ASSERT(S_ISDIR(ip->i_d.di_mode)); 154 ASSERT(S_ISDIR(VFS_I(ip)->i_mode));
155 155
156 /* 2% of an AG's blocks must be free for it to be chosen. */ 156 /* 2% of an AG's blocks must be free for it to be chosen. */
157 minfree = mp->m_sb.sb_agblocks / 50; 157 minfree = mp->m_sb.sb_agblocks / 50;
@@ -319,7 +319,7 @@ xfs_filestream_lookup_ag(
319 xfs_agnumber_t startag, ag = NULLAGNUMBER; 319 xfs_agnumber_t startag, ag = NULLAGNUMBER;
320 struct xfs_mru_cache_elem *mru; 320 struct xfs_mru_cache_elem *mru;
321 321
322 ASSERT(S_ISREG(ip->i_d.di_mode)); 322 ASSERT(S_ISREG(VFS_I(ip)->i_mode));
323 323
324 pip = xfs_filestream_get_parent(ip); 324 pip = xfs_filestream_get_parent(ip);
325 if (!pip) 325 if (!pip)
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 1b6a98b66886..f32713f14f9a 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,5 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 28
30#endif /* __XFS_FSOPS_H__ */ 29#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index d7a490f24ead..bf2d60749278 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -63,6 +63,9 @@ xfs_inode_alloc(
63 return NULL; 63 return NULL;
64 } 64 }
65 65
66 /* VFS doesn't initialise i_mode! */
67 VFS_I(ip)->i_mode = 0;
68
66 XFS_STATS_INC(mp, vn_active); 69 XFS_STATS_INC(mp, vn_active);
67 ASSERT(atomic_read(&ip->i_pincount) == 0); 70 ASSERT(atomic_read(&ip->i_pincount) == 0);
68 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 71 ASSERT(!spin_is_locked(&ip->i_flags_lock));
@@ -79,7 +82,7 @@ xfs_inode_alloc(
79 memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); 82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
80 ip->i_flags = 0; 83 ip->i_flags = 0;
81 ip->i_delayed_blks = 0; 84 ip->i_delayed_blks = 0;
82 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 85 memset(&ip->i_d, 0, sizeof(ip->i_d));
83 86
84 return ip; 87 return ip;
85} 88}
@@ -98,7 +101,7 @@ void
98xfs_inode_free( 101xfs_inode_free(
99 struct xfs_inode *ip) 102 struct xfs_inode *ip)
100{ 103{
101 switch (ip->i_d.di_mode & S_IFMT) { 104 switch (VFS_I(ip)->i_mode & S_IFMT) {
102 case S_IFREG: 105 case S_IFREG:
103 case S_IFDIR: 106 case S_IFDIR:
104 case S_IFLNK: 107 case S_IFLNK:
@@ -135,6 +138,34 @@ xfs_inode_free(
135} 138}
136 139
137/* 140/*
141 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 142 * part of the structure. This is made more complex by the fact that we store
143 * information about the on-disk values in the VFS inode and so we can't just
144 * overwrite the values unconditionally. Hence we save the parameters we
145 * need to retain across reinitialisation, and rewrite them into the VFS inode
146 * after reinitialisation even if it fails.
147 */
148static int
149xfs_reinit_inode(
150 struct xfs_mount *mp,
151 struct inode *inode)
152{
153 int error;
154 uint32_t nlink = inode->i_nlink;
155 uint32_t generation = inode->i_generation;
156 uint64_t version = inode->i_version;
157 umode_t mode = inode->i_mode;
158
159 error = inode_init_always(mp->m_super, inode);
160
161 set_nlink(inode, nlink);
162 inode->i_generation = generation;
163 inode->i_version = version;
164 inode->i_mode = mode;
165 return error;
166}
167
168/*
 138 * Check the validity of the inode we just found in the cache 169 * Check the validity of the inode we just found in the cache
139 */ 170 */
140static int 171static int
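
xfs_reinit_inode() captures the pattern its comment describes: snapshot the fields that mirror on-disk state, run inode_init_always(), then write them back whether or not it failed. A self-contained mock of the same save/reinit/restore shape, with stand-in types:

#include <stdio.h>

struct mock_inode { unsigned int nlink, generation, mode; };

/* Stand-in for inode_init_always(): wipes everything. */
static int init_always(struct mock_inode *inode)
{
	inode->nlink = 1;
	inode->generation = 0;
	inode->mode = 0;
	return 0;
}

static int reinit_inode(struct mock_inode *inode)
{
	unsigned int nlink = inode->nlink;	/* save on-disk mirrors */
	unsigned int generation = inode->generation;
	unsigned int mode = inode->mode;
	int error = init_always(inode);

	inode->nlink = nlink;			/* restore even on error */
	inode->generation = generation;
	inode->mode = mode;
	return error;
}

int main(void)
{
	struct mock_inode ino = { .nlink = 3, .generation = 7,
				  .mode = 0100644 };

	reinit_inode(&ino);
	printf("nlink=%u gen=%u mode=%o\n", ino.nlink, ino.generation,
	       ino.mode);
	return 0;
}
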
@@ -185,7 +216,7 @@ xfs_iget_cache_hit(
185 /* 216 /*
186 * If lookup is racing with unlink return an error immediately. 217 * If lookup is racing with unlink return an error immediately.
187 */ 218 */
188 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { 219 if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
189 error = -ENOENT; 220 error = -ENOENT;
190 goto out_error; 221 goto out_error;
191 } 222 }
@@ -208,7 +239,7 @@ xfs_iget_cache_hit(
208 spin_unlock(&ip->i_flags_lock); 239 spin_unlock(&ip->i_flags_lock);
209 rcu_read_unlock(); 240 rcu_read_unlock();
210 241
211 error = inode_init_always(mp->m_super, inode); 242 error = xfs_reinit_inode(mp, inode);
212 if (error) { 243 if (error) {
213 /* 244 /*
214 * Re-initializing the inode failed, and we are in deep 245 * Re-initializing the inode failed, and we are in deep
@@ -295,7 +326,7 @@ xfs_iget_cache_miss(
295 326
296 trace_xfs_iget_miss(ip); 327 trace_xfs_iget_miss(ip);
297 328
298 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 329 if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) {
299 error = -ENOENT; 330 error = -ENOENT;
300 goto out_destroy; 331 goto out_destroy;
301 } 332 }
@@ -444,7 +475,7 @@ again:
444 * If we have a real type for an on-disk inode, we can setup the inode 475 * If we have a real type for an on-disk inode, we can setup the inode
445 * now. If it's a new inode being created, xfs_ialloc will handle it. 476 * now. If it's a new inode being created, xfs_ialloc will handle it.
446 */ 477 */
447 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) 478 if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
448 xfs_setup_existing_inode(ip); 479 xfs_setup_existing_inode(ip);
449 return 0; 480 return 0;
450 481
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ceba1a83cacc..96f606deee31 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -57,9 +57,9 @@ kmem_zone_t *xfs_inode_zone;
57 */ 57 */
58#define XFS_ITRUNC_MAX_EXTENTS 2 58#define XFS_ITRUNC_MAX_EXTENTS 2
59 59
60STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 60STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
61 61STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
62STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); 62STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
63 63
64/* 64/*
65 * helper function to extract extent size hint from inode 65 * helper function to extract extent size hint from inode
@@ -766,6 +766,7 @@ xfs_ialloc(
766 uint flags; 766 uint flags;
767 int error; 767 int error;
768 struct timespec tv; 768 struct timespec tv;
769 struct inode *inode;
769 770
770 /* 771 /*
771 * Call the space management code to pick 772 * Call the space management code to pick
@@ -791,6 +792,7 @@ xfs_ialloc(
791 if (error) 792 if (error)
792 return error; 793 return error;
793 ASSERT(ip != NULL); 794 ASSERT(ip != NULL);
795 inode = VFS_I(ip);
794 796
795 /* 797 /*
796 * We always convert v1 inodes to v2 now - we only support filesystems 798 * We always convert v1 inodes to v2 now - we only support filesystems
@@ -800,20 +802,16 @@ xfs_ialloc(
800 if (ip->i_d.di_version == 1) 802 if (ip->i_d.di_version == 1)
801 ip->i_d.di_version = 2; 803 ip->i_d.di_version = 2;
802 804
803 ip->i_d.di_mode = mode; 805 inode->i_mode = mode;
804 ip->i_d.di_onlink = 0; 806 set_nlink(inode, nlink);
805 ip->i_d.di_nlink = nlink;
806 ASSERT(ip->i_d.di_nlink == nlink);
807 ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); 807 ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
808 ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); 808 ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
809 xfs_set_projid(ip, prid); 809 xfs_set_projid(ip, prid);
810 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
811 810
812 if (pip && XFS_INHERIT_GID(pip)) { 811 if (pip && XFS_INHERIT_GID(pip)) {
813 ip->i_d.di_gid = pip->i_d.di_gid; 812 ip->i_d.di_gid = pip->i_d.di_gid;
814 if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { 813 if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
815 ip->i_d.di_mode |= S_ISGID; 814 inode->i_mode |= S_ISGID;
816 }
817 } 815 }
818 816
819 /* 817 /*
@@ -822,38 +820,29 @@ xfs_ialloc(
822 * (and only if the irix_sgid_inherit compatibility variable is set). 820 * (and only if the irix_sgid_inherit compatibility variable is set).
823 */ 821 */
824 if ((irix_sgid_inherit) && 822 if ((irix_sgid_inherit) &&
825 (ip->i_d.di_mode & S_ISGID) && 823 (inode->i_mode & S_ISGID) &&
826 (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) { 824 (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
827 ip->i_d.di_mode &= ~S_ISGID; 825 inode->i_mode &= ~S_ISGID;
828 }
829 826
830 ip->i_d.di_size = 0; 827 ip->i_d.di_size = 0;
831 ip->i_d.di_nextents = 0; 828 ip->i_d.di_nextents = 0;
832 ASSERT(ip->i_d.di_nblocks == 0); 829 ASSERT(ip->i_d.di_nblocks == 0);
833 830
834 tv = current_fs_time(mp->m_super); 831 tv = current_fs_time(mp->m_super);
835 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; 832 inode->i_mtime = tv;
836 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; 833 inode->i_atime = tv;
837 ip->i_d.di_atime = ip->i_d.di_mtime; 834 inode->i_ctime = tv;
838 ip->i_d.di_ctime = ip->i_d.di_mtime;
839 835
840 /*
841 * di_gen will have been taken care of in xfs_iread.
842 */
843 ip->i_d.di_extsize = 0; 836 ip->i_d.di_extsize = 0;
844 ip->i_d.di_dmevmask = 0; 837 ip->i_d.di_dmevmask = 0;
845 ip->i_d.di_dmstate = 0; 838 ip->i_d.di_dmstate = 0;
846 ip->i_d.di_flags = 0; 839 ip->i_d.di_flags = 0;
847 840
848 if (ip->i_d.di_version == 3) { 841 if (ip->i_d.di_version == 3) {
849 ASSERT(ip->i_d.di_ino == ino); 842 inode->i_version = 1;
850 ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid));
851 ip->i_d.di_crc = 0;
852 ip->i_d.di_changecount = 1;
853 ip->i_d.di_lsn = 0;
854 ip->i_d.di_flags2 = 0; 843 ip->i_d.di_flags2 = 0;
855 memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); 844 ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec;
856 ip->i_d.di_crtime = ip->i_d.di_mtime; 845 ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec;
857 } 846 }
858 847
859 848
@@ -1092,35 +1081,24 @@ xfs_dir_ialloc(
1092} 1081}
1093 1082
1094/* 1083/*
1095 * Decrement the link count on an inode & log the change. 1084 * Decrement the link count on an inode & log the change. If this causes the
1096 * If this causes the link count to go to zero, initiate the 1085 * link count to go to zero, move the inode to AGI unlinked list so that it can
1097 * logging activity required to truncate a file. 1086 * be freed when the last active reference goes away via xfs_inactive().
1098 */ 1087 */
1099int /* error */ 1088int /* error */
1100xfs_droplink( 1089xfs_droplink(
1101 xfs_trans_t *tp, 1090 xfs_trans_t *tp,
1102 xfs_inode_t *ip) 1091 xfs_inode_t *ip)
1103{ 1092{
1104 int error;
1105
1106 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 1093 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1107 1094
1108 ASSERT (ip->i_d.di_nlink > 0);
1109 ip->i_d.di_nlink--;
1110 drop_nlink(VFS_I(ip)); 1095 drop_nlink(VFS_I(ip));
1111 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1096 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1112 1097
1113 error = 0; 1098 if (VFS_I(ip)->i_nlink)
1114 if (ip->i_d.di_nlink == 0) { 1099 return 0;
1115 /* 1100
1116 * We're dropping the last link to this file. 1101 return xfs_iunlink(tp, ip);
1117 * Move the on-disk inode to the AGI unlinked list.
1118 * From xfs_inactive() we will pull the inode from
1119 * the list and free it.
1120 */
1121 error = xfs_iunlink(tp, ip);
1122 }
1123 return error;
1124} 1102}
1125 1103
1126/* 1104/*
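
After the rewrite, xfs_droplink() touches only the VFS link count and moves the inode to the AGI unlinked list exactly when that count reaches zero. The control flow, reduced to a runnable mock with invented helpers:

#include <stdio.h>

struct mock_inode { unsigned int nlink; };

static int iunlink(struct mock_inode *ip)
{
	(void)ip;
	printf("nlink hit 0: move inode to AGI unlinked list\n");
	return 0;
}

static int droplink(struct mock_inode *ip)
{
	ip->nlink--;			/* only the VFS counter now */
	if (ip->nlink)
		return 0;		/* still named by a dir entry */
	return iunlink(ip);		/* park until last ref drops */
}

int main(void)
{
	struct mock_inode ino = { .nlink = 1 };

	return droplink(&ino);
}
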
@@ -1134,8 +1112,6 @@ xfs_bumplink(
1134 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 1112 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1135 1113
1136 ASSERT(ip->i_d.di_version > 1); 1114 ASSERT(ip->i_d.di_version > 1);
1137 ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
1138 ip->i_d.di_nlink++;
1139 inc_nlink(VFS_I(ip)); 1115 inc_nlink(VFS_I(ip));
1140 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1116 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1141 return 0; 1117 return 0;
@@ -1393,7 +1369,6 @@ xfs_create_tmpfile(
1393 */ 1369 */
1394 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); 1370 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1395 1371
1396 ip->i_d.di_nlink--;
1397 error = xfs_iunlink(tp, ip); 1372 error = xfs_iunlink(tp, ip);
1398 if (error) 1373 if (error)
1399 goto out_trans_cancel; 1374 goto out_trans_cancel;
@@ -1444,7 +1419,7 @@ xfs_link(
1444 1419
1445 trace_xfs_link(tdp, target_name); 1420 trace_xfs_link(tdp, target_name);
1446 1421
1447 ASSERT(!S_ISDIR(sip->i_d.di_mode)); 1422 ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1448 1423
1449 if (XFS_FORCED_SHUTDOWN(mp)) 1424 if (XFS_FORCED_SHUTDOWN(mp))
1450 return -EIO; 1425 return -EIO;
@@ -1492,7 +1467,10 @@ xfs_link(
1492 1467
1493 xfs_bmap_init(&free_list, &first_block); 1468 xfs_bmap_init(&free_list, &first_block);
1494 1469
1495 if (sip->i_d.di_nlink == 0) { 1470 /*
1471 * Handle initial link state of O_TMPFILE inode
1472 */
1473 if (VFS_I(sip)->i_nlink == 0) {
1496 error = xfs_iunlink_remove(tp, sip); 1474 error = xfs_iunlink_remove(tp, sip);
1497 if (error) 1475 if (error)
1498 goto error_return; 1476 goto error_return;
@@ -1648,7 +1626,7 @@ xfs_release(
1648 xfs_mount_t *mp = ip->i_mount; 1626 xfs_mount_t *mp = ip->i_mount;
1649 int error; 1627 int error;
1650 1628
1651 if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) 1629 if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1652 return 0; 1630 return 0;
1653 1631
1654 /* If this is a read-only mount, don't do this (would generate I/O) */ 1632 /* If this is a read-only mount, don't do this (would generate I/O) */
@@ -1679,7 +1657,7 @@ xfs_release(
1679 } 1657 }
1680 } 1658 }
1681 1659
1682 if (ip->i_d.di_nlink == 0) 1660 if (VFS_I(ip)->i_nlink == 0)
1683 return 0; 1661 return 0;
1684 1662
1685 if (xfs_can_free_eofblocks(ip, false)) { 1663 if (xfs_can_free_eofblocks(ip, false)) {
@@ -1883,7 +1861,7 @@ xfs_inactive(
1883 * If the inode is already free, then there can be nothing 1861 * If the inode is already free, then there can be nothing
1884 * to clean up here. 1862 * to clean up here.
1885 */ 1863 */
1886 if (ip->i_d.di_mode == 0) { 1864 if (VFS_I(ip)->i_mode == 0) {
1887 ASSERT(ip->i_df.if_real_bytes == 0); 1865 ASSERT(ip->i_df.if_real_bytes == 0);
1888 ASSERT(ip->i_df.if_broot_bytes == 0); 1866 ASSERT(ip->i_df.if_broot_bytes == 0);
1889 return; 1867 return;
@@ -1895,7 +1873,7 @@ xfs_inactive(
1895 if (mp->m_flags & XFS_MOUNT_RDONLY) 1873 if (mp->m_flags & XFS_MOUNT_RDONLY)
1896 return; 1874 return;
1897 1875
1898 if (ip->i_d.di_nlink != 0) { 1876 if (VFS_I(ip)->i_nlink != 0) {
1899 /* 1877 /*
1900 * force is true because we are evicting an inode from the 1878 * force is true because we are evicting an inode from the
1901 * cache. Post-eof blocks must be freed, lest we end up with 1879 * cache. Post-eof blocks must be freed, lest we end up with
@@ -1907,7 +1885,7 @@ xfs_inactive(
1907 return; 1885 return;
1908 } 1886 }
1909 1887
1910 if (S_ISREG(ip->i_d.di_mode) && 1888 if (S_ISREG(VFS_I(ip)->i_mode) &&
1911 (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || 1889 (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1912 ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) 1890 ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1913 truncate = 1; 1891 truncate = 1;
@@ -1916,7 +1894,7 @@ xfs_inactive(
1916 if (error) 1894 if (error)
1917 return; 1895 return;
1918 1896
1919 if (S_ISLNK(ip->i_d.di_mode)) 1897 if (S_ISLNK(VFS_I(ip)->i_mode))
1920 error = xfs_inactive_symlink(ip); 1898 error = xfs_inactive_symlink(ip);
1921 else if (truncate) 1899 else if (truncate)
1922 error = xfs_inactive_truncate(ip); 1900 error = xfs_inactive_truncate(ip);
@@ -1952,16 +1930,21 @@ xfs_inactive(
1952} 1930}
1953 1931
1954/* 1932/*
1955 * This is called when the inode's link count goes to 0. 1933 * This is called when the inode's link count goes to 0 or we are creating a
 1956 * We place the on-disk inode on a list in the AGI. It 1934 * tmpfile via O_TMPFILE. In the tmpfile case the link count is dropped to
 1957 * will be pulled from this list when the inode is freed. 1935 * zero by the VFS after we've created the file successfully, so we have to
 1936 * add the inode to the unlinked list while the link count is still
 1937 * non-zero.
1938 *
1939 * We place the on-disk inode on a list in the AGI. It will be pulled from this
1940 * list when the inode is freed.
1958 */ 1941 */
1959int 1942STATIC int
1960xfs_iunlink( 1943xfs_iunlink(
1961 xfs_trans_t *tp, 1944 struct xfs_trans *tp,
1962 xfs_inode_t *ip) 1945 struct xfs_inode *ip)
1963{ 1946{
1964 xfs_mount_t *mp; 1947 xfs_mount_t *mp = tp->t_mountp;
1965 xfs_agi_t *agi; 1948 xfs_agi_t *agi;
1966 xfs_dinode_t *dip; 1949 xfs_dinode_t *dip;
1967 xfs_buf_t *agibp; 1950 xfs_buf_t *agibp;
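
The updated comment describes the O_TMPFILE ordering problem: the VFS drops the link count to zero only after creation succeeds, so XFS parks the inode on the unlinked list up front. The userspace-visible side of that lifecycle:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("O_TMPFILE");
		return 1;
	}
	/* The inode exists with i_nlink == 0; unless linkat() gives it a
	 * name, it is freed when the last descriptor is closed. */
	(void)write(fd, "scratch", 7);
	close(fd);
	return 0;
}
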
@@ -1971,10 +1954,7 @@ xfs_iunlink(
1971 int offset; 1954 int offset;
1972 int error; 1955 int error;
1973 1956
1974 ASSERT(ip->i_d.di_nlink == 0); 1957 ASSERT(VFS_I(ip)->i_mode != 0);
1975 ASSERT(ip->i_d.di_mode != 0);
1976
1977 mp = tp->t_mountp;
1978 1958
1979 /* 1959 /*
1980 * Get the agi buffer first. It ensures lock ordering 1960 * Get the agi buffer first. It ensures lock ordering
@@ -2412,10 +2392,10 @@ xfs_ifree(
2412 struct xfs_icluster xic = { 0 }; 2392 struct xfs_icluster xic = { 0 };
2413 2393
2414 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2394 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2415 ASSERT(ip->i_d.di_nlink == 0); 2395 ASSERT(VFS_I(ip)->i_nlink == 0);
2416 ASSERT(ip->i_d.di_nextents == 0); 2396 ASSERT(ip->i_d.di_nextents == 0);
2417 ASSERT(ip->i_d.di_anextents == 0); 2397 ASSERT(ip->i_d.di_anextents == 0);
2418 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); 2398 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2419 ASSERT(ip->i_d.di_nblocks == 0); 2399 ASSERT(ip->i_d.di_nblocks == 0);
2420 2400
2421 /* 2401 /*
@@ -2429,7 +2409,7 @@ xfs_ifree(
2429 if (error) 2409 if (error)
2430 return error; 2410 return error;
2431 2411
2432 ip->i_d.di_mode = 0; /* mark incore inode as free */ 2412 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
2433 ip->i_d.di_flags = 0; 2413 ip->i_d.di_flags = 0;
2434 ip->i_d.di_dmevmask = 0; 2414 ip->i_d.di_dmevmask = 0;
2435 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 2415 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
@@ -2439,7 +2419,7 @@ xfs_ifree(
2439 * Bump the generation count so no one will be confused 2419 * Bump the generation count so no one will be confused
2440 * by reincarnations of this inode. 2420 * by reincarnations of this inode.
2441 */ 2421 */
2442 ip->i_d.di_gen++; 2422 VFS_I(ip)->i_generation++;
2443 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2423 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2444 2424
2445 if (xic.deleted) 2425 if (xic.deleted)
@@ -2526,7 +2506,7 @@ xfs_remove(
2526{ 2506{
2527 xfs_mount_t *mp = dp->i_mount; 2507 xfs_mount_t *mp = dp->i_mount;
2528 xfs_trans_t *tp = NULL; 2508 xfs_trans_t *tp = NULL;
2529 int is_dir = S_ISDIR(ip->i_d.di_mode); 2509 int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2530 int error = 0; 2510 int error = 0;
2531 xfs_bmap_free_t free_list; 2511 xfs_bmap_free_t free_list;
2532 xfs_fsblock_t first_block; 2512 xfs_fsblock_t first_block;
@@ -2580,8 +2560,8 @@ xfs_remove(
2580 * If we're removing a directory perform some additional validation. 2560 * If we're removing a directory perform some additional validation.
2581 */ 2561 */
2582 if (is_dir) { 2562 if (is_dir) {
2583 ASSERT(ip->i_d.di_nlink >= 2); 2563 ASSERT(VFS_I(ip)->i_nlink >= 2);
2584 if (ip->i_d.di_nlink != 2) { 2564 if (VFS_I(ip)->i_nlink != 2) {
2585 error = -ENOTEMPTY; 2565 error = -ENOTEMPTY;
2586 goto out_trans_cancel; 2566 goto out_trans_cancel;
2587 } 2567 }
@@ -2771,7 +2751,7 @@ xfs_cross_rename(
2771 if (dp1 != dp2) { 2751 if (dp1 != dp2) {
2772 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2752 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2773 2753
2774 if (S_ISDIR(ip2->i_d.di_mode)) { 2754 if (S_ISDIR(VFS_I(ip2)->i_mode)) {
2775 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, 2755 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2776 dp1->i_ino, first_block, 2756 dp1->i_ino, first_block,
2777 free_list, spaceres); 2757 free_list, spaceres);
@@ -2779,7 +2759,7 @@ xfs_cross_rename(
2779 goto out_trans_abort; 2759 goto out_trans_abort;
2780 2760
2781 /* transfer ip2 ".." reference to dp1 */ 2761 /* transfer ip2 ".." reference to dp1 */
2782 if (!S_ISDIR(ip1->i_d.di_mode)) { 2762 if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
2783 error = xfs_droplink(tp, dp2); 2763 error = xfs_droplink(tp, dp2);
2784 if (error) 2764 if (error)
2785 goto out_trans_abort; 2765 goto out_trans_abort;
@@ -2798,7 +2778,7 @@ xfs_cross_rename(
2798 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2778 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2799 } 2779 }
2800 2780
2801 if (S_ISDIR(ip1->i_d.di_mode)) { 2781 if (S_ISDIR(VFS_I(ip1)->i_mode)) {
2802 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, 2782 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2803 dp2->i_ino, first_block, 2783 dp2->i_ino, first_block,
2804 free_list, spaceres); 2784 free_list, spaceres);
@@ -2806,7 +2786,7 @@ xfs_cross_rename(
2806 goto out_trans_abort; 2786 goto out_trans_abort;
2807 2787
2808 /* transfer ip1 ".." reference to dp2 */ 2788 /* transfer ip1 ".." reference to dp2 */
2809 if (!S_ISDIR(ip2->i_d.di_mode)) { 2789 if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
2810 error = xfs_droplink(tp, dp1); 2790 error = xfs_droplink(tp, dp1);
2811 if (error) 2791 if (error)
2812 goto out_trans_abort; 2792 goto out_trans_abort;
@@ -2903,7 +2883,7 @@ xfs_rename(
2903 struct xfs_inode *inodes[__XFS_SORT_INODES]; 2883 struct xfs_inode *inodes[__XFS_SORT_INODES];
2904 int num_inodes = __XFS_SORT_INODES; 2884 int num_inodes = __XFS_SORT_INODES;
2905 bool new_parent = (src_dp != target_dp); 2885 bool new_parent = (src_dp != target_dp);
2906 bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); 2886 bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
2907 int spaceres; 2887 int spaceres;
2908 int error; 2888 int error;
2909 2889
@@ -3032,12 +3012,12 @@ xfs_rename(
3032 * target and source are directories and that target can be 3012 * target and source are directories and that target can be
3033 * destroyed, or that neither is a directory. 3013 * destroyed, or that neither is a directory.
3034 */ 3014 */
3035 if (S_ISDIR(target_ip->i_d.di_mode)) { 3015 if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
3036 /* 3016 /*
3037 * Make sure target dir is empty. 3017 * Make sure target dir is empty.
3038 */ 3018 */
3039 if (!(xfs_dir_isempty(target_ip)) || 3019 if (!(xfs_dir_isempty(target_ip)) ||
3040 (target_ip->i_d.di_nlink > 2)) { 3020 (VFS_I(target_ip)->i_nlink > 2)) {
3041 error = -EEXIST; 3021 error = -EEXIST;
3042 goto out_trans_cancel; 3022 goto out_trans_cancel;
3043 } 3023 }
@@ -3144,7 +3124,7 @@ xfs_rename(
3144 * intermediate state on disk. 3124 * intermediate state on disk.
3145 */ 3125 */
3146 if (wip) { 3126 if (wip) {
3147 ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); 3127 ASSERT(VFS_I(wip)->i_nlink == 0);
3148 error = xfs_bumplink(tp, wip); 3128 error = xfs_bumplink(tp, wip);
3149 if (error) 3129 if (error)
3150 goto out_bmap_cancel; 3130 goto out_bmap_cancel;
@@ -3313,7 +3293,7 @@ cluster_corrupt_out:
 	 * mark it as stale and brelse.
 	 */
 	if (bp->b_iodone) {
-		XFS_BUF_UNDONE(bp);
+		bp->b_flags &= ~XBF_DONE;
 		xfs_buf_stale(bp);
 		xfs_buf_ioerror(bp, -EIO);
 		xfs_buf_ioend(bp);
@@ -3462,14 +3442,7 @@ xfs_iflush_int(
 			__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
 		goto corrupt_out;
 	}
-	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
-			   mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
-		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
-			"%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
-			__func__, ip->i_ino, ip, ip->i_d.di_magic);
-		goto corrupt_out;
-	}
-	if (S_ISREG(ip->i_d.di_mode)) {
+	if (S_ISREG(VFS_I(ip)->i_mode)) {
 		if (XFS_TEST_ERROR(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
@@ -3479,7 +3452,7 @@ xfs_iflush_int(
 			__func__, ip->i_ino, ip);
 		goto corrupt_out;
 	}
-	} else if (S_ISDIR(ip->i_d.di_mode)) {
+	} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
 		if (XFS_TEST_ERROR(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
@@ -3523,12 +3496,11 @@ xfs_iflush_int(
 	ip->i_d.di_flushiter++;
 
 	/*
-	 * Copy the dirty parts of the inode into the on-disk
-	 * inode.  We always copy out the core of the inode,
-	 * because if the inode is dirty at all the core must
-	 * be.
+	 * Copy the dirty parts of the inode into the on-disk inode. We always
+	 * copy out the core of the inode, because if the inode is dirty at all
+	 * the core must be.
 	 */
-	xfs_dinode_to_disk(dip, &ip->i_d);
+	xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
 
 	/* Wrap, we never let the log put out DI_MAX_FLUSH */
 	if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
@@ -3580,10 +3552,6 @@ xfs_iflush_int(
 	 */
 	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
 
-	/* update the lsn in the on disk inode if required */
-	if (ip->i_d.di_version == 3)
-		dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
-
 	/* generate the checksum. */
 	xfs_dinode_calc_crc(mp, dip);
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ca9e11989cbd..43e1d51b15eb 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -63,7 +63,7 @@ typedef struct xfs_inode {
 	unsigned long		i_flags;	/* see defined flags below */
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
 
-	xfs_icdinode_t		i_d;		/* most of ondisk inode */
+	struct xfs_icdinode	i_d;		/* most of ondisk inode */
 
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
@@ -88,7 +88,7 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
  */
 static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
 {
-	if (S_ISREG(ip->i_d.di_mode))
+	if (S_ISREG(VFS_I(ip)->i_mode))
 		return i_size_read(VFS_I(ip));
 	return ip->i_d.di_size;
 }
@@ -369,7 +369,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
  */
 #define XFS_INHERIT_GID(pip)	\
 	(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
-	 ((pip)->i_d.di_mode & S_ISGID))
+	 (VFS_I(pip)->i_mode & S_ISGID))
 
 int		xfs_release(struct xfs_inode *ip);
 void		xfs_inactive(struct xfs_inode *ip);
@@ -405,8 +405,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
 			   struct xfs_bmap_free *);
 int		xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
 				      int, xfs_fsize_t);
-int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
-
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
 
 void		xfs_iunpin_wait(xfs_inode_t *);
@@ -437,6 +435,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
 int	xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
 		     xfs_fsize_t isize, bool *did_zeroing);
 int	xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+loff_t	__xfs_seek_hole_data(struct inode *inode, loff_t start,
+			     loff_t eof, int whence);
 
 
 /* from xfs_iops.c */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d14b12b8cfef..c48b5b18d771 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -135,7 +135,7 @@ xfs_inode_item_size(
 
 	*nvecs += 2;
 	*nbytes += sizeof(struct xfs_inode_log_format) +
-		   xfs_icdinode_size(ip->i_d.di_version);
+		   xfs_log_dinode_size(ip->i_d.di_version);
 
 	xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
 	if (XFS_IFORK_Q(ip))
@@ -322,6 +322,81 @@ xfs_inode_item_format_attr_fork(
 	}
 }
 
+static void
+xfs_inode_to_log_dinode(
+	struct xfs_inode	*ip,
+	struct xfs_log_dinode	*to,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_icdinode	*from = &ip->i_d;
+	struct inode		*inode = VFS_I(ip);
+
+	to->di_magic = XFS_DINODE_MAGIC;
+
+	to->di_version = from->di_version;
+	to->di_format = from->di_format;
+	to->di_uid = from->di_uid;
+	to->di_gid = from->di_gid;
+	to->di_projid_lo = from->di_projid_lo;
+	to->di_projid_hi = from->di_projid_hi;
+
+	memset(to->di_pad, 0, sizeof(to->di_pad));
+	memset(to->di_pad3, 0, sizeof(to->di_pad3));
+	to->di_atime.t_sec = inode->i_atime.tv_sec;
+	to->di_atime.t_nsec = inode->i_atime.tv_nsec;
+	to->di_mtime.t_sec = inode->i_mtime.tv_sec;
+	to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
+	to->di_ctime.t_sec = inode->i_ctime.tv_sec;
+	to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
+	to->di_nlink = inode->i_nlink;
+	to->di_gen = inode->i_generation;
+	to->di_mode = inode->i_mode;
+
+	to->di_size = from->di_size;
+	to->di_nblocks = from->di_nblocks;
+	to->di_extsize = from->di_extsize;
+	to->di_nextents = from->di_nextents;
+	to->di_anextents = from->di_anextents;
+	to->di_forkoff = from->di_forkoff;
+	to->di_aformat = from->di_aformat;
+	to->di_dmevmask = from->di_dmevmask;
+	to->di_dmstate = from->di_dmstate;
+	to->di_flags = from->di_flags;
+
+	if (from->di_version == 3) {
+		to->di_changecount = inode->i_version;
+		to->di_crtime.t_sec = from->di_crtime.t_sec;
+		to->di_crtime.t_nsec = from->di_crtime.t_nsec;
+		to->di_flags2 = from->di_flags2;
+
+		to->di_ino = ip->i_ino;
+		to->di_lsn = lsn;
+		memset(to->di_pad2, 0, sizeof(to->di_pad2));
+		uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
+		to->di_flushiter = 0;
+	} else {
+		to->di_flushiter = from->di_flushiter;
+	}
+}
+
+/*
+ * Format the inode core. Current timestamp data is only in the VFS inode
+ * fields, so we need to grab them from there. Hence rather than just copying
+ * the XFS inode core structure, format the fields directly into the iovec.
+ */
+static void
+xfs_inode_item_format_core(
+	struct xfs_inode	*ip,
+	struct xfs_log_vec	*lv,
+	struct xfs_log_iovec	**vecp)
+{
+	struct xfs_log_dinode	*dic;
+
+	dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
+	xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
+	xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the given inode
  * log item. It fills the first item with an inode log format structure,
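The comment in the new xfs_inode_item_format_core() states the pattern: timestamps now live only in the VFS inode, so the core is formatted field by field into the log iovec instead of being copied wholesale from a single in-core structure. A minimal sketch of that prepare/fill/finish sequence, using the xlog_prepare_iovec()/xlog_finish_iovec() helpers visible above but with a hypothetical payload (example_log_format, EXAMPLE_MAGIC and example_item are illustrative, not part of this patch):

	/* Sketch only: serialise an in-core object into one log region. */
	static void
	example_item_format_core(
		struct example_item	*item,	/* hypothetical item type */
		struct xfs_log_vec	*lv,
		struct xfs_log_iovec	**vecp)
	{
		struct example_log_format *to;

		/* reserve a region of the log vector for this item type */
		to = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);

		/* format each field explicitly; nothing stale is copied */
		to->magic = EXAMPLE_MAGIC;	/* hypothetical constant */
		to->size = item->size;

		/* record how many bytes of the region were actually used */
		xlog_finish_iovec(lv, *vecp, sizeof(*to));
	}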
@@ -351,10 +426,7 @@ xfs_inode_item_format(
 	ilf->ilf_size = 2; /* format + core */
 	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
 
-	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
-			&ip->i_d,
-			xfs_icdinode_size(ip->i_d.di_version));
-
+	xfs_inode_item_format_core(ip, lv, &vecp);
 	xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
 	if (XFS_IFORK_Q(ip)) {
 		xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 478d04e07f95..bcb6c19ce3ea 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -114,7 +114,7 @@ xfs_find_handle(
 		handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
 					sizeof(handle.ha_fid.fid_len);
 		handle.ha_fid.fid_pad = 0;
-		handle.ha_fid.fid_gen = ip->i_d.di_gen;
+		handle.ha_fid.fid_gen = inode->i_generation;
 		handle.ha_fid.fid_ino = ip->i_ino;
 
 		hsize = XFS_HSIZE(handle);
@@ -963,7 +963,7 @@ xfs_set_diflags(
 		di_flags |= XFS_DIFLAG_NODEFRAG;
 	if (xflags & FS_XFLAG_FILESTREAM)
 		di_flags |= XFS_DIFLAG_FILESTREAM;
-	if (S_ISDIR(ip->i_d.di_mode)) {
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
 		if (xflags & FS_XFLAG_RTINHERIT)
 			di_flags |= XFS_DIFLAG_RTINHERIT;
 		if (xflags & FS_XFLAG_NOSYMLINKS)
@@ -972,7 +972,7 @@ xfs_set_diflags(
 			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 		if (xflags & FS_XFLAG_PROJINHERIT)
 			di_flags |= XFS_DIFLAG_PROJINHERIT;
-	} else if (S_ISREG(ip->i_d.di_mode)) {
+	} else if (S_ISREG(VFS_I(ip)->i_mode)) {
 		if (xflags & FS_XFLAG_REALTIME)
 			di_flags |= XFS_DIFLAG_REALTIME;
 		if (xflags & FS_XFLAG_EXTSIZE)
@@ -1060,23 +1060,86 @@ xfs_ioctl_setattr_xflags(
 }
 
 /*
+ * If we are changing DAX flags, we have to ensure the file is clean and any
+ * cached objects in the address space are invalidated and removed. This
+ * requires us to lock out other IO and page faults similar to a truncate
+ * operation. The locks need to be held until the transaction has been committed
+ * so that the cache invalidation is atomic with respect to the DAX flag
+ * manipulation.
+ */
+static int
+xfs_ioctl_setattr_dax_invalidate(
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa,
+	int			*join_flags)
+{
+	struct inode		*inode = VFS_I(ip);
+	int			error;
+
+	*join_flags = 0;
+
+	/*
+	 * It is only valid to set the DAX flag on regular files and
+	 * directories on filesystems where the block size is equal to the page
+	 * size. On directories it serves as an inherit hint.
+	 */
+	if (fa->fsx_xflags & FS_XFLAG_DAX) {
+		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+			return -EINVAL;
+		if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE)
+			return -EINVAL;
+	}
+
+	/* If the DAX state is not changing, we have nothing to do here. */
+	if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode))
+		return 0;
+	if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode))
+		return 0;
+
+	/* lock, flush and invalidate mapping in preparation for flag change */
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
+	error = filemap_write_and_wait(inode->i_mapping);
+	if (error)
+		goto out_unlock;
+	error = invalidate_inode_pages2(inode->i_mapping);
+	if (error)
+		goto out_unlock;
+
+	*join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL;
+	return 0;
+
+out_unlock:
+	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
+	return error;
+
+}
+
+/*
  * Set up the transaction structure for the setattr operation, checking that we
  * have permission to do so. On success, return a clean transaction and the
  * inode locked exclusively ready for further operation specific checks. On
  * failure, return an error without modifying or locking the inode.
+ *
+ * The inode might already be IO locked on call. If this is the case, it is
+ * indicated in @join_flags and we take full responsibility for ensuring they
+ * are unlocked from now on. Hence if we have an error here, we still have to
+ * unlock them. Otherwise, once they are joined to the transaction, they will
+ * be unlocked on commit/cancel.
  */
 static struct xfs_trans *
 xfs_ioctl_setattr_get_trans(
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	int			join_flags)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
-	int			error;
+	int			error = -EROFS;
 
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return ERR_PTR(-EROFS);
+		goto out_unlock;
+	error = -EIO;
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return ERR_PTR(-EIO);
+		goto out_unlock;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
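Taken together, xfs_ioctl_setattr_dax_invalidate() and the reworked xfs_ioctl_setattr_get_trans() implement one idea: a flag change that alters page cache behaviour must be atomic with the cache invalidation, so the IO and mmap locks are taken before flushing and are only released by transaction commit or cancel. Reduced to its skeleton (a sketch assembled from the calls above, error handling trimmed):

	/* Sketch: quiesce the mapping, then hand lock ownership to the
	 * transaction so invalidation and flag change commit atomically. */
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
	error = filemap_write_and_wait(inode->i_mapping);	/* flush dirty pages */
	if (!error)
		error = invalidate_inode_pages2(inode->i_mapping); /* drop cached pages */
	/* ... allocate transaction ... */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);	/* tp now owns the unlock */
	error = xfs_trans_commit(tp);				/* releases all joined locks */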
@@ -1084,7 +1147,8 @@ xfs_ioctl_setattr_get_trans(
 		goto out_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
+	join_flags = 0;
 
 	/*
 	 * CAP_FOWNER overrides the following restrictions:
@@ -1104,6 +1168,9 @@ xfs_ioctl_setattr_get_trans(
 
 out_cancel:
 	xfs_trans_cancel(tp);
+out_unlock:
+	if (join_flags)
+		xfs_iunlock(ip, join_flags);
 	return ERR_PTR(error);
 }
 
@@ -1128,14 +1195,14 @@ xfs_ioctl_setattr_check_extsize(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 
-	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode))
 		return -EINVAL;
 
 	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
-	    !S_ISDIR(ip->i_d.di_mode))
+	    !S_ISDIR(VFS_I(ip)->i_mode))
 		return -EINVAL;
 
-	if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents &&
+	if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents &&
 	    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
 		return -EINVAL;
 
@@ -1202,6 +1269,7 @@ xfs_ioctl_setattr(
 	struct xfs_dquot	*pdqp = NULL;
 	struct xfs_dquot	*olddquot = NULL;
 	int			code;
+	int			join_flags = 0;
 
 	trace_xfs_ioctl_setattr(ip);
 
@@ -1225,7 +1293,18 @@ xfs_ioctl_setattr(
 		return code;
 	}
 
-	tp = xfs_ioctl_setattr_get_trans(ip);
+	/*
+	 * Changing DAX config may require inode locking for mapping
+	 * invalidation. These need to be held all the way to transaction commit
+	 * or cancel time, so need to be passed through to
+	 * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
+	 * appropriately.
+	 */
+	code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags);
+	if (code)
+		goto error_free_dquots;
+
+	tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
 	if (IS_ERR(tp)) {
 		code = PTR_ERR(tp);
 		goto error_free_dquots;
@@ -1256,9 +1335,9 @@ xfs_ioctl_setattr(
 	 * successful return from chown()
 	 */
 
-	if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+	if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
 	    !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
-		ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+		VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
 
 	/* Change the ownerships and register project quota modifications */
 	if (xfs_get_projid(ip) != fa->fsx_projid) {
@@ -1341,6 +1420,7 @@ xfs_ioc_setxflags(
 	struct xfs_trans	*tp;
 	struct fsxattr		fa;
 	unsigned int		flags;
+	int			join_flags = 0;
 	int			error;
 
 	if (copy_from_user(&flags, arg, sizeof(flags)))
@@ -1357,7 +1437,18 @@ xfs_ioc_setxflags(
 	if (error)
 		return error;
 
-	tp = xfs_ioctl_setattr_get_trans(ip);
+	/*
+	 * Changing DAX config may require inode locking for mapping
+	 * invalidation. These need to be held all the way to transaction commit
+	 * or cancel time, so need to be passed through to
+	 * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
+	 * appropriately.
+	 */
+	error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags);
+	if (error)
+		goto out_drop_write;
+
+	tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
 	if (IS_ERR(tp)) {
 		error = PTR_ERR(tp);
 		goto out_drop_write;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 76b71a1c6c32..fb7dc61f4a29 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -459,8 +459,8 @@ xfs_vn_getattr(
 
 	stat->size = XFS_ISIZE(ip);
 	stat->dev = inode->i_sb->s_dev;
-	stat->mode = ip->i_d.di_mode;
-	stat->nlink = ip->i_d.di_nlink;
+	stat->mode = inode->i_mode;
+	stat->nlink = inode->i_nlink;
 	stat->uid = inode->i_uid;
 	stat->gid = inode->i_gid;
 	stat->ino = ip->i_ino;
@@ -506,9 +506,6 @@ xfs_setattr_mode(
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
-	ip->i_d.di_mode &= S_IFMT;
-	ip->i_d.di_mode |= mode & ~S_IFMT;
-
 	inode->i_mode &= S_IFMT;
 	inode->i_mode |= mode & ~S_IFMT;
 }
@@ -522,21 +519,12 @@ xfs_setattr_time(
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
-	if (iattr->ia_valid & ATTR_ATIME) {
+	if (iattr->ia_valid & ATTR_ATIME)
 		inode->i_atime = iattr->ia_atime;
-		ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
-		ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
-	}
-	if (iattr->ia_valid & ATTR_CTIME) {
+	if (iattr->ia_valid & ATTR_CTIME)
 		inode->i_ctime = iattr->ia_ctime;
-		ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
-		ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
-	}
-	if (iattr->ia_valid & ATTR_MTIME) {
+	if (iattr->ia_valid & ATTR_MTIME)
 		inode->i_mtime = iattr->ia_mtime;
-		ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
-		ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
-	}
 }
 
 int
@@ -661,9 +649,9 @@ xfs_setattr_nonsize(
 		 * The set-user-ID and set-group-ID bits of a file will be
 		 * cleared upon successful return from chown()
 		 */
-		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+		if ((inode->i_mode & (S_ISUID|S_ISGID)) &&
 		    !capable(CAP_FSETID))
-			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+			inode->i_mode &= ~(S_ISUID|S_ISGID);
 
 		/*
 		 * Change the ownerships and register quota modifications
@@ -773,7 +761,7 @@ xfs_setattr_size(
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
-	ASSERT(S_ISREG(ip->i_d.di_mode));
+	ASSERT(S_ISREG(inode->i_mode));
 	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
 		ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
 
@@ -991,21 +979,13 @@ xfs_vn_update_time(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	if (flags & S_CTIME) {
+	if (flags & S_CTIME)
 		inode->i_ctime = *now;
-		ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec;
-		ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec;
-	}
-	if (flags & S_MTIME) {
+	if (flags & S_MTIME)
 		inode->i_mtime = *now;
-		ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec;
-		ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec;
-	}
-	if (flags & S_ATIME) {
+	if (flags & S_ATIME)
 		inode->i_atime = *now;
-		ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec;
-		ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec;
-	}
+
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
 	return xfs_trans_commit(tp);
@@ -1205,8 +1185,10 @@ xfs_diflags_to_iflags(
 		inode->i_flags |= S_SYNC;
 	if (flags & XFS_DIFLAG_NOATIME)
 		inode->i_flags |= S_NOATIME;
-	if (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
-	    ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+	if (S_ISREG(inode->i_mode) &&
+	    ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE &&
+	    (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
+	     ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
 		inode->i_flags |= S_DAX;
 }
 
@@ -1232,8 +1214,6 @@ xfs_setup_inode(
 	/* make the inode look hashed for the writeback code */
 	hlist_add_fake(&inode->i_hash);
 
-	inode->i_mode	= ip->i_d.di_mode;
-	set_nlink(inode, ip->i_d.di_nlink);
 	inode->i_uid	= xfs_uid_to_kuid(ip->i_d.di_uid);
 	inode->i_gid	= xfs_gid_to_kgid(ip->i_d.di_gid);
 
@@ -1249,14 +1229,7 @@ xfs_setup_inode(
 		break;
 	}
 
-	inode->i_generation = ip->i_d.di_gen;
 	i_size_write(inode, ip->i_d.di_size);
-	inode->i_atime.tv_sec	= ip->i_d.di_atime.t_sec;
-	inode->i_atime.tv_nsec	= ip->i_d.di_atime.t_nsec;
-	inode->i_mtime.tv_sec	= ip->i_d.di_mtime.t_sec;
-	inode->i_mtime.tv_nsec	= ip->i_d.di_mtime.t_nsec;
-	inode->i_ctime.tv_sec	= ip->i_d.di_ctime.t_sec;
-	inode->i_ctime.tv_nsec	= ip->i_d.di_ctime.t_nsec;
 	xfs_diflags_to_iflags(inode, ip);
 
 	ip->d_ops = ip->i_mount->m_nondir_inode_ops;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 930ebd86beba..ce73eb34620d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -57,6 +57,7 @@ xfs_bulkstat_one_int(
 {
 	struct xfs_icdinode	*dic;		/* dinode core info pointer */
 	struct xfs_inode	*ip;		/* incore inode pointer */
+	struct inode		*inode;
 	struct xfs_bstat	*buf;		/* return buffer */
 	int			error = 0;	/* error value */
 
@@ -77,30 +78,33 @@ xfs_bulkstat_one_int(
 
 	ASSERT(ip != NULL);
 	ASSERT(ip->i_imap.im_blkno != 0);
+	inode = VFS_I(ip);
 
 	dic = &ip->i_d;
 
 	/* xfs_iget returns the following without needing
 	 * further change.
 	 */
-	buf->bs_nlink = dic->di_nlink;
 	buf->bs_projid_lo = dic->di_projid_lo;
 	buf->bs_projid_hi = dic->di_projid_hi;
 	buf->bs_ino = ino;
-	buf->bs_mode = dic->di_mode;
 	buf->bs_uid = dic->di_uid;
 	buf->bs_gid = dic->di_gid;
 	buf->bs_size = dic->di_size;
-	buf->bs_atime.tv_sec = dic->di_atime.t_sec;
-	buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
-	buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
-	buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
-	buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
-	buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
+
+	buf->bs_nlink = inode->i_nlink;
+	buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
+	buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
+	buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
+	buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
+	buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
+	buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+	buf->bs_gen = inode->i_generation;
+	buf->bs_mode = inode->i_mode;
+
 	buf->bs_xflags = xfs_ip2xflags(ip);
 	buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
 	buf->bs_extents = dic->di_nextents;
-	buf->bs_gen = dic->di_gen;
 	memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
 	buf->bs_dmevmask = dic->di_dmevmask;
 	buf->bs_dmstate = dic->di_dmstate;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 9c9a1c9bcc7f..b49ccf5c1d75 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1212,7 +1212,7 @@ xlog_iodone(xfs_buf_t *bp)
 	}
 
 	/* log I/O is always issued ASYNC */
-	ASSERT(XFS_BUF_ISASYNC(bp));
+	ASSERT(bp->b_flags & XBF_ASYNC);
 	xlog_state_done_syncing(iclog, aborted);
 
 	/*
@@ -1864,9 +1864,8 @@ xlog_sync(
 
 	bp->b_io_length = BTOBB(count);
 	bp->b_fspriv = iclog;
-	XFS_BUF_ZEROFLAGS(bp);
-	XFS_BUF_ASYNC(bp);
-	bp->b_flags |= XBF_SYNCIO;
+	bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
+	bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
 
 	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
 		bp->b_flags |= XBF_FUA;
@@ -1893,12 +1892,11 @@ xlog_sync(
 
 	/* account for log which doesn't start at block #0 */
 	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+
 	/*
 	 * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
 	 * is shutting down.
 	 */
-	XFS_BUF_WRITE(bp);
-
 	error = xlog_bdstrat(bp);
 	if (error) {
 		xfs_buf_ioerror_alert(bp, "xlog_sync");
@@ -1910,9 +1908,8 @@ xlog_sync(
 		xfs_buf_associate_memory(bp,
 				(char *)&iclog->ic_header + count, split);
 		bp->b_fspriv = iclog;
-		XFS_BUF_ZEROFLAGS(bp);
-		XFS_BUF_ASYNC(bp);
-		bp->b_flags |= XBF_SYNCIO;
+		bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
+		bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
 		if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
 			bp->b_flags |= XBF_FUA;
 
@@ -1921,7 +1918,6 @@ xlog_sync(
 
 		/* account for internal log which doesn't start at block #0 */
 		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
-		XFS_BUF_WRITE(bp);
 		error = xlog_bdstrat(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
@@ -2012,77 +2008,81 @@ xlog_print_tic_res(
 	uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
 
 	/* match with XLOG_REG_TYPE_* in xfs_log.h */
-	static char *res_type_str[XLOG_REG_TYPE_MAX] = {
-	    "bformat",
-	    "bchunk",
-	    "efi_format",
-	    "efd_format",
-	    "iformat",
-	    "icore",
-	    "iext",
-	    "ibroot",
-	    "ilocal",
-	    "iattr_ext",
-	    "iattr_broot",
-	    "iattr_local",
-	    "qformat",
-	    "dquot",
-	    "quotaoff",
-	    "LR header",
-	    "unmount",
-	    "commit",
-	    "trans header"
+#define REG_TYPE_STR(type, str)	[XLOG_REG_TYPE_##type] = str
+	static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = {
+	    REG_TYPE_STR(BFORMAT, "bformat"),
+	    REG_TYPE_STR(BCHUNK, "bchunk"),
+	    REG_TYPE_STR(EFI_FORMAT, "efi_format"),
+	    REG_TYPE_STR(EFD_FORMAT, "efd_format"),
+	    REG_TYPE_STR(IFORMAT, "iformat"),
+	    REG_TYPE_STR(ICORE, "icore"),
+	    REG_TYPE_STR(IEXT, "iext"),
+	    REG_TYPE_STR(IBROOT, "ibroot"),
+	    REG_TYPE_STR(ILOCAL, "ilocal"),
+	    REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
+	    REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
+	    REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
+	    REG_TYPE_STR(QFORMAT, "qformat"),
+	    REG_TYPE_STR(DQUOT, "dquot"),
+	    REG_TYPE_STR(QUOTAOFF, "quotaoff"),
+	    REG_TYPE_STR(LRHEADER, "LR header"),
+	    REG_TYPE_STR(UNMOUNT, "unmount"),
+	    REG_TYPE_STR(COMMIT, "commit"),
+	    REG_TYPE_STR(TRANSHDR, "trans header"),
+	    REG_TYPE_STR(ICREATE, "inode create")
 	};
+#undef REG_TYPE_STR
+#define TRANS_TYPE_STR(type)	[XFS_TRANS_##type] = #type
 	static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
-	    "SETATTR_NOT_SIZE",
-	    "SETATTR_SIZE",
-	    "INACTIVE",
-	    "CREATE",
-	    "CREATE_TRUNC",
-	    "TRUNCATE_FILE",
-	    "REMOVE",
-	    "LINK",
-	    "RENAME",
-	    "MKDIR",
-	    "RMDIR",
-	    "SYMLINK",
-	    "SET_DMATTRS",
-	    "GROWFS",
-	    "STRAT_WRITE",
-	    "DIOSTRAT",
-	    "WRITE_SYNC",
-	    "WRITEID",
-	    "ADDAFORK",
-	    "ATTRINVAL",
-	    "ATRUNCATE",
-	    "ATTR_SET",
-	    "ATTR_RM",
-	    "ATTR_FLAG",
-	    "CLEAR_AGI_BUCKET",
-	    "QM_SBCHANGE",
-	    "DUMMY1",
-	    "DUMMY2",
-	    "QM_QUOTAOFF",
-	    "QM_DQALLOC",
-	    "QM_SETQLIM",
-	    "QM_DQCLUSTER",
-	    "QM_QINOCREATE",
-	    "QM_QUOTAOFF_END",
-	    "FSYNC_TS",
-	    "GROWFSRT_ALLOC",
-	    "GROWFSRT_ZERO",
-	    "GROWFSRT_FREE",
-	    "SWAPEXT",
-	    "CHECKPOINT",
-	    "ICREATE",
-	    "CREATE_TMPFILE"
+	    TRANS_TYPE_STR(SETATTR_NOT_SIZE),
+	    TRANS_TYPE_STR(SETATTR_SIZE),
+	    TRANS_TYPE_STR(INACTIVE),
+	    TRANS_TYPE_STR(CREATE),
+	    TRANS_TYPE_STR(CREATE_TRUNC),
+	    TRANS_TYPE_STR(TRUNCATE_FILE),
+	    TRANS_TYPE_STR(REMOVE),
+	    TRANS_TYPE_STR(LINK),
+	    TRANS_TYPE_STR(RENAME),
+	    TRANS_TYPE_STR(MKDIR),
+	    TRANS_TYPE_STR(RMDIR),
+	    TRANS_TYPE_STR(SYMLINK),
+	    TRANS_TYPE_STR(SET_DMATTRS),
+	    TRANS_TYPE_STR(GROWFS),
+	    TRANS_TYPE_STR(STRAT_WRITE),
+	    TRANS_TYPE_STR(DIOSTRAT),
+	    TRANS_TYPE_STR(WRITEID),
+	    TRANS_TYPE_STR(ADDAFORK),
+	    TRANS_TYPE_STR(ATTRINVAL),
+	    TRANS_TYPE_STR(ATRUNCATE),
+	    TRANS_TYPE_STR(ATTR_SET),
+	    TRANS_TYPE_STR(ATTR_RM),
+	    TRANS_TYPE_STR(ATTR_FLAG),
+	    TRANS_TYPE_STR(CLEAR_AGI_BUCKET),
+	    TRANS_TYPE_STR(SB_CHANGE),
+	    TRANS_TYPE_STR(DUMMY1),
+	    TRANS_TYPE_STR(DUMMY2),
+	    TRANS_TYPE_STR(QM_QUOTAOFF),
+	    TRANS_TYPE_STR(QM_DQALLOC),
+	    TRANS_TYPE_STR(QM_SETQLIM),
+	    TRANS_TYPE_STR(QM_DQCLUSTER),
+	    TRANS_TYPE_STR(QM_QINOCREATE),
+	    TRANS_TYPE_STR(QM_QUOTAOFF_END),
+	    TRANS_TYPE_STR(FSYNC_TS),
+	    TRANS_TYPE_STR(GROWFSRT_ALLOC),
+	    TRANS_TYPE_STR(GROWFSRT_ZERO),
+	    TRANS_TYPE_STR(GROWFSRT_FREE),
+	    TRANS_TYPE_STR(SWAPEXT),
+	    TRANS_TYPE_STR(CHECKPOINT),
+	    TRANS_TYPE_STR(ICREATE),
+	    TRANS_TYPE_STR(CREATE_TMPFILE)
 	};
+#undef TRANS_TYPE_STR
 
 	xfs_warn(mp, "xlog_write: reservation summary:");
 	xfs_warn(mp, "  trans type  = %s (%u)",
 		 ((ticket->t_trans_type <= 0 ||
 		   ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
-		  "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
+		  "bad-trans-type" : trans_type_str[ticket->t_trans_type]),
 		 ticket->t_trans_type);
 	xfs_warn(mp, "  unit res    = %d bytes",
 		 ticket->t_unit_res);
@@ -2101,7 +2101,7 @@ xlog_print_tic_res(
 		uint r_type = ticket->t_res_arr[i].r_type;
 		xfs_warn(mp, "region[%u]: %s - %u bytes", i,
 			 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
-			  "bad-rtype" : res_type_str[r_type-1]),
+			  "bad-rtype" : res_type_str[r_type]),
 			 ticket->t_res_arr[i].r_len);
 	}
 
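The table rework above is the C99 designated-initializer idiom: each slot is keyed by its symbolic constant, so entries can no longer drift out of step with the enum values (the old positional list appears already to have drifted, still carrying a WRITE_SYNC entry that the keyed list drops, which is also why the lookups lose their `-1`). The idiom in isolation, as a self-contained sketch with hypothetical names:

	#include <stdio.h>

	enum reg_type { REG_BFORMAT = 1, REG_BCHUNK = 2, REG_TYPE_MAX = 2 };

	/* index by enum value; unnamed slots stay NULL, so gaps are detectable */
	#define REG_TYPE_STR(type, str)	[REG_##type] = str
	static const char *reg_type_str[REG_TYPE_MAX + 1] = {
		REG_TYPE_STR(BFORMAT, "bformat"),
		REG_TYPE_STR(BCHUNK, "bchunk"),
	};
	#undef REG_TYPE_STR

	int main(void)
	{
		enum reg_type t = REG_BCHUNK;
		printf("%s\n", reg_type_str[t] ? reg_type_str[t] : "bad-rtype");
		return 0;
	}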
@@ -3979,7 +3979,7 @@ xfs_log_force_umount(
 	    log->l_flags & XLOG_ACTIVE_RECOVERY) {
 		mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
 		if (mp->m_sb_bp)
-			XFS_BUF_DONE(mp->m_sb_bp);
+			mp->m_sb_bp->b_flags |= XBF_DONE;
 		return 0;
 	}
 
@@ -4009,7 +4009,7 @@ xfs_log_force_umount(
 	spin_lock(&log->l_icloglock);
 	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
 	if (mp->m_sb_bp)
-		XFS_BUF_DONE(mp->m_sb_bp);
+		mp->m_sb_bp->b_flags |= XBF_DONE;
 
 	/*
 	 * Mark the log and the iclogs with IO error flags to prevent any
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index be5568839442..396565f43247 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -190,7 +190,7 @@ xlog_bread_noalign(
 	ASSERT(nbblks <= bp->b_length);
 
 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
-	XFS_BUF_READ(bp);
+	bp->b_flags |= XBF_READ;
 	bp->b_io_length = nbblks;
 	bp->b_error = 0;
 
@@ -275,7 +275,6 @@ xlog_bwrite(
 	ASSERT(nbblks <= bp->b_length);
 
 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
-	XFS_BUF_ZEROFLAGS(bp);
 	xfs_buf_hold(bp);
 	xfs_buf_lock(bp);
 	bp->b_io_length = nbblks;
@@ -2538,6 +2537,13 @@ xlog_recover_validate_buf_type(
 		}
 		bp->b_ops = &xfs_sb_buf_ops;
 		break;
+#ifdef CONFIG_XFS_RT
+	case XFS_BLFT_RTBITMAP_BUF:
+	case XFS_BLFT_RTSUMMARY_BUF:
+		/* no magic numbers for verification of RT buffers */
+		bp->b_ops = &xfs_rtbuf_ops;
+		break;
+#endif /* CONFIG_XFS_RT */
 	default:
 		xfs_warn(mp, "Unknown buffer type %d!",
 			 xfs_blft_from_flags(buf_f));
@@ -2858,7 +2864,7 @@ xfs_recover_inode_owner_change(
 		return -ENOMEM;
 
 	/* instantiate the inode */
-	xfs_dinode_from_disk(&ip->i_d, dip);
+	xfs_inode_from_disk(ip, dip);
 	ASSERT(ip->i_d.di_version >= 3);
 
 	error = xfs_iformat_fork(ip, dip);
@@ -2904,7 +2910,7 @@ xlog_recover_inode_pass2(
 	int			error;
 	int			attr_index;
 	uint			fields;
-	xfs_icdinode_t		*dicp;
+	struct xfs_log_dinode	*ldip;
 	uint			isize;
 	int			need_free = 0;
 
@@ -2957,8 +2963,8 @@ xlog_recover_inode_pass2(
 		error = -EFSCORRUPTED;
 		goto out_release;
 	}
-	dicp = item->ri_buf[1].i_addr;
-	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
+	ldip = item->ri_buf[1].i_addr;
+	if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
 		xfs_alert(mp,
 			"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
 			__func__, item, in_f->ilf_ino);
@@ -2994,13 +3000,13 @@ xlog_recover_inode_pass2(
 	 * to skip replay when the on disk inode is newer than the log one
 	 */
 	if (!xfs_sb_version_hascrc(&mp->m_sb) &&
-	    dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+	    ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
 		/*
 		 * Deal with the wrap case, DI_MAX_FLUSH is less
 		 * than smaller numbers
 		 */
 		if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
-		    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+		    ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
 			/* do nothing */
 		} else {
 			trace_xfs_log_recover_inode_skip(log, in_f);
@@ -3010,13 +3016,13 @@ xlog_recover_inode_pass2(
 	}
 
 	/* Take the opportunity to reset the flush iteration count */
-	dicp->di_flushiter = 0;
+	ldip->di_flushiter = 0;
 
-	if (unlikely(S_ISREG(dicp->di_mode))) {
-		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
-		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
+	if (unlikely(S_ISREG(ldip->di_mode))) {
+		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
-					     XFS_ERRLEVEL_LOW, mp, dicp);
+					     XFS_ERRLEVEL_LOW, mp, ldip);
 			xfs_alert(mp,
 				"%s: Bad regular inode log record, rec ptr 0x%p, "
 				"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
@@ -3024,12 +3030,12 @@ xlog_recover_inode_pass2(
 			error = -EFSCORRUPTED;
 			goto out_release;
 		}
-	} else if (unlikely(S_ISDIR(dicp->di_mode))) {
-		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
-		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
-		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
+	} else if (unlikely(S_ISDIR(ldip->di_mode))) {
+		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
+		    (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
-					     XFS_ERRLEVEL_LOW, mp, dicp);
+					     XFS_ERRLEVEL_LOW, mp, ldip);
 			xfs_alert(mp,
 				"%s: Bad dir inode log record, rec ptr 0x%p, "
 				"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
@@ -3038,32 +3044,32 @@ xlog_recover_inode_pass2(
 			goto out_release;
 		}
 	}
-	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
+	if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
-				     XFS_ERRLEVEL_LOW, mp, dicp);
+				     XFS_ERRLEVEL_LOW, mp, ldip);
 		xfs_alert(mp,
 	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
 	"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
 			__func__, item, dip, bp, in_f->ilf_ino,
-			dicp->di_nextents + dicp->di_anextents,
-			dicp->di_nblocks);
+			ldip->di_nextents + ldip->di_anextents,
+			ldip->di_nblocks);
 		error = -EFSCORRUPTED;
 		goto out_release;
 	}
-	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
+	if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
-				     XFS_ERRLEVEL_LOW, mp, dicp);
+				     XFS_ERRLEVEL_LOW, mp, ldip);
 		xfs_alert(mp,
 	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
 	"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
-			item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
+			item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
 		error = -EFSCORRUPTED;
 		goto out_release;
 	}
-	isize = xfs_icdinode_size(dicp->di_version);
+	isize = xfs_log_dinode_size(ldip->di_version);
 	if (unlikely(item->ri_buf[1].i_len > isize)) {
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
-				     XFS_ERRLEVEL_LOW, mp, dicp);
+				     XFS_ERRLEVEL_LOW, mp, ldip);
 		xfs_alert(mp,
 			"%s: Bad inode log record length %d, rec ptr 0x%p",
 			__func__, item->ri_buf[1].i_len, item);
@@ -3071,8 +3077,8 @@ xlog_recover_inode_pass2(
 		goto out_release;
 	}
 
-	/* The core is in in-core format */
-	xfs_dinode_to_disk(dip, dicp);
+	/* recover the log dinode inode into the on disk inode */
+	xfs_log_dinode_to_disk(ldip, dip);
 
 	/* the rest is in on-disk format */
 	if (item->ri_buf[1].i_len > isize) {
@@ -4402,8 +4408,8 @@ xlog_recover_process_one_iunlink(
 	if (error)
 		goto fail_iput;
 
-	ASSERT(ip->i_d.di_nlink == 0);
-	ASSERT(ip->i_d.di_mode != 0);
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+	ASSERT(VFS_I(ip)->i_mode != 0);
 
 	/* setup for the next pass */
 	agino = be32_to_cpu(dip->di_next_unlinked);
@@ -4957,6 +4963,7 @@ xlog_do_recover(
 	xfs_daddr_t	head_blk,
 	xfs_daddr_t	tail_blk)
 {
+	struct xfs_mount *mp = log->l_mp;
 	int		error;
 	xfs_buf_t	*bp;
 	xfs_sb_t	*sbp;
@@ -4971,7 +4978,7 @@ xlog_do_recover(
 	/*
 	 * If IO errors happened during recovery, bail out.
 	 */
-	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+	if (XFS_FORCED_SHUTDOWN(mp)) {
 		return -EIO;
 	}
 
@@ -4984,22 +4991,21 @@ xlog_do_recover(
 	 * or iunlinks they will have some entries in the AIL; so we look at
 	 * the AIL to determine how to set the tail_lsn.
 	 */
-	xlog_assign_tail_lsn(log->l_mp);
+	xlog_assign_tail_lsn(mp);
 
 	/*
 	 * Now that we've finished replaying all buffer and inode
 	 * updates, re-read in the superblock and reverify it.
 	 */
-	bp = xfs_getsb(log->l_mp, 0);
-	XFS_BUF_UNDONE(bp);
-	ASSERT(!(XFS_BUF_ISWRITE(bp)));
-	XFS_BUF_READ(bp);
-	XFS_BUF_UNASYNC(bp);
+	bp = xfs_getsb(mp, 0);
+	bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
+	ASSERT(!(bp->b_flags & XBF_WRITE));
+	bp->b_flags |= XBF_READ;
 	bp->b_ops = &xfs_sb_buf_ops;
 
 	error = xfs_buf_submit_wait(bp);
 	if (error) {
-		if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		if (!XFS_FORCED_SHUTDOWN(mp)) {
 			xfs_buf_ioerror_alert(bp, __func__);
 			ASSERT(0);
 		}
@@ -5008,14 +5014,17 @@ xlog_do_recover(
 	}
 
 	/* Convert superblock from on-disk format */
-	sbp = &log->l_mp->m_sb;
+	sbp = &mp->m_sb;
 	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
-	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
-	ASSERT(xfs_sb_good_version(sbp));
-	xfs_reinit_percpu_counters(log->l_mp);
-
 	xfs_buf_relse(bp);
 
+	/* re-initialise in-core superblock and geometry structures */
+	xfs_reinit_percpu_counters(mp);
+	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+	if (error) {
+		xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
+		return error;
+	}
 
 	xlog_recover_check_summary(log);
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb753b359bee..536a0ee9cd5a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -185,9 +185,6 @@ xfs_initialize_perag(
 	xfs_agnumber_t	index;
 	xfs_agnumber_t	first_initialised = 0;
 	xfs_perag_t	*pag;
-	xfs_agino_t	agino;
-	xfs_ino_t	ino;
-	xfs_sb_t	*sbp = &mp->m_sb;
 	int		error = -ENOMEM;
 
 	/*
@@ -230,22 +227,7 @@ xfs_initialize_perag(
 		radix_tree_preload_end();
 	}
 
-	/*
-	 * If we mount with the inode64 option, or no inode overflows
-	 * the legacy 32-bit address space clear the inode32 option.
-	 */
-	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
-	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
-
-	if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
-		mp->m_flags |= XFS_MOUNT_32BITINODES;
-	else
-		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
-
-	if (mp->m_flags & XFS_MOUNT_32BITINODES)
-		index = xfs_set_inode32(mp, agcount);
-	else
-		index = xfs_set_inode64(mp, agcount);
+	index = xfs_set_inode_alloc(mp, agcount);
 
 	if (maxagi)
 		*maxagi = index;
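The deleted block records the policy that xfs_set_inode_alloc() is now presumed to centralise: compute the largest inode number the current geometry can produce, and turn the inode32 allocator on only when the user asked for small inode numbers and that maximum would overflow 32 bits. In outline (a sketch of the removed logic, not a copy of the new helper):

	/* Sketch: decide whether the inode32 allocator is needed. */
	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);	/* highest possible inode */

	if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
		mp->m_flags |= XFS_MOUNT_32BITINODES;	/* confine new inodes to 32 bits */
	else
		mp->m_flags &= ~XFS_MOUNT_32BITINODES;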
@@ -865,7 +847,7 @@ xfs_mountfs(
 
 	ASSERT(rip != NULL);
 
-	if (unlikely(!S_ISDIR(rip->i_d.di_mode))) {
+	if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) {
 		xfs_warn(mp, "corrupted root inode %llu: not a directory",
 			(unsigned long long)rip->i_ino);
 		xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -1284,7 +1266,7 @@ xfs_getsb(
 	}
 
 	xfs_buf_hold(bp);
-	ASSERT(XFS_BUF_ISDONE(bp));
+	ASSERT(bp->b_flags & XBF_DONE);
 	return bp;
 }
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b57098481c10..bac6b3435591 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -147,6 +147,17 @@ typedef struct xfs_mount {
 	 * to various other kinds of pain inflicted on the pNFS server.
 	 */
 	__uint32_t		m_generation;
+
+#ifdef DEBUG
+	/*
+	 * DEBUG mode instrumentation to test and/or trigger delayed allocation
+	 * block killing in the event of failed writes. When enabled, all
+	 * buffered writes are forced to fail. All delalloc blocks in the range
+	 * of the write (including pre-existing delalloc blocks!) are tossed as
+	 * part of the write failure error handling sequence.
+	 */
+	bool			m_fail_writes;
+#endif
 } xfs_mount_t;
 
 /*
@@ -166,9 +177,8 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_GRPID		(1ULL << 9)	/* group-ID assigned from directory */
 #define XFS_MOUNT_NORECOVERY	(1ULL << 10)	/* no recovery - dirty fs */
 #define XFS_MOUNT_DFLT_IOSIZE	(1ULL << 12)	/* set default i/o size */
-#define XFS_MOUNT_32BITINODES	(1ULL << 14)	/* do not create inodes above
-						 * 32 bits in size */
-#define XFS_MOUNT_SMALL_INUMS	(1ULL << 15)	/* users wants 32bit inodes */
+#define XFS_MOUNT_SMALL_INUMS	(1ULL << 14)	/* user wants 32bit inodes */
+#define XFS_MOUNT_32BITINODES	(1ULL << 15)	/* inode32 allocator active */
 #define XFS_MOUNT_NOUUID	(1ULL << 16)	/* ignore uuid during mount */
 #define XFS_MOUNT_BARRIER	(1ULL << 17)
 #define XFS_MOUNT_IKEEP		(1ULL << 18)	/* keep empty inode clusters*/
@@ -264,6 +274,20 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 	return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
 }
 
+#ifdef DEBUG
+static inline bool
+xfs_mp_fail_writes(struct xfs_mount *mp)
+{
+	return mp->m_fail_writes;
+}
+#else
+static inline bool
+xfs_mp_fail_writes(struct xfs_mount *mp)
+{
+	return 0;
+}
+#endif
+
 /*
  * Per-ag incore structure, copies of information in agf and agi, to improve the
  * performance of allocation group selection.
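m_fail_writes and xfs_mp_fail_writes() follow the usual kernel idiom for DEBUG-only fault injection: the state exists only under #ifdef DEBUG, and the release-build stub returns a constant so the compiler removes the test at every call site. A standalone rendering of the idiom (hypothetical names; compile with -DDEBUG to enable):

	#include <stdbool.h>

	struct mount_ctx {
	#ifdef DEBUG
		bool fail_writes;	/* force buffered writes to fail */
	#endif
	};

	#ifdef DEBUG
	static inline bool mount_fail_writes(struct mount_ctx *m)
	{
		return m->fail_writes;
	}
	#else
	static inline bool mount_fail_writes(struct mount_ctx *m)
	{
		return false;	/* constant-folds away in release builds */
	}
	#endif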
@@ -327,7 +351,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
 			 bool reserved);
 extern int	xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
 
-extern int	xfs_mount_log_sb(xfs_mount_t *);
 extern struct xfs_buf	*xfs_getsb(xfs_mount_t *, int);
 extern int	xfs_readsb(xfs_mount_t *, int);
 extern void	xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
new file mode 100644
index 000000000000..184c44effdd5
--- /dev/null
+++ b/fs/xfs/xfs_ondisk.h
@@ -0,0 +1,117 @@
1/*
2 * Copyright (c) 2016 Oracle.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_ONDISK_H
19#define __XFS_ONDISK_H
20
21#define XFS_CHECK_STRUCT_SIZE(structname, size) \
22 BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
23 #structname ") is wrong, expected " #size)
24
25static inline void __init
26xfs_check_ondisk_structs(void)
27{
28 /* ag/file structures */
29 XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4);
30 XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
31 XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
32 XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
33 XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336);
34 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
35 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
36 XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
37 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
38 XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
39 XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
40 XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136);
41 XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264);
42 XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
43 XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
44 XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);
45 XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
46 XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
47 XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
48 XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
49 XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4);
50
51 /* dir/attr trees */
52 XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
53 XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88);
54 XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56);
55 XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56);
56 XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64);
57 XFS_CHECK_STRUCT_SIZE(struct xfs_da3_node_hdr, 64);
58 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_blk_hdr, 48);
59 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_data_hdr, 64);
60 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free, 64);
61 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64);
62 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64);
63 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64);
64 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8);
65 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32);
66 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
67 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
68
69 /*
70 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
71 * 4 bytes anyway so it's not obviously a problem. Hence for the moment
 72 * we don't check this structure. This can be reinstated when the attr
 73 * definitions are updated to use C99 VLA definitions.
74 *
75 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
76 */
77
78 XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
79 XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8);
80 XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
81 XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
82 XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
83 XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
84 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
85 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
86 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6);
87 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
88 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
89 XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4);
90 XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8);
91 XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8);
92 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
93 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
94 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
95 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
96 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
97 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
98 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2);
99
100 /* log structures */
101 XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
102 XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28);
103 XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32);
104 XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28);
105 XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32);
106 XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12);
107 XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
108 XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
109 XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
110 XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
111 XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
112 XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56);
113 XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
114 XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
115}
116
117#endif /* __XFS_ONDISK_H */
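xfs_ondisk.h exists purely to pin on-disk structure sizes at compile time: if a compiler or architecture quietly changes a layout, the build breaks instead of the disk format. BUILD_BUG_ON_MSG is kernel-only; the same check can be sketched in userspace with C11 _Static_assert (the struct below is a made-up stand-in, not a real XFS on-disk structure):

/* Compile-time size check analogous to XFS_CHECK_STRUCT_SIZE. */
#include <stdint.h>

struct demo_ondisk {
	uint32_t magic;		/* 4 bytes */
	uint64_t blockno;	/* 8 bytes */
} __attribute__((packed));

#define CHECK_STRUCT_SIZE(structname, size) \
	_Static_assert(sizeof(structname) == (size), \
		       "sizeof(" #structname ") is wrong, expected " #size)

CHECK_STRUCT_SIZE(struct demo_ondisk, 12);

int main(void) { return 0; }

Changing the expected size to anything but 12 makes the translation unit fail to compile, which is exactly the behavior xfs_check_ondisk_structs() gets from BUILD_BUG_ON_MSG.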
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index 8147ac108820..93f74853961b 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,7 +1,7 @@
1#ifndef _XFS_PNFS_H 1#ifndef _XFS_PNFS_H
2#define _XFS_PNFS_H 1 2#define _XFS_PNFS_H 1
3 3
4#ifdef CONFIG_NFSD_PNFS 4#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
5int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); 5int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
6int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, 6int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
7 struct iomap *iomap, bool write, u32 *device_generation); 7 struct iomap *iomap, bool write, u32 *device_generation);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 532ab79d38fe..be125e1758c1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -560,6 +560,37 @@ xfs_qm_shrink_count(
560 return list_lru_shrink_count(&qi->qi_lru, sc); 560 return list_lru_shrink_count(&qi->qi_lru, sc);
561} 561}
562 562
563STATIC void
564xfs_qm_set_defquota(
565 xfs_mount_t *mp,
566 uint type,
567 xfs_quotainfo_t *qinf)
568{
569 xfs_dquot_t *dqp;
570 struct xfs_def_quota *defq;
571 int error;
572
573 error = xfs_qm_dqread(mp, 0, type, XFS_QMOPT_DOWARN, &dqp);
574
575 if (!error) {
576 xfs_disk_dquot_t *ddqp = &dqp->q_core;
577
578 defq = xfs_get_defquota(dqp, qinf);
579
580 /*
581 * Timers and warnings have been already set, let's just set the
582 * default limits for this quota type
583 */
584 defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
585 defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
586 defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
587 defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
588 defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
589 defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
590 xfs_qm_dqdestroy(dqp);
591 }
592}
593
563/* 594/*
564 * This initializes all the quota information that's kept in the 595 * This initializes all the quota information that's kept in the
565 * mount structure 596 * mount structure
@@ -606,19 +637,19 @@ xfs_qm_init_quotainfo(
606 * We try to get the limits from the superuser's limits fields. 637 * We try to get the limits from the superuser's limits fields.
607 * This is quite hacky, but it is standard quota practice. 638 * This is quite hacky, but it is standard quota practice.
608 * 639 *
609 * We look at the USR dquot with id == 0 first, but if user quotas
610 * are not enabled we goto the GRP dquot with id == 0.
611 * We don't really care to keep separate default limits for user
612 * and group quotas, at least not at this point.
613 *
614 * Since we may not have done a quotacheck by this point, just read 640 * Since we may not have done a quotacheck by this point, just read
615 * the dquot without attaching it to any hashtables or lists. 641 * the dquot without attaching it to any hashtables or lists.
642 *
 643 * Timers and warnings are globally set by the first timer found in
 644 * the user/group/proj quota types; otherwise a default value is
 645 * used. This should be split into separate fields per quota type.
616 */ 646 */
617 error = xfs_qm_dqread(mp, 0, 647 error = xfs_qm_dqread(mp, 0,
618 XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : 648 XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
619 (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : 649 (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
620 XFS_DQ_PROJ), 650 XFS_DQ_PROJ),
621 XFS_QMOPT_DOWARN, &dqp); 651 XFS_QMOPT_DOWARN, &dqp);
652
622 if (!error) { 653 if (!error) {
623 xfs_disk_dquot_t *ddqp = &dqp->q_core; 654 xfs_disk_dquot_t *ddqp = &dqp->q_core;
624 655
@@ -639,13 +670,6 @@ xfs_qm_init_quotainfo(
639 be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; 670 be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
640 qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? 671 qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
641 be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; 672 be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
642 qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
643 qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
644 qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
645 qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
646 qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
647 qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
648
649 xfs_qm_dqdestroy(dqp); 673 xfs_qm_dqdestroy(dqp);
650 } else { 674 } else {
651 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; 675 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -656,6 +680,13 @@ xfs_qm_init_quotainfo(
656 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; 680 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
657 } 681 }
658 682
683 if (XFS_IS_UQUOTA_RUNNING(mp))
684 xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);
685 if (XFS_IS_GQUOTA_RUNNING(mp))
686 xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf);
687 if (XFS_IS_PQUOTA_RUNNING(mp))
688 xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf);
689
659 qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; 690 qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
660 qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; 691 qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
661 qinf->qi_shrinker.seeks = DEFAULT_SEEKS; 692 qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
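One detail worth noting in xfs_qm_set_defquota() above: dquot limits are stored big-endian on disk, so each field passes through be64_to_cpu() before being cached in host order. A self-contained sketch of that conversion, with the byte swap open-coded since be64_to_cpu() is a kernel helper:

#include <stdint.h>
#include <stdio.h>

/* assemble a host-order value from 8 big-endian bytes */
static uint64_t be64_to_host(const unsigned char b[8])
{
	uint64_t v = 0;

	for (int i = 0; i < 8; i++)
		v = (v << 8) | b[i];
	return v;
}

int main(void)
{
	/* big-endian 0x0000000000000400 == 1024 blocks */
	unsigned char disk[8] = { 0, 0, 0, 0, 0, 0, 4, 0 };

	printf("bhardlimit = %llu\n",
	       (unsigned long long)be64_to_host(disk));
	return 0;
}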
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 996a04064894..2975a822e9f0 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -53,6 +53,15 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
53 */ 53 */
54#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 54#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
55 55
56struct xfs_def_quota {
57 xfs_qcnt_t bhardlimit; /* default data blk hard limit */
58 xfs_qcnt_t bsoftlimit; /* default data blk soft limit */
59 xfs_qcnt_t ihardlimit; /* default inode count hard limit */
60 xfs_qcnt_t isoftlimit; /* default inode count soft limit */
61 xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */
62 xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */
63};
64
56/* 65/*
57 * Various quota information for individual filesystems. 66 * Various quota information for individual filesystems.
58 * The mount structure keeps a pointer to this. 67 * The mount structure keeps a pointer to this.
@@ -76,12 +85,9 @@ typedef struct xfs_quotainfo {
76 struct mutex qi_quotaofflock;/* to serialize quotaoff */ 85 struct mutex qi_quotaofflock;/* to serialize quotaoff */
77 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ 86 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
78 uint qi_dqperchunk; /* # ondisk dqs in above chunk */ 87 uint qi_dqperchunk; /* # ondisk dqs in above chunk */
79 xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ 88 struct xfs_def_quota qi_usr_default;
80 xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ 89 struct xfs_def_quota qi_grp_default;
81 xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ 90 struct xfs_def_quota qi_prj_default;
82 xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
83 xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
84 xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
85 struct shrinker qi_shrinker; 91 struct shrinker qi_shrinker;
86} xfs_quotainfo_t; 92} xfs_quotainfo_t;
87 93
@@ -104,15 +110,15 @@ xfs_dquot_tree(
104} 110}
105 111
106static inline struct xfs_inode * 112static inline struct xfs_inode *
107xfs_dq_to_quota_inode(struct xfs_dquot *dqp) 113xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
108{ 114{
109 switch (dqp->dq_flags & XFS_DQ_ALLTYPES) { 115 switch (dq_flags & XFS_DQ_ALLTYPES) {
110 case XFS_DQ_USER: 116 case XFS_DQ_USER:
111 return dqp->q_mount->m_quotainfo->qi_uquotaip; 117 return mp->m_quotainfo->qi_uquotaip;
112 case XFS_DQ_GROUP: 118 case XFS_DQ_GROUP:
113 return dqp->q_mount->m_quotainfo->qi_gquotaip; 119 return mp->m_quotainfo->qi_gquotaip;
114 case XFS_DQ_PROJ: 120 case XFS_DQ_PROJ:
115 return dqp->q_mount->m_quotainfo->qi_pquotaip; 121 return mp->m_quotainfo->qi_pquotaip;
116 default: 122 default:
117 ASSERT(0); 123 ASSERT(0);
118 } 124 }
@@ -164,11 +170,27 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
164 170
165/* quota ops */ 171/* quota ops */
166extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); 172extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
167extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, 173extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *,
168 uint, struct qc_dqblk *); 174 uint, struct qc_dqblk *, uint);
169extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, 175extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
170 struct qc_dqblk *); 176 struct qc_dqblk *);
171extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); 177extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
172extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); 178extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
173 179
180static inline struct xfs_def_quota *
181xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi)
182{
183 struct xfs_def_quota *defq;
184
185 if (XFS_QM_ISUDQ(dqp))
186 defq = &qi->qi_usr_default;
187 else if (XFS_QM_ISGDQ(dqp))
188 defq = &qi->qi_grp_default;
189 else {
190 ASSERT(XFS_QM_ISPDQ(dqp));
191 defq = &qi->qi_prj_default;
192 }
193 return defq;
194}
195
174#endif /* __XFS_QM_H__ */ 196#endif /* __XFS_QM_H__ */
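xfs_get_defquota() is the lookup half of the split introduced here: instead of one shared set of qi_*limit fields, each quota type (user, group, project) now resolves to its own xfs_def_quota. A compact userspace sketch of that per-type dispatch, using simplified stand-in types rather than the kernel's:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

enum dq_type { DQ_USER, DQ_GROUP, DQ_PROJ };

struct def_quota { uint64_t bhardlimit, bsoftlimit; };

struct quotainfo {
	struct def_quota usr_default, grp_default, prj_default;
};

/* pick the default-limit block for a quota type */
static struct def_quota *get_defquota(struct quotainfo *qi, enum dq_type t)
{
	switch (t) {
	case DQ_USER:  return &qi->usr_default;
	case DQ_GROUP: return &qi->grp_default;
	default:
		assert(t == DQ_PROJ);
		return &qi->prj_default;
	}
}

int main(void)
{
	struct quotainfo qi = { .grp_default = { .bhardlimit = 100 } };

	printf("group bhard=%llu\n",
	       (unsigned long long)get_defquota(&qi, DQ_GROUP)->bhardlimit);
	return 0;
}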
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 3640c6e896af..f4d0e0a8f517 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -404,6 +404,7 @@ xfs_qm_scall_setqlim(
404 struct xfs_disk_dquot *ddq; 404 struct xfs_disk_dquot *ddq;
405 struct xfs_dquot *dqp; 405 struct xfs_dquot *dqp;
406 struct xfs_trans *tp; 406 struct xfs_trans *tp;
407 struct xfs_def_quota *defq;
407 int error; 408 int error;
408 xfs_qcnt_t hard, soft; 409 xfs_qcnt_t hard, soft;
409 410
@@ -431,6 +432,8 @@ xfs_qm_scall_setqlim(
431 ASSERT(error != -ENOENT); 432 ASSERT(error != -ENOENT);
432 goto out_unlock; 433 goto out_unlock;
433 } 434 }
435
436 defq = xfs_get_defquota(dqp, q);
434 xfs_dqunlock(dqp); 437 xfs_dqunlock(dqp);
435 438
436 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 439 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
@@ -458,8 +461,8 @@ xfs_qm_scall_setqlim(
458 ddq->d_blk_softlimit = cpu_to_be64(soft); 461 ddq->d_blk_softlimit = cpu_to_be64(soft);
459 xfs_dquot_set_prealloc_limits(dqp); 462 xfs_dquot_set_prealloc_limits(dqp);
460 if (id == 0) { 463 if (id == 0) {
461 q->qi_bhardlimit = hard; 464 defq->bhardlimit = hard;
462 q->qi_bsoftlimit = soft; 465 defq->bsoftlimit = soft;
463 } 466 }
464 } else { 467 } else {
465 xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); 468 xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
@@ -474,8 +477,8 @@ xfs_qm_scall_setqlim(
474 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 477 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
475 ddq->d_rtb_softlimit = cpu_to_be64(soft); 478 ddq->d_rtb_softlimit = cpu_to_be64(soft);
476 if (id == 0) { 479 if (id == 0) {
477 q->qi_rtbhardlimit = hard; 480 defq->rtbhardlimit = hard;
478 q->qi_rtbsoftlimit = soft; 481 defq->rtbsoftlimit = soft;
479 } 482 }
480 } else { 483 } else {
481 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); 484 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
@@ -491,8 +494,8 @@ xfs_qm_scall_setqlim(
491 ddq->d_ino_hardlimit = cpu_to_be64(hard); 494 ddq->d_ino_hardlimit = cpu_to_be64(hard);
492 ddq->d_ino_softlimit = cpu_to_be64(soft); 495 ddq->d_ino_softlimit = cpu_to_be64(soft);
493 if (id == 0) { 496 if (id == 0) {
494 q->qi_ihardlimit = hard; 497 defq->ihardlimit = hard;
495 q->qi_isoftlimit = soft; 498 defq->isoftlimit = soft;
496 } 499 }
497 } else { 500 } else {
498 xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); 501 xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
@@ -635,9 +638,10 @@ out:
635int 638int
636xfs_qm_scall_getquota( 639xfs_qm_scall_getquota(
637 struct xfs_mount *mp, 640 struct xfs_mount *mp,
638 xfs_dqid_t id, 641 xfs_dqid_t *id,
639 uint type, 642 uint type,
640 struct qc_dqblk *dst) 643 struct qc_dqblk *dst,
644 uint dqget_flags)
641{ 645{
642 struct xfs_dquot *dqp; 646 struct xfs_dquot *dqp;
643 int error; 647 int error;
@@ -647,7 +651,7 @@ xfs_qm_scall_getquota(
647 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't 651 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
648 * exist, we'll get ENOENT back. 652 * exist, we'll get ENOENT back.
649 */ 653 */
650 error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp); 654 error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp);
651 if (error) 655 if (error)
652 return error; 656 return error;
653 657
@@ -660,6 +664,9 @@ xfs_qm_scall_getquota(
660 goto out_put; 664 goto out_put;
661 } 665 }
662 666
667 /* Fill in the ID we actually read from disk */
668 *id = be32_to_cpu(dqp->q_core.d_id);
669
663 memset(dst, 0, sizeof(*dst)); 670 memset(dst, 0, sizeof(*dst));
664 dst->d_spc_hardlimit = 671 dst->d_spc_hardlimit =
665 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); 672 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
@@ -701,7 +708,7 @@ xfs_qm_scall_getquota(
701 if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || 708 if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
702 (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || 709 (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
703 (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && 710 (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
704 id != 0) { 711 *id != 0) {
705 if ((dst->d_space > dst->d_spc_softlimit) && 712 if ((dst->d_space > dst->d_spc_softlimit) &&
706 (dst->d_spc_softlimit > 0)) { 713 (dst->d_spc_softlimit > 0)) {
707 ASSERT(dst->d_spc_timer != 0); 714 ASSERT(dst->d_spc_timer != 0);
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7795e0d01382..f82d79a8c694 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -231,14 +231,45 @@ xfs_fs_get_dqblk(
231 struct qc_dqblk *qdq) 231 struct qc_dqblk *qdq)
232{ 232{
233 struct xfs_mount *mp = XFS_M(sb); 233 struct xfs_mount *mp = XFS_M(sb);
234 xfs_dqid_t id;
234 235
235 if (!XFS_IS_QUOTA_RUNNING(mp)) 236 if (!XFS_IS_QUOTA_RUNNING(mp))
236 return -ENOSYS; 237 return -ENOSYS;
237 if (!XFS_IS_QUOTA_ON(mp)) 238 if (!XFS_IS_QUOTA_ON(mp))
238 return -ESRCH; 239 return -ESRCH;
239 240
240 return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), 241 id = from_kqid(&init_user_ns, qid);
241 xfs_quota_type(qid.type), qdq); 242 return xfs_qm_scall_getquota(mp, &id,
243 xfs_quota_type(qid.type), qdq, 0);
244}
245
246/* Return quota info for active quota >= this qid */
247STATIC int
248xfs_fs_get_nextdqblk(
249 struct super_block *sb,
250 struct kqid *qid,
251 struct qc_dqblk *qdq)
252{
253 int ret;
254 struct xfs_mount *mp = XFS_M(sb);
255 xfs_dqid_t id;
256
257 if (!XFS_IS_QUOTA_RUNNING(mp))
258 return -ENOSYS;
259 if (!XFS_IS_QUOTA_ON(mp))
260 return -ESRCH;
261
262 id = from_kqid(&init_user_ns, *qid);
263 ret = xfs_qm_scall_getquota(mp, &id,
264 xfs_quota_type(qid->type), qdq,
265 XFS_QMOPT_DQNEXT);
266 if (ret)
267 return ret;
268
269 /* ID may be different, so convert back what we got */
270 *qid = make_kqid(current_user_ns(), qid->type, id);
271 return 0;
272
242} 273}
243 274
244STATIC int 275STATIC int
@@ -267,5 +298,6 @@ const struct quotactl_ops xfs_quotactl_operations = {
267 .quota_disable = xfs_quota_disable, 298 .quota_disable = xfs_quota_disable,
268 .rm_xquota = xfs_fs_rm_xquota, 299 .rm_xquota = xfs_fs_rm_xquota,
269 .get_dqblk = xfs_fs_get_dqblk, 300 .get_dqblk = xfs_fs_get_dqblk,
301 .get_nextdqblk = xfs_fs_get_nextdqblk,
270 .set_dqblk = xfs_fs_set_dqblk, 302 .set_dqblk = xfs_fs_set_dqblk,
271}; 303};
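The new ->get_nextdqblk() hook implements "return the first active quota with ID >= the one passed in", reporting the ID it actually found so the caller can resume the scan past it; that is why xfs_qm_scall_getquota() now takes the ID by pointer. A toy iteration loop over that contract, with a sorted array standing in for the on-disk dquot tree (names here are illustrative, not the kernel API):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* active quota IDs, sorted ascending */
static const uint32_t active[] = { 0, 4, 9, 1000 };
#define NACTIVE (sizeof(active) / sizeof(active[0]))

/* set *id to the first active quota >= *id; -1 means no more */
static int get_next(uint32_t *id)
{
	for (size_t i = 0; i < NACTIVE; i++) {
		if (active[i] >= *id) {
			*id = active[i];
			return 0;
		}
	}
	return -1;		/* -ENOENT in the kernel */
}

int main(void)
{
	uint32_t id = 0;

	while (get_next(&id) == 0) {
		printf("quota id %u\n", id);
		if (id == UINT32_MAX)
			break;
		id++;		/* resume past the one we just saw */
	}
	return 0;
}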
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index be02a68b2fe2..abf44435d04a 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1272,7 +1272,7 @@ xfs_rtpick_extent(
1272 1272
1273 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); 1273 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
1274 1274
1275 seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime; 1275 seqp = (__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
1276 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { 1276 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
1277 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 1277 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
1278 *seqp = 0; 1278 *seqp = 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 59c9b7bd958d..d760934109b5 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -45,6 +45,7 @@
45#include "xfs_filestream.h" 45#include "xfs_filestream.h"
46#include "xfs_quota.h" 46#include "xfs_quota.h"
47#include "xfs_sysfs.h" 47#include "xfs_sysfs.h"
48#include "xfs_ondisk.h"
48 49
49#include <linux/namei.h> 50#include <linux/namei.h>
50#include <linux/init.h> 51#include <linux/init.h>
@@ -65,83 +66,85 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */
65static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ 66static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
66#endif 67#endif
67 68
68#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
69#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
70#define MNTOPT_LOGDEV "logdev" /* log device */
71#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */
72#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */
73#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */
74#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */
75#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */
76#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */
77#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */
78#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */
79#define MNTOPT_MTPT "mtpt" /* filesystem mount point */
80#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */
81#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */
82#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */
83#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */
84#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */
85#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */
86#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
87 * unwritten extent conversion */
88#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
89#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
90#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to
91 * XFS_MAXINUMBER_32 */
92#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
93#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
94#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */
95#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes
96 * in stat(). */
97#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */
98#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */
99#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */
100#define MNTOPT_QUOTA "quota" /* disk quotas (user) */
101#define MNTOPT_NOQUOTA "noquota" /* no quotas */
102#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */
103#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */
104#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */
105#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */
106#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */
107#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */
108#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
109#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
110#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
111#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
112#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
113#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
114
115#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */
116
117/* 69/*
118 * Table driven mount option parser. 70 * Table driven mount option parser.
119 *
120 * Currently only used for remount, but it will be used for mount
121 * in the future, too.
122 */ 71 */
123enum { 72enum {
124 Opt_barrier, 73 Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize,
125 Opt_nobarrier, 74 Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
126 Opt_inode64, 75 Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
127 Opt_inode32, 76 Opt_allocsize, Opt_norecovery, Opt_barrier, Opt_nobarrier,
128 Opt_err 77 Opt_inode64, Opt_inode32, Opt_ikeep, Opt_noikeep,
78 Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, Opt_filestreams,
79 Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota,
80 Opt_uquota, Opt_gquota, Opt_pquota,
81 Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
82 Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
129}; 83};
130 84
131static const match_table_t tokens = { 85static const match_table_t tokens = {
132 {Opt_barrier, "barrier"}, 86 {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */
133 {Opt_nobarrier, "nobarrier"}, 87 {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */
134 {Opt_inode64, "inode64"}, 88 {Opt_logdev, "logdev=%s"}, /* log device */
135 {Opt_inode32, "inode32"}, 89 {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */
136 {Opt_err, NULL} 90 {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */
91 {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */
92 {Opt_noalign, "noalign"}, /* turn off stripe alignment */
93 {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */
94 {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */
95 {Opt_swidth, "swidth=%u"}, /* data volume stripe width */
96 {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */
97 {Opt_mtpt, "mtpt"}, /* filesystem mount point */
98 {Opt_grpid, "grpid"}, /* group-ID from parent directory */
99 {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */
100 {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */
101 {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */
102 {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */
103 {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */
104 {Opt_barrier, "barrier"}, /* use writer barriers for log write and
105 * unwritten extent conversion */
106 {Opt_nobarrier, "nobarrier"}, /* .. disable */
107 {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */
108 {Opt_inode32, "inode32"}, /* inode allocation limited to
109 * XFS_MAXINUMBER_32 */
110 {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */
111 {Opt_noikeep, "noikeep"}, /* free empty inode clusters */
112 {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */
113 {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes
114 * in stat(). */
115 {Opt_attr2, "attr2"}, /* do use attr2 attribute format */
116 {Opt_noattr2, "noattr2"}, /* do not use attr2 attribute format */
117 {Opt_filestreams,"filestreams"},/* use filestreams allocator */
118 {Opt_quota, "quota"}, /* disk quotas (user) */
119 {Opt_noquota, "noquota"}, /* no quotas */
120 {Opt_usrquota, "usrquota"}, /* user quota enabled */
121 {Opt_grpquota, "grpquota"}, /* group quota enabled */
122 {Opt_prjquota, "prjquota"}, /* project quota enabled */
123 {Opt_uquota, "uquota"}, /* user quota (IRIX variant) */
124 {Opt_gquota, "gquota"}, /* group quota (IRIX variant) */
125 {Opt_pquota, "pquota"}, /* project quota (IRIX variant) */
126 {Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */
127 {Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */
128 {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
129 {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */
130 {Opt_discard, "discard"}, /* Discard unused blocks */
131 {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */
132
133 {Opt_dax, "dax"}, /* Enable direct access to bdev pages */
134 {Opt_err, NULL},
137}; 135};
138 136
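The conversion in this hunk replaces dozens of MNTOPT_* strcmp chains with the kernel's table-driven match_token() parser: a pattern such as "logbufs=%u" both identifies the option and captures its argument. A userspace sketch of the same shape, with a plain prefix match standing in for the kernel's match_token()/substring_t machinery:

#define _DEFAULT_SOURCE		/* for strsep() in glibc */
#include <stdio.h>
#include <string.h>

enum { Opt_logbufs, Opt_nouuid, Opt_err };

static const struct { int token; const char *pattern; } tokens[] = {
	{ Opt_logbufs, "logbufs=" },	/* takes a numeric argument */
	{ Opt_nouuid,  "nouuid"   },	/* plain flag */
	{ Opt_err,     NULL       },
};

static int match(const char *p, const char **arg)
{
	for (int i = 0; tokens[i].pattern; i++) {
		size_t n = strlen(tokens[i].pattern);

		if (!strncmp(p, tokens[i].pattern, n)) {
			*arg = p + n;
			return tokens[i].token;
		}
	}
	return Opt_err;
}

int main(void)
{
	char opts[] = "logbufs=8,nouuid", *p, *rest = opts;
	const char *arg;

	while ((p = strsep(&rest, ",")) != NULL) {
		switch (match(p, &arg)) {
		case Opt_logbufs:
			printf("logbufs -> %s\n", arg);
			break;
		case Opt_nouuid:
			printf("nouuid set\n");
			break;
		default:
			printf("unknown mount option [%s]\n", p);
			return 1;
		}
	}
	return 0;
}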
139 137
140STATIC int 138STATIC int
141suffix_kstrtoint(char *s, unsigned int base, int *res) 139suffix_kstrtoint(const substring_t *s, unsigned int base, int *res)
142{ 140{
143 int last, shift_left_factor = 0, _res; 141 int last, shift_left_factor = 0, _res;
144 char *value = s; 142 char *value;
143 int ret = 0;
144
145 value = match_strdup(s);
146 if (!value)
147 return -ENOMEM;
145 148
146 last = strlen(value) - 1; 149 last = strlen(value) - 1;
147 if (value[last] == 'K' || value[last] == 'k') { 150 if (value[last] == 'K' || value[last] == 'k') {
@@ -157,10 +160,11 @@ suffix_kstrtoint(char *s, unsigned int base, int *res)
157 value[last] = '\0'; 160 value[last] = '\0';
158 } 161 }
159 162
160 if (kstrtoint(s, base, &_res)) 163 if (kstrtoint(value, base, &_res))
161 return -EINVAL; 164 ret = -EINVAL;
165 kfree(value);
162 *res = _res << shift_left_factor; 166 *res = _res << shift_left_factor;
163 return 0; 167 return ret;
164} 168}
165 169
166/* 170/*
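suffix_kstrtoint() now receives a substring_t, so it must copy the token with match_strdup() before it can null-terminate the suffix in place; that is why the rewrite gains the allocation, the kfree(), and the ret plumbing. The core idea, a K/M/G suffix folded into a left shift, in a self-contained userspace form:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* parse an int with an optional K/M/G binary suffix */
static int suffix_strtoint(const char *s, int *res)
{
	char buf[32];
	size_t last;
	int shift = 0;

	if (!*s || strlen(s) >= sizeof(buf))
		return -1;
	strcpy(buf, s);
	last = strlen(buf) - 1;
	switch (buf[last]) {
	case 'k': case 'K': shift = 10; buf[last] = '\0'; break;
	case 'm': case 'M': shift = 20; buf[last] = '\0'; break;
	case 'g': case 'G': shift = 30; buf[last] = '\0'; break;
	}
	*res = atoi(buf) << shift;
	return 0;
}

int main(void)
{
	int v;

	if (suffix_strtoint("32k", &v) == 0)
		printf("32k -> %d\n", v);	/* prints 32768 */
	return 0;
}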
@@ -169,14 +173,19 @@ suffix_kstrtoint(char *s, unsigned int base, int *res)
169 * 173 *
170 * Note that this function leaks the various device name allocations on 174 * Note that this function leaks the various device name allocations on
171 * failure. The caller takes care of them. 175 * failure. The caller takes care of them.
176 *
177 * *sb is const because this is also used to test options on the remount
178 * path, and we don't want this to have any side effects at remount time.
179 * Today this function does not change *sb, but just to future-proof...
172 */ 180 */
173STATIC int 181STATIC int
174xfs_parseargs( 182xfs_parseargs(
175 struct xfs_mount *mp, 183 struct xfs_mount *mp,
176 char *options) 184 char *options)
177{ 185{
178 struct super_block *sb = mp->m_super; 186 const struct super_block *sb = mp->m_super;
179 char *this_char, *value; 187 char *p;
188 substring_t args[MAX_OPT_ARGS];
180 int dsunit = 0; 189 int dsunit = 0;
181 int dswidth = 0; 190 int dswidth = 0;
182 int iosize = 0; 191 int iosize = 0;
@@ -217,152 +226,152 @@ xfs_parseargs(
217 if (!options) 226 if (!options)
218 goto done; 227 goto done;
219 228
220 while ((this_char = strsep(&options, ",")) != NULL) { 229 while ((p = strsep(&options, ",")) != NULL) {
221 if (!*this_char) 230 int token;
231
232 if (!*p)
222 continue; 233 continue;
223 if ((value = strchr(this_char, '=')) != NULL)
224 *value++ = 0;
225 234
226 if (!strcmp(this_char, MNTOPT_LOGBUFS)) { 235 token = match_token(p, tokens, args);
227 if (!value || !*value) { 236 switch (token) {
228 xfs_warn(mp, "%s option requires an argument", 237 case Opt_logbufs:
229 this_char); 238 if (match_int(args, &mp->m_logbufs))
230 return -EINVAL;
231 }
232 if (kstrtoint(value, 10, &mp->m_logbufs))
233 return -EINVAL;
234 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
235 if (!value || !*value) {
236 xfs_warn(mp, "%s option requires an argument",
237 this_char);
238 return -EINVAL;
239 }
240 if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
241 return -EINVAL; 239 return -EINVAL;
242 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 240 break;
243 if (!value || !*value) { 241 case Opt_logbsize:
244 xfs_warn(mp, "%s option requires an argument", 242 if (suffix_kstrtoint(args, 10, &mp->m_logbsize))
245 this_char);
246 return -EINVAL; 243 return -EINVAL;
247 } 244 break;
248 mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); 245 case Opt_logdev:
246 mp->m_logname = match_strdup(args);
249 if (!mp->m_logname) 247 if (!mp->m_logname)
250 return -ENOMEM; 248 return -ENOMEM;
251 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 249 break;
252 xfs_warn(mp, "%s option not allowed on this system", 250 case Opt_mtpt:
253 this_char); 251 xfs_warn(mp, "%s option not allowed on this system", p);
254 return -EINVAL; 252 return -EINVAL;
255 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 253 case Opt_rtdev:
256 if (!value || !*value) { 254 mp->m_rtname = match_strdup(args);
257 xfs_warn(mp, "%s option requires an argument",
258 this_char);
259 return -EINVAL;
260 }
261 mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
262 if (!mp->m_rtname) 255 if (!mp->m_rtname)
263 return -ENOMEM; 256 return -ENOMEM;
264 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) || 257 break;
265 !strcmp(this_char, MNTOPT_BIOSIZE)) { 258 case Opt_allocsize:
266 if (!value || !*value) { 259 case Opt_biosize:
267 xfs_warn(mp, "%s option requires an argument", 260 if (suffix_kstrtoint(args, 10, &iosize))
268 this_char);
269 return -EINVAL;
270 }
271 if (suffix_kstrtoint(value, 10, &iosize))
272 return -EINVAL; 261 return -EINVAL;
273 iosizelog = ffs(iosize) - 1; 262 iosizelog = ffs(iosize) - 1;
274 } else if (!strcmp(this_char, MNTOPT_GRPID) || 263 break;
275 !strcmp(this_char, MNTOPT_BSDGROUPS)) { 264 case Opt_grpid:
265 case Opt_bsdgroups:
276 mp->m_flags |= XFS_MOUNT_GRPID; 266 mp->m_flags |= XFS_MOUNT_GRPID;
277 } else if (!strcmp(this_char, MNTOPT_NOGRPID) || 267 break;
278 !strcmp(this_char, MNTOPT_SYSVGROUPS)) { 268 case Opt_nogrpid:
269 case Opt_sysvgroups:
279 mp->m_flags &= ~XFS_MOUNT_GRPID; 270 mp->m_flags &= ~XFS_MOUNT_GRPID;
280 } else if (!strcmp(this_char, MNTOPT_WSYNC)) { 271 break;
272 case Opt_wsync:
281 mp->m_flags |= XFS_MOUNT_WSYNC; 273 mp->m_flags |= XFS_MOUNT_WSYNC;
282 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { 274 break;
275 case Opt_norecovery:
283 mp->m_flags |= XFS_MOUNT_NORECOVERY; 276 mp->m_flags |= XFS_MOUNT_NORECOVERY;
284 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { 277 break;
278 case Opt_noalign:
285 mp->m_flags |= XFS_MOUNT_NOALIGN; 279 mp->m_flags |= XFS_MOUNT_NOALIGN;
286 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { 280 break;
281 case Opt_swalloc:
287 mp->m_flags |= XFS_MOUNT_SWALLOC; 282 mp->m_flags |= XFS_MOUNT_SWALLOC;
288 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 283 break;
289 if (!value || !*value) { 284 case Opt_sunit:
290 xfs_warn(mp, "%s option requires an argument", 285 if (match_int(args, &dsunit))
291 this_char);
292 return -EINVAL;
293 }
294 if (kstrtoint(value, 10, &dsunit))
295 return -EINVAL;
296 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
297 if (!value || !*value) {
298 xfs_warn(mp, "%s option requires an argument",
299 this_char);
300 return -EINVAL; 286 return -EINVAL;
301 } 287 break;
302 if (kstrtoint(value, 10, &dswidth)) 288 case Opt_swidth:
289 if (match_int(args, &dswidth))
303 return -EINVAL; 290 return -EINVAL;
304 } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { 291 break;
292 case Opt_inode32:
305 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 293 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
306 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 294 break;
295 case Opt_inode64:
307 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 296 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
308 } else if (!strcmp(this_char, MNTOPT_NOUUID)) { 297 break;
298 case Opt_nouuid:
309 mp->m_flags |= XFS_MOUNT_NOUUID; 299 mp->m_flags |= XFS_MOUNT_NOUUID;
310 } else if (!strcmp(this_char, MNTOPT_BARRIER)) { 300 break;
301 case Opt_barrier:
311 mp->m_flags |= XFS_MOUNT_BARRIER; 302 mp->m_flags |= XFS_MOUNT_BARRIER;
312 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { 303 break;
304 case Opt_nobarrier:
313 mp->m_flags &= ~XFS_MOUNT_BARRIER; 305 mp->m_flags &= ~XFS_MOUNT_BARRIER;
314 } else if (!strcmp(this_char, MNTOPT_IKEEP)) { 306 break;
307 case Opt_ikeep:
315 mp->m_flags |= XFS_MOUNT_IKEEP; 308 mp->m_flags |= XFS_MOUNT_IKEEP;
316 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { 309 break;
310 case Opt_noikeep:
317 mp->m_flags &= ~XFS_MOUNT_IKEEP; 311 mp->m_flags &= ~XFS_MOUNT_IKEEP;
318 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { 312 break;
313 case Opt_largeio:
319 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; 314 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
320 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { 315 break;
316 case Opt_nolargeio:
321 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 317 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
322 } else if (!strcmp(this_char, MNTOPT_ATTR2)) { 318 break;
319 case Opt_attr2:
323 mp->m_flags |= XFS_MOUNT_ATTR2; 320 mp->m_flags |= XFS_MOUNT_ATTR2;
324 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { 321 break;
322 case Opt_noattr2:
325 mp->m_flags &= ~XFS_MOUNT_ATTR2; 323 mp->m_flags &= ~XFS_MOUNT_ATTR2;
326 mp->m_flags |= XFS_MOUNT_NOATTR2; 324 mp->m_flags |= XFS_MOUNT_NOATTR2;
327 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { 325 break;
326 case Opt_filestreams:
328 mp->m_flags |= XFS_MOUNT_FILESTREAMS; 327 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
329 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { 328 break;
329 case Opt_noquota:
330 mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; 330 mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
331 mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; 331 mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
332 mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; 332 mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
333 } else if (!strcmp(this_char, MNTOPT_QUOTA) || 333 break;
334 !strcmp(this_char, MNTOPT_UQUOTA) || 334 case Opt_quota:
335 !strcmp(this_char, MNTOPT_USRQUOTA)) { 335 case Opt_uquota:
336 case Opt_usrquota:
336 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | 337 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
337 XFS_UQUOTA_ENFD); 338 XFS_UQUOTA_ENFD);
338 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || 339 break;
339 !strcmp(this_char, MNTOPT_UQUOTANOENF)) { 340 case Opt_qnoenforce:
341 case Opt_uqnoenforce:
340 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); 342 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
341 mp->m_qflags &= ~XFS_UQUOTA_ENFD; 343 mp->m_qflags &= ~XFS_UQUOTA_ENFD;
342 } else if (!strcmp(this_char, MNTOPT_PQUOTA) || 344 break;
343 !strcmp(this_char, MNTOPT_PRJQUOTA)) { 345 case Opt_pquota:
346 case Opt_prjquota:
344 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | 347 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
345 XFS_PQUOTA_ENFD); 348 XFS_PQUOTA_ENFD);
346 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { 349 break;
350 case Opt_pqnoenforce:
347 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); 351 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
348 mp->m_qflags &= ~XFS_PQUOTA_ENFD; 352 mp->m_qflags &= ~XFS_PQUOTA_ENFD;
349 } else if (!strcmp(this_char, MNTOPT_GQUOTA) || 353 case Opt_gquota:
350 !strcmp(this_char, MNTOPT_GRPQUOTA)) { 354 case Opt_grpquota:
351 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | 355 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
352 XFS_GQUOTA_ENFD); 356 XFS_GQUOTA_ENFD);
353 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 357 break;
358 case Opt_gqnoenforce:
354 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 359 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
355 mp->m_qflags &= ~XFS_GQUOTA_ENFD; 360 mp->m_qflags &= ~XFS_GQUOTA_ENFD;
356 } else if (!strcmp(this_char, MNTOPT_DISCARD)) { 361 break;
362 case Opt_discard:
357 mp->m_flags |= XFS_MOUNT_DISCARD; 363 mp->m_flags |= XFS_MOUNT_DISCARD;
358 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { 364 break;
365 case Opt_nodiscard:
359 mp->m_flags &= ~XFS_MOUNT_DISCARD; 366 mp->m_flags &= ~XFS_MOUNT_DISCARD;
367 break;
360#ifdef CONFIG_FS_DAX 368#ifdef CONFIG_FS_DAX
361 } else if (!strcmp(this_char, MNTOPT_DAX)) { 369 case Opt_dax:
362 mp->m_flags |= XFS_MOUNT_DAX; 370 mp->m_flags |= XFS_MOUNT_DAX;
371 break;
363#endif 372#endif
364 } else { 373 default:
365 xfs_warn(mp, "unknown mount option [%s].", this_char); 374 xfs_warn(mp, "unknown mount option [%s].", p);
366 return -EINVAL; 375 return -EINVAL;
367 } 376 }
368 } 377 }
@@ -461,25 +470,25 @@ xfs_showargs(
461{ 470{
462 static struct proc_xfs_info xfs_info_set[] = { 471 static struct proc_xfs_info xfs_info_set[] = {
463 /* the few simple ones we can get from the mount struct */ 472 /* the few simple ones we can get from the mount struct */
464 { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, 473 { XFS_MOUNT_IKEEP, ",ikeep" },
465 { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, 474 { XFS_MOUNT_WSYNC, ",wsync" },
466 { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, 475 { XFS_MOUNT_NOALIGN, ",noalign" },
467 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, 476 { XFS_MOUNT_SWALLOC, ",swalloc" },
468 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, 477 { XFS_MOUNT_NOUUID, ",nouuid" },
469 { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, 478 { XFS_MOUNT_NORECOVERY, ",norecovery" },
470 { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, 479 { XFS_MOUNT_ATTR2, ",attr2" },
471 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 480 { XFS_MOUNT_FILESTREAMS, ",filestreams" },
472 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 481 { XFS_MOUNT_GRPID, ",grpid" },
473 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, 482 { XFS_MOUNT_DISCARD, ",discard" },
474 { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, 483 { XFS_MOUNT_SMALL_INUMS, ",inode32" },
475 { XFS_MOUNT_DAX, "," MNTOPT_DAX }, 484 { XFS_MOUNT_DAX, ",dax" },
476 { 0, NULL } 485 { 0, NULL }
477 }; 486 };
478 static struct proc_xfs_info xfs_info_unset[] = { 487 static struct proc_xfs_info xfs_info_unset[] = {
479 /* the few simple ones we can get from the mount struct */ 488 /* the few simple ones we can get from the mount struct */
480 { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO }, 489 { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" },
481 { XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER }, 490 { XFS_MOUNT_BARRIER, ",nobarrier" },
482 { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE }, 491 { XFS_MOUNT_SMALL_INUMS, ",inode64" },
483 { 0, NULL } 492 { 0, NULL }
484 }; 493 };
485 struct proc_xfs_info *xfs_infop; 494 struct proc_xfs_info *xfs_infop;
@@ -494,46 +503,46 @@ xfs_showargs(
494 } 503 }
495 504
496 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) 505 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
497 seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", 506 seq_printf(m, ",allocsize=%dk",
498 (int)(1 << mp->m_writeio_log) >> 10); 507 (int)(1 << mp->m_writeio_log) >> 10);
499 508
500 if (mp->m_logbufs > 0) 509 if (mp->m_logbufs > 0)
501 seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); 510 seq_printf(m, ",logbufs=%d", mp->m_logbufs);
502 if (mp->m_logbsize > 0) 511 if (mp->m_logbsize > 0)
503 seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); 512 seq_printf(m, ",logbsize=%dk", mp->m_logbsize >> 10);
504 513
505 if (mp->m_logname) 514 if (mp->m_logname)
506 seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); 515 seq_show_option(m, "logdev", mp->m_logname);
507 if (mp->m_rtname) 516 if (mp->m_rtname)
508 seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); 517 seq_show_option(m, "rtdev", mp->m_rtname);
509 518
510 if (mp->m_dalign > 0) 519 if (mp->m_dalign > 0)
511 seq_printf(m, "," MNTOPT_SUNIT "=%d", 520 seq_printf(m, ",sunit=%d",
512 (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); 521 (int)XFS_FSB_TO_BB(mp, mp->m_dalign));
513 if (mp->m_swidth > 0) 522 if (mp->m_swidth > 0)
514 seq_printf(m, "," MNTOPT_SWIDTH "=%d", 523 seq_printf(m, ",swidth=%d",
515 (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); 524 (int)XFS_FSB_TO_BB(mp, mp->m_swidth));
516 525
517 if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) 526 if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
518 seq_puts(m, "," MNTOPT_USRQUOTA); 527 seq_puts(m, ",usrquota");
519 else if (mp->m_qflags & XFS_UQUOTA_ACCT) 528 else if (mp->m_qflags & XFS_UQUOTA_ACCT)
520 seq_puts(m, "," MNTOPT_UQUOTANOENF); 529 seq_puts(m, ",uqnoenforce");
521 530
522 if (mp->m_qflags & XFS_PQUOTA_ACCT) { 531 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
523 if (mp->m_qflags & XFS_PQUOTA_ENFD) 532 if (mp->m_qflags & XFS_PQUOTA_ENFD)
524 seq_puts(m, "," MNTOPT_PRJQUOTA); 533 seq_puts(m, ",prjquota");
525 else 534 else
526 seq_puts(m, "," MNTOPT_PQUOTANOENF); 535 seq_puts(m, ",pqnoenforce");
527 } 536 }
528 if (mp->m_qflags & XFS_GQUOTA_ACCT) { 537 if (mp->m_qflags & XFS_GQUOTA_ACCT) {
529 if (mp->m_qflags & XFS_GQUOTA_ENFD) 538 if (mp->m_qflags & XFS_GQUOTA_ENFD)
530 seq_puts(m, "," MNTOPT_GRPQUOTA); 539 seq_puts(m, ",grpquota");
531 else 540 else
532 seq_puts(m, "," MNTOPT_GQUOTANOENF); 541 seq_puts(m, ",gqnoenforce");
533 } 542 }
534 543
535 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) 544 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
536 seq_puts(m, "," MNTOPT_NOQUOTA); 545 seq_puts(m, ",noquota");
537 546
538 return 0; 547 return 0;
539} 548}
@@ -572,23 +581,35 @@ xfs_max_file_offset(
572} 581}
573 582
574/* 583/*
575 * xfs_set_inode32() and xfs_set_inode64() are passed an agcount 584 * Set parameters for inode allocation heuristics, taking into account
576 * because in the growfs case, mp->m_sb.sb_agcount is not updated 585 * filesystem size and inode32/inode64 mount options; i.e. specifically
577 * yet to the potentially higher ag count. 586 * whether or not XFS_MOUNT_SMALL_INUMS is set.
587 *
588 * Inode allocation patterns are altered only if inode32 is requested
589 * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large.
590 * If altered, XFS_MOUNT_32BITINODES is set as well.
591 *
592 * An agcount independent of that in the mount structure is provided
593 * because in the growfs case, mp->m_sb.sb_agcount is not yet updated
594 * to the potentially higher ag count.
595 *
596 * Returns the maximum AG index which may contain inodes.
578 */ 597 */
579xfs_agnumber_t 598xfs_agnumber_t
580xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount) 599xfs_set_inode_alloc(
600 struct xfs_mount *mp,
601 xfs_agnumber_t agcount)
581{ 602{
582 xfs_agnumber_t index = 0; 603 xfs_agnumber_t index;
583 xfs_agnumber_t maxagi = 0; 604 xfs_agnumber_t maxagi = 0;
584 xfs_sb_t *sbp = &mp->m_sb; 605 xfs_sb_t *sbp = &mp->m_sb;
585 xfs_agnumber_t max_metadata; 606 xfs_agnumber_t max_metadata;
586 xfs_agino_t agino; 607 xfs_agino_t agino;
587 xfs_ino_t ino; 608 xfs_ino_t ino;
588 xfs_perag_t *pag;
589 609
590 /* Calculate how much should be reserved for inodes to meet 610 /*
591 * the max inode percentage. 611 * Calculate how much should be reserved for inodes to meet
612 * the max inode percentage. Used only for inode32.
592 */ 613 */
593 if (mp->m_maxicount) { 614 if (mp->m_maxicount) {
594 __uint64_t icount; 615 __uint64_t icount;
@@ -602,54 +623,48 @@ xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
602 max_metadata = agcount; 623 max_metadata = agcount;
603 } 624 }
604 625
626 /* Get the last possible inode in the filesystem */
605 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); 627 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
628 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
629
630 /*
631 * If user asked for no more than 32-bit inodes, and the fs is
632 * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter
633 * the allocator to accommodate the request.
634 */
635 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
636 mp->m_flags |= XFS_MOUNT_32BITINODES;
637 else
638 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
606 639
607 for (index = 0; index < agcount; index++) { 640 for (index = 0; index < agcount; index++) {
608 ino = XFS_AGINO_TO_INO(mp, index, agino); 641 struct xfs_perag *pag;
609 642
610 if (ino > XFS_MAXINUMBER_32) { 643 ino = XFS_AGINO_TO_INO(mp, index, agino);
611 pag = xfs_perag_get(mp, index);
612 pag->pagi_inodeok = 0;
613 pag->pagf_metadata = 0;
614 xfs_perag_put(pag);
615 continue;
616 }
617 644
618 pag = xfs_perag_get(mp, index); 645 pag = xfs_perag_get(mp, index);
619 pag->pagi_inodeok = 1;
620 maxagi++;
621 if (index < max_metadata)
622 pag->pagf_metadata = 1;
623 xfs_perag_put(pag);
624 }
625 mp->m_flags |= (XFS_MOUNT_32BITINODES |
626 XFS_MOUNT_SMALL_INUMS);
627 646
628 return maxagi; 647 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
629} 648 if (ino > XFS_MAXINUMBER_32) {
630 649 pag->pagi_inodeok = 0;
631xfs_agnumber_t 650 pag->pagf_metadata = 0;
632xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount) 651 } else {
633{ 652 pag->pagi_inodeok = 1;
634 xfs_agnumber_t index = 0; 653 maxagi++;
635 654 if (index < max_metadata)
636 for (index = 0; index < agcount; index++) { 655 pag->pagf_metadata = 1;
637 struct xfs_perag *pag; 656 else
657 pag->pagf_metadata = 0;
658 }
659 } else {
660 pag->pagi_inodeok = 1;
661 pag->pagf_metadata = 0;
662 }
638 663
639 pag = xfs_perag_get(mp, index);
640 pag->pagi_inodeok = 1;
641 pag->pagf_metadata = 0;
642 xfs_perag_put(pag); 664 xfs_perag_put(pag);
643 } 665 }
644 666
645 /* There is no need for lock protection on m_flags, 667 return (mp->m_flags & XFS_MOUNT_32BITINODES) ? maxagi : agcount;
646 * the rw_semaphore of the VFS superblock is locked
647 * during mount/umount/remount operations, so this is
648 * enough to avoid concurency on the m_flags field
649 */
650 mp->m_flags &= ~(XFS_MOUNT_32BITINODES |
651 XFS_MOUNT_SMALL_INUMS);
652 return index;
653} 668}
654 669
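The merged xfs_set_inode_alloc() makes the inode32 decision explicit: the allocator is only restricted when inode32 was requested (XFS_MOUNT_SMALL_INUMS) AND the filesystem's highest possible inode number does not fit in 32 bits, and only then is XFS_MOUNT_32BITINODES set. A sketch of that predicate with simplified geometry (real XFS packs AG/block/offset bits via XFS_AGINO_TO_INO and friends):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAXINUMBER_32 0xffffffffULL

/* restrict allocation only if asked to AND the fs is big enough */
static bool need_inode32(bool small_inums_requested,
			 uint64_t highest_possible_ino)
{
	return small_inums_requested && highest_possible_ino > MAXINUMBER_32;
}

int main(void)
{
	/* hypothetical geometry: last inode lands above 2^32 */
	uint64_t last_ino = 1ULL << 40;

	printf("restrict allocator: %d\n", need_inode32(true, last_ino));
	printf("restrict allocator: %d\n", need_inode32(false, last_ino));
	return 0;
}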
655STATIC int 670STATIC int
@@ -1166,6 +1181,27 @@ xfs_quiesce_attr(
1166} 1181}
1167 1182
1168STATIC int 1183STATIC int
1184xfs_test_remount_options(
1185 struct super_block *sb,
1186 struct xfs_mount *mp,
1187 char *options)
1188{
1189 int error = 0;
1190 struct xfs_mount *tmp_mp;
1191
1192 tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL);
1193 if (!tmp_mp)
1194 return -ENOMEM;
1195
1196 tmp_mp->m_super = sb;
1197 error = xfs_parseargs(tmp_mp, options);
1198 xfs_free_fsname(tmp_mp);
1199 kfree(tmp_mp);
1200
1201 return error;
1202}
1203
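xfs_test_remount_options() is a dry-run: the new option string is parsed into a throwaway xfs_mount so invalid input is rejected before the live mount state is touched. The general pattern, sketched in userspace with a hypothetical parse_opts() standing in for xfs_parseargs():

#include <stdio.h>

struct mount_opts { int logbufs; };

static int parse_opts(struct mount_opts *mo, const char *options)
{
	/* toy parser: only "logbufs=<n>" is understood */
	if (sscanf(options, "logbufs=%d", &mo->logbufs) == 1)
		return 0;
	return -1;		/* -EINVAL in the kernel */
}

static int remount(struct mount_opts *live, const char *options)
{
	struct mount_opts tmp = { 0 };

	/* first, check for complete junk on a scratch copy */
	if (parse_opts(&tmp, options))
		return -1;

	return parse_opts(live, options);	/* now apply for real */
}

int main(void)
{
	struct mount_opts live = { .logbufs = 2 };

	printf("remount(bogus) = %d, logbufs=%d\n",
	       remount(&live, "bogus"), live.logbufs);
	printf("remount(good)  = %d, logbufs=%d\n",
	       remount(&live, "logbufs=8"), live.logbufs);
	return 0;
}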
1204STATIC int
1169xfs_fs_remount( 1205xfs_fs_remount(
1170 struct super_block *sb, 1206 struct super_block *sb,
1171 int *flags, 1207 int *flags,
@@ -1177,6 +1213,11 @@ xfs_fs_remount(
1177 char *p; 1213 char *p;
1178 int error; 1214 int error;
1179 1215
1216 /* First, check for complete junk; i.e. invalid options */
1217 error = xfs_test_remount_options(sb, mp, options);
1218 if (error)
1219 return error;
1220
1180 sync_filesystem(sb); 1221 sync_filesystem(sb);
1181 while ((p = strsep(&options, ",")) != NULL) { 1222 while ((p = strsep(&options, ",")) != NULL) {
1182 int token; 1223 int token;
@@ -1193,10 +1234,12 @@ xfs_fs_remount(
1193 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1234 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1194 break; 1235 break;
1195 case Opt_inode64: 1236 case Opt_inode64:
1196 mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount); 1237 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
1238 mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
1197 break; 1239 break;
1198 case Opt_inode32: 1240 case Opt_inode32:
1199 mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount); 1241 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
1242 mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
1200 break; 1243 break;
1201 default: 1244 default:
1202 /* 1245 /*
@@ -1344,9 +1387,8 @@ xfs_finish_flags(
1344 */ 1387 */
1345 if (xfs_sb_version_hascrc(&mp->m_sb) && 1388 if (xfs_sb_version_hascrc(&mp->m_sb) &&
1346 (mp->m_flags & XFS_MOUNT_NOATTR2)) { 1389 (mp->m_flags & XFS_MOUNT_NOATTR2)) {
1347 xfs_warn(mp, 1390 xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
1348"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", 1391 "attr2 is always enabled for V5 filesystems.");
1349 MNTOPT_NOATTR2, MNTOPT_ATTR2);
1350 return -EINVAL; 1392 return -EINVAL;
1351 } 1393 }
1352 1394
@@ -1817,6 +1859,8 @@ init_xfs_fs(void)
1817{ 1859{
1818 int error; 1860 int error;
1819 1861
1862 xfs_check_ondisk_structs();
1863
1820 printk(KERN_INFO XFS_VERSION_STRING " with " 1864 printk(KERN_INFO XFS_VERSION_STRING " with "
1821 XFS_BUILD_OPTIONS " enabled\n"); 1865 XFS_BUILD_OPTIONS " enabled\n");
1822 1866
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 499058fea303..2dfb1ce4585f 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -65,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
65 65
66extern void xfs_flush_inodes(struct xfs_mount *mp); 66extern void xfs_flush_inodes(struct xfs_mount *mp);
67extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 67extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
68extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount); 68extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
69extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount); 69 xfs_agnumber_t agcount);
70 70
71extern const struct export_operations xfs_export_operations; 71extern const struct export_operations xfs_export_operations;
72extern const struct xattr_handler *xfs_xattr_handlers[]; 72extern const struct xattr_handler *xfs_xattr_handlers[];
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 641d625eb334..6ced4f143494 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -18,10 +18,13 @@
18 18
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_sysfs.h" 20#include "xfs_sysfs.h"
21#include "xfs_format.h"
21#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
22#include "xfs_log.h" 24#include "xfs_log.h"
23#include "xfs_log_priv.h" 25#include "xfs_log_priv.h"
24#include "xfs_stats.h" 26#include "xfs_stats.h"
27#include "xfs_mount.h"
25 28
26struct xfs_sysfs_attr { 29struct xfs_sysfs_attr {
27 struct attribute attr; 30 struct attribute attr;
@@ -45,16 +48,6 @@ to_attr(struct attribute *attr)
45 48
46#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr 49#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
47 50
48/*
49 * xfs_mount kobject. This currently has no attributes and thus no need for show
50 * and store helpers. The mp kobject serves as the per-mount parent object that
51 * is identified by the fsname under sysfs.
52 */
53
54struct kobj_type xfs_mp_ktype = {
55 .release = xfs_sysfs_release,
56};
57
58STATIC ssize_t 51STATIC ssize_t
59xfs_sysfs_object_show( 52xfs_sysfs_object_show(
60 struct kobject *kobject, 53 struct kobject *kobject,
@@ -83,6 +76,71 @@ static const struct sysfs_ops xfs_sysfs_ops = {
83 .store = xfs_sysfs_object_store, 76 .store = xfs_sysfs_object_store,
84}; 77};
85 78
79/*
80 * xfs_mount kobject. The mp kobject also serves as the per-mount parent object
81 * that is identified by the fsname under sysfs.
82 */
83
84static inline struct xfs_mount *
85to_mp(struct kobject *kobject)
86{
87 struct xfs_kobj *kobj = to_kobj(kobject);
88
89 return container_of(kobj, struct xfs_mount, m_kobj);
90}
91
92#ifdef DEBUG
93
94STATIC ssize_t
95fail_writes_store(
96 struct kobject *kobject,
97 const char *buf,
98 size_t count)
99{
100 struct xfs_mount *mp = to_mp(kobject);
101 int ret;
102 int val;
103
104 ret = kstrtoint(buf, 0, &val);
105 if (ret)
106 return ret;
107
108 if (val == 1)
109 mp->m_fail_writes = true;
110 else if (val == 0)
111 mp->m_fail_writes = false;
112 else
113 return -EINVAL;
114
115 return count;
116}
117
118STATIC ssize_t
119fail_writes_show(
120 struct kobject *kobject,
121 char *buf)
122{
123 struct xfs_mount *mp = to_mp(kobject);
124
125 return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0);
126}
127XFS_SYSFS_ATTR_RW(fail_writes);
128
129#endif /* DEBUG */
130
131static struct attribute *xfs_mp_attrs[] = {
132#ifdef DEBUG
133 ATTR_LIST(fail_writes),
134#endif
135 NULL,
136};
137
138struct kobj_type xfs_mp_ktype = {
139 .release = xfs_sysfs_release,
140 .sysfs_ops = &xfs_sysfs_ops,
141 .default_attrs = xfs_mp_attrs,
142};
143
86#ifdef DEBUG 144#ifdef DEBUG
87/* debug */ 145/* debug */
88 146
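With the mp kobject gaining sysfs_ops and a default attribute list, the DEBUG-only fail_writes knob becomes reachable from userspace. A hedged usage sketch; the exact path is an assumption here (the mp kobject is published under the fsname below /sys/fs/xfs, so it depends on the device name, and the file only exists on DEBUG kernels):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* assumed path: per-mount kobject named after the device */
	const char *knob = "/sys/fs/xfs/sda1/fail_writes";
	int fd = open(knob, O_WRONLY);

	if (fd < 0) {
		perror(knob);	/* absent on non-DEBUG kernels */
		return 1;
	}
	if (write(fd, "1", 1) != 1)	/* start failing writes */
		perror("write");
	close(fd);
	return 0;
}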
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 391d797cb53f..c8d58426008e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1296,11 +1296,7 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1340,6 +1336,9 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
 DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
 DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
 
 DECLARE_EVENT_CLASS(xfs_itrunc_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 748b16aff45a..20c53666cb4b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1028,6 +1028,8 @@ __xfs_trans_roll(
 	struct xfs_trans_res	tres;
 	int			error;
 
+	*committed = 0;
+
 	/*
 	 * Ensure that the inode is always logged.
 	 */
@@ -1082,6 +1084,6 @@ xfs_trans_roll(
 	struct xfs_trans	**tpp,
 	struct xfs_inode	*dp)
 {
-	int			committed = 0;
+	int			committed;
 	return __xfs_trans_roll(tpp, dp, &committed);
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 4643070d7cae..e7c49cf43fbc 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -133,7 +133,6 @@ typedef struct xfs_trans {
  * XFS transaction mechanism exported interfaces that are
  * actually macros.
  */
-#define	xfs_trans_get_block_res(tp)	((tp)->t_blk_res)
 #define	xfs_trans_set_sync(tp)		((tp)->t_flags |= XFS_TRANS_SYNC)
 
 #if defined(DEBUG) || defined(XFS_WARN)
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 4f18fd92ca13..d6c9c3e9e02b 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -497,6 +497,7 @@ xfsaild(
 	long		tout = 0;	/* milliseconds */
 
 	current->flags |= PF_MEMALLOC;
+	set_freezable();
 
 	while (!kthread_should_stop()) {
 		if (tout && tout <= 20)
@@ -519,14 +520,14 @@ xfsaild(
 		if (!xfs_ail_min(ailp) &&
 		    ailp->xa_target == ailp->xa_target_prev) {
 			spin_unlock(&ailp->xa_lock);
-			schedule();
+			freezable_schedule();
 			tout = 0;
 			continue;
 		}
 		spin_unlock(&ailp->xa_lock);
 
 		if (tout)
-			schedule_timeout(msecs_to_jiffies(tout));
+			freezable_schedule_timeout(msecs_to_jiffies(tout));
 
 		__set_current_state(TASK_RUNNING);
 
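Note: set_freezable() and the freezable_schedule*() helpers are the stock freezer API from include/linux/freezer.h; kernel threads are unfreezable by default, so without this opt-in xfsaild could hold up a system suspend. A minimal sketch of the pattern for a generic kthread (everything except the freezer/kthread helpers is illustrative):

	#include <linux/freezer.h>
	#include <linux/kthread.h>

	static int demo_thread(void *data)
	{
		set_freezable();	/* opt in to the freezer */

		while (!kthread_should_stop()) {
			set_current_state(TASK_INTERRUPTIBLE);
			/* the sleep doubles as a freeze point during suspend */
			freezable_schedule_timeout(msecs_to_jiffies(50));
			__set_current_state(TASK_RUNNING);
			/* ... do one unit of work ... */
		}
		return 0;
	}
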
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 75798412859a..8ee29ca132dc 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -155,7 +155,7 @@ xfs_trans_get_buf_map(
 	ASSERT(xfs_buf_islocked(bp));
 	if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
 		xfs_buf_stale(bp);
-		XFS_BUF_DONE(bp);
+		bp->b_flags |= XBF_DONE;
 	}
 
 	ASSERT(bp->b_transp == tp);
@@ -518,7 +518,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 	 * inside the b_bdstrat callback so that this won't get written to
 	 * disk.
 	 */
-	XFS_BUF_DONE(bp);
+	bp->b_flags |= XBF_DONE;
 
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	bp->b_iodone = xfs_buf_iodone_callbacks;
@@ -534,8 +534,8 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 	 */
 	if (bip->bli_flags & XFS_BLI_STALE) {
 		bip->bli_flags &= ~XFS_BLI_STALE;
-		ASSERT(XFS_BUF_ISSTALE(bp));
-		XFS_BUF_UNSTALE(bp);
+		ASSERT(bp->b_flags & XBF_STALE);
+		bp->b_flags &= ~XBF_STALE;
 		bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
 	}
 
@@ -600,7 +600,7 @@ xfs_trans_binval(
 		 * If the buffer is already invalidated, then
 		 * just return.
 		 */
-		ASSERT(XFS_BUF_ISSTALE(bp));
+		ASSERT(bp->b_flags & XBF_STALE);
 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
 		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
 		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 995170194df0..c3d547211d16 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -609,17 +609,20 @@ xfs_trans_dqresv(
 	xfs_qcnt_t		total_count;
 	xfs_qcnt_t		*resbcountp;
 	xfs_quotainfo_t		*q = mp->m_quotainfo;
+	struct xfs_def_quota	*defq;
 
 
 	xfs_dqlock(dqp);
 
+	defq = xfs_get_defquota(dqp, q);
+
 	if (flags & XFS_TRANS_DQ_RES_BLKS) {
 		hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
 		if (!hardlimit)
-			hardlimit = q->qi_bhardlimit;
+			hardlimit = defq->bhardlimit;
 		softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
 		if (!softlimit)
-			softlimit = q->qi_bsoftlimit;
+			softlimit = defq->bsoftlimit;
 		timer = be32_to_cpu(dqp->q_core.d_btimer);
 		warns = be16_to_cpu(dqp->q_core.d_bwarns);
 		warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
@@ -628,10 +631,10 @@ xfs_trans_dqresv(
 		ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
 		hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
 		if (!hardlimit)
-			hardlimit = q->qi_rtbhardlimit;
+			hardlimit = defq->rtbhardlimit;
 		softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
 		if (!softlimit)
-			softlimit = q->qi_rtbsoftlimit;
+			softlimit = defq->rtbsoftlimit;
 		timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
 		warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
 		warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
@@ -672,10 +675,10 @@ xfs_trans_dqresv(
 		warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
 		hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
 		if (!hardlimit)
-			hardlimit = q->qi_ihardlimit;
+			hardlimit = defq->ihardlimit;
 		softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
 		if (!softlimit)
-			softlimit = q->qi_isoftlimit;
+			softlimit = defq->isoftlimit;
 
 		if (hardlimit && total_count > hardlimit) {
 			xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
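Note: xfs_get_defquota() is assumed to return the set of default limits matching the dquot's type (user, group, or project), now that each quota type carries its own defaults instead of the single per-mount qi_* copies being replaced above. Roughly, with illustrative field names (the real definition lives in the XFS quota headers):

	static inline struct xfs_def_quota *
	xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi)
	{
		if (XFS_QM_ISUDQ(dqp))		/* user quota */
			return &qi->qi_usr_default;
		if (XFS_QM_ISGDQ(dqp))		/* group quota */
			return &qi->qi_grp_default;
		return &qi->qi_prj_default;	/* otherwise: project quota */
	}
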
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index b97f1df910ab..11a3af08b5c7 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -75,18 +75,10 @@ xfs_trans_ichgtime(
 
 	tv = current_fs_time(inode->i_sb);
 
-	if ((flags & XFS_ICHGTIME_MOD) &&
-	    !timespec_equal(&inode->i_mtime, &tv)) {
-		inode->i_mtime = tv;
-		ip->i_d.di_mtime.t_sec = tv.tv_sec;
-		ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
-	}
-	if ((flags & XFS_ICHGTIME_CHG) &&
-	    !timespec_equal(&inode->i_ctime, &tv)) {
-		inode->i_ctime = tv;
-		ip->i_d.di_ctime.t_sec = tv.tv_sec;
-		ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
-	}
+	if (flags & XFS_ICHGTIME_MOD)
+		inode->i_mtime = tv;
+	if (flags & XFS_ICHGTIME_CHG)
+		inode->i_ctime = tv;
 }
 
 /*
@@ -125,7 +117,7 @@ xfs_trans_log_inode(
 	 */
 	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
 	    IS_I_VERSION(VFS_I(ip))) {
-		ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
+		VFS_I(ip)->i_version++;
 		flags |= XFS_ILOG_CORE;
 	}
 