aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/attr.c11
-rw-r--r--fs/autofs4/autofs_i.h8
-rw-r--r--fs/autofs4/dev-ioctl.c4
-rw-r--r--fs/autofs4/expire.c9
-rw-r--r--fs/autofs4/inode.c24
-rw-r--r--fs/autofs4/root.c83
-rw-r--r--fs/autofs4/waitq.c5
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/binfmt_aout.c5
-rw-r--r--fs/binfmt_elf.c9
-rw-r--r--fs/binfmt_elf_fdpic.c6
-rw-r--r--fs/binfmt_em86.c5
-rw-r--r--fs/binfmt_flat.c5
-rw-r--r--fs/binfmt_misc.c10
-rw-r--r--fs/binfmt_script.c8
-rw-r--r--fs/binfmt_som.c5
-rw-r--r--fs/block_dev.c170
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c2
-rw-r--r--fs/btrfs/backref.c16
-rw-r--r--fs/btrfs/btrfs_inode.h4
-rw-r--r--fs/btrfs/check-integrity.c31
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/ctree.c241
-rw-r--r--fs/btrfs/ctree.h184
-rw-r--r--fs/btrfs/delayed-inode.c11
-rw-r--r--fs/btrfs/dev-replace.c856
-rw-r--r--fs/btrfs/dev-replace.h44
-rw-r--r--fs/btrfs/dir-item.c59
-rw-r--r--fs/btrfs/disk-io.c146
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c229
-rw-r--r--fs/btrfs/extent_io.c37
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c27
-rw-r--r--fs/btrfs/extent_map.h2
-rw-r--r--fs/btrfs/file-item.c21
-rw-r--r--fs/btrfs/file.c425
-rw-r--r--fs/btrfs/free-space-cache.c51
-rw-r--r--fs/btrfs/inode-map.c5
-rw-r--r--fs/btrfs/inode.c484
-rw-r--r--fs/btrfs/ioctl.c319
-rw-r--r--fs/btrfs/ioctl.h48
-rw-r--r--fs/btrfs/math.h44
-rw-r--r--fs/btrfs/ordered-data.c90
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/reada.c31
-rw-r--r--fs/btrfs/relocation.c40
-rw-r--r--fs/btrfs/root-tree.c4
-rw-r--r--fs/btrfs/scrub.c1836
-rw-r--r--fs/btrfs/send.c8
-rw-r--r--fs/btrfs/super.c48
-rw-r--r--fs/btrfs/transaction.c170
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-log.c477
-rw-r--r--fs/btrfs/volumes.c966
-rw-r--r--fs/btrfs/volumes.h35
-rw-r--r--fs/btrfs/xattr.c13
-rw-r--r--fs/buffer.c163
-rw-r--r--fs/ceph/dir.c4
-rw-r--r--fs/ceph/export.c4
-rw-r--r--fs/ceph/file.c6
-rw-r--r--fs/cifs/Kconfig10
-rw-r--r--fs/cifs/README2
-rw-r--r--fs/cifs/cifs_debug.h72
-rw-r--r--fs/cifs/cifsacl.c777
-rw-r--r--fs/cifs/cifsacl.h66
-rw-r--r--fs/cifs/cifsfs.c25
-rw-r--r--fs/cifs/cifsglob.h36
-rw-r--r--fs/cifs/cifsproto.h10
-rw-r--r--fs/cifs/connect.c310
-rw-r--r--fs/cifs/dir.c43
-rw-r--r--fs/cifs/file.c206
-rw-r--r--fs/cifs/inode.c7
-rw-r--r--fs/cifs/netmisc.c14
-rw-r--r--fs/cifs/readdir.c55
-rw-r--r--fs/cifs/smb1ops.c35
-rw-r--r--fs/cifs/smb2file.c12
-rw-r--r--fs/cifs/smb2ops.c103
-rw-r--r--fs/cifs/smb2pdu.c5
-rw-r--r--fs/cifs/smb2proto.h4
-rw-r--r--fs/cifs/smb2transport.c13
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/configfs/dir.c4
-rw-r--r--fs/coredump.c4
-rw-r--r--fs/debugfs/inode.c1
-rw-r--r--fs/devpts/inode.c61
-rw-r--r--fs/direct-io.c8
-rw-r--r--fs/dlm/Kconfig2
-rw-r--r--fs/dlm/dlm_internal.h1
-rw-r--r--fs/dlm/lock.c16
-rw-r--r--fs/dlm/lowcomms.c5
-rw-r--r--fs/dlm/recover.c37
-rw-r--r--fs/eventfd.c20
-rw-r--r--fs/eventpoll.c66
-rw-r--r--fs/exec.c56
-rw-r--r--fs/exofs/inode.c16
-rw-r--r--fs/exportfs/expfs.c19
-rw-r--r--fs/ext3/balloc.c5
-rw-r--r--fs/ext3/dir.c6
-rw-r--r--fs/ext3/inode.c3
-rw-r--r--fs/ext3/super.c3
-rw-r--r--fs/ext4/Kconfig15
-rw-r--r--fs/ext4/Makefile4
-rw-r--r--fs/ext4/acl.c6
-rw-r--r--fs/ext4/dir.c47
-rw-r--r--fs/ext4/ext4.h167
-rw-r--r--fs/ext4/ext4_extents.h40
-rw-r--r--fs/ext4/ext4_jbd2.h7
-rw-r--r--fs/ext4/extents.c480
-rw-r--r--fs/ext4/extents_status.c500
-rw-r--r--fs/ext4/extents_status.h45
-rw-r--r--fs/ext4/file.c338
-rw-r--r--fs/ext4/fsync.c6
-rw-r--r--fs/ext4/ialloc.c6
-rw-r--r--fs/ext4/indirect.c5
-rw-r--r--fs/ext4/inline.c1884
-rw-r--r--fs/ext4/inode.c629
-rw-r--r--fs/ext4/mballoc.c60
-rw-r--r--fs/ext4/migrate.c1
-rw-r--r--fs/ext4/move_extent.c1
-rw-r--r--fs/ext4/namei.c531
-rw-r--r--fs/ext4/page-io.c3
-rw-r--r--fs/ext4/resize.c17
-rw-r--r--fs/ext4/super.c57
-rw-r--r--fs/ext4/symlink.c4
-rw-r--r--fs/ext4/xattr.c110
-rw-r--r--fs/ext4/xattr.h158
-rw-r--r--fs/fat/fat.h3
-rw-r--r--fs/fat/inode.c55
-rw-r--r--fs/fat/misc.c9
-rw-r--r--fs/fhandle.c4
-rw-r--r--fs/file.c21
-rw-r--r--fs/fs-writeback.c4
-rw-r--r--fs/fs_struct.c24
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/dir.c20
-rw-r--r--fs/fuse/file.c8
-rw-r--r--fs/fuse/fuse_i.h4
-rw-r--r--fs/fuse/inode.c23
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/bmap.c54
-rw-r--r--fs/gfs2/dir.c7
-rw-r--r--fs/gfs2/file.c28
-rw-r--r--fs/gfs2/glock.c42
-rw-r--r--fs/gfs2/glock.h54
-rw-r--r--fs/gfs2/glops.c19
-rw-r--r--fs/gfs2/incore.h6
-rw-r--r--fs/gfs2/inode.c209
-rw-r--r--fs/gfs2/lock_dlm.c20
-rw-r--r--fs/gfs2/lops.c16
-rw-r--r--fs/gfs2/ops_fstype.c3
-rw-r--r--fs/gfs2/quota.c17
-rw-r--r--fs/gfs2/rgrp.c172
-rw-r--r--fs/gfs2/rgrp.h3
-rw-r--r--fs/gfs2/super.c3
-rw-r--r--fs/gfs2/trace_gfs2.h2
-rw-r--r--fs/gfs2/trans.c8
-rw-r--r--fs/gfs2/xattr.c2
-rw-r--r--fs/hppfs/hppfs.c2
-rw-r--r--fs/hugetlbfs/inode.c111
-rw-r--r--fs/inode.c18
-rw-r--r--fs/internal.h1
-rw-r--r--fs/jbd/transaction.c4
-rw-r--r--fs/jbd2/journal.c1
-rw-r--r--fs/jbd2/transaction.c13
-rw-r--r--fs/libfs.c4
-rw-r--r--fs/lockd/clnt4xdr.c8
-rw-r--r--fs/lockd/clntproc.c3
-rw-r--r--fs/lockd/clntxdr.c8
-rw-r--r--fs/lockd/host.c15
-rw-r--r--fs/lockd/mon.c3
-rw-r--r--fs/logfs/inode.c2
-rw-r--r--fs/mount.h3
-rw-r--r--fs/namei.c5
-rw-r--r--fs/namespace.c212
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1
-rw-r--r--fs/nfs/cache_lib.c1
-rw-r--r--fs/nfs/callback.h4
-rw-r--r--fs/nfs/callback_proc.c17
-rw-r--r--fs/nfs/callback_xdr.c5
-rw-r--r--fs/nfs/client.c9
-rw-r--r--fs/nfs/dir.c35
-rw-r--r--fs/nfs/direct.c17
-rw-r--r--fs/nfs/file.c10
-rw-r--r--fs/nfs/idmap.c12
-rw-r--r--fs/nfs/inode.c10
-rw-r--r--fs/nfs/internal.h42
-rw-r--r--fs/nfs/mount_clnt.c7
-rw-r--r--fs/nfs/nfs2xdr.c4
-rw-r--r--fs/nfs/nfs3proc.c6
-rw-r--r--fs/nfs/nfs3xdr.c7
-rw-r--r--fs/nfs/nfs4_fs.h29
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4filelayout.c45
-rw-r--r--fs/nfs/nfs4filelayoutdev.c3
-rw-r--r--fs/nfs/nfs4proc.c820
-rw-r--r--fs/nfs/nfs4session.c552
-rw-r--r--fs/nfs/nfs4session.h142
-rw-r--r--fs/nfs/nfs4state.c143
-rw-r--r--fs/nfs/nfs4super.c1
-rw-r--r--fs/nfs/nfs4xdr.c52
-rw-r--r--fs/nfs/objlayout/objlayout.c11
-rw-r--r--fs/nfs/pnfs.c17
-rw-r--r--fs/nfs/proc.c43
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/write.c31
-rw-r--r--fs/nilfs2/page.c2
-rw-r--r--fs/notify/Makefile2
-rw-r--r--fs/notify/fanotify/Kconfig2
-rw-r--r--fs/notify/fanotify/fanotify.c1
-rw-r--r--fs/notify/fanotify/fanotify_user.c5
-rw-r--r--fs/notify/fdinfo.c179
-rw-r--r--fs/notify/fdinfo.h27
-rw-r--r--fs/notify/inode_mark.c5
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/notify/notification.c2
-rw-r--r--fs/ocfs2/extent_map.c12
-rw-r--r--fs/ocfs2/file.c11
-rw-r--r--fs/open.c2
-rw-r--r--fs/pnode.h1
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/array.c27
-rw-r--r--fs/proc/base.c293
-rw-r--r--fs/proc/fd.c2
-rw-r--r--fs/proc/generic.c26
-rw-r--r--fs/proc/inode.c6
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/namespaces.c185
-rw-r--r--fs/proc/proc_devtree.c6
-rw-r--r--fs/proc/proc_sysctl.c9
-rw-r--r--fs/proc/root.c17
-rw-r--r--fs/proc/self.c59
-rw-r--r--fs/proc/task_mmu.c59
-rw-r--r--fs/pstore/ftrace.c4
-rw-r--r--fs/pstore/inode.c13
-rw-r--r--fs/pstore/internal.h2
-rw-r--r--fs/pstore/platform.c14
-rw-r--r--fs/pstore/ram.c51
-rw-r--r--fs/quota/quota.c4
-rw-r--r--fs/read_write.c40
-rw-r--r--fs/reiserfs/inode.c10
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--fs/reiserfs/super.c60
-rw-r--r--fs/seq_file.c4
-rw-r--r--fs/signalfd.c18
-rw-r--r--fs/splice.c5
-rw-r--r--fs/sysfs/file.c4
-rw-r--r--fs/sysfs/mount.c1
-rw-r--r--fs/ubifs/debug.c8
-rw-r--r--fs/ubifs/dir.c4
-rw-r--r--fs/ubifs/find.c12
-rw-r--r--fs/ubifs/lprops.c6
-rw-r--r--fs/ubifs/ubifs.h3
-rw-r--r--fs/udf/inode.c14
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/Makefile4
-rw-r--r--fs/xfs/uuid.h6
-rw-r--r--fs/xfs/xfs_ag.h5
-rw-r--r--fs/xfs/xfs_alloc.c183
-rw-r--r--fs/xfs/xfs_alloc.h6
-rw-r--r--fs/xfs/xfs_alloc_btree.c79
-rw-r--r--fs/xfs/xfs_alloc_btree.h2
-rw-r--r--fs/xfs/xfs_aops.c137
-rw-r--r--fs/xfs/xfs_attr.c103
-rw-r--r--fs/xfs/xfs_attr_leaf.c163
-rw-r--r--fs/xfs/xfs_attr_leaf.h6
-rw-r--r--fs/xfs/xfs_bmap.c127
-rw-r--r--fs/xfs/xfs_bmap.h9
-rw-r--r--fs/xfs/xfs_bmap_btree.c63
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c111
-rw-r--r--fs/xfs/xfs_btree.h22
-rw-r--r--fs/xfs/xfs_buf.c73
-rw-r--r--fs/xfs/xfs_buf.h27
-rw-r--r--fs/xfs/xfs_buf_item.c18
-rw-r--r--fs/xfs/xfs_cksum.h63
-rw-r--r--fs/xfs/xfs_da_btree.c141
-rw-r--r--fs/xfs/xfs_da_btree.h10
-rw-r--r--fs/xfs/xfs_dfrag.c13
-rw-r--r--fs/xfs/xfs_dir2_block.c436
-rw-r--r--fs/xfs/xfs_dir2_data.c170
-rw-r--r--fs/xfs/xfs_dir2_leaf.c172
-rw-r--r--fs/xfs/xfs_dir2_node.c288
-rw-r--r--fs/xfs/xfs_dir2_priv.h19
-rw-r--r--fs/xfs/xfs_dquot.c134
-rw-r--r--fs/xfs/xfs_dquot.h2
-rw-r--r--fs/xfs/xfs_export.c1
-rw-r--r--fs/xfs/xfs_file.c42
-rw-r--r--fs/xfs/xfs_fs.h33
-rw-r--r--fs/xfs/xfs_fs_subr.c96
-rw-r--r--fs/xfs/xfs_fsops.c158
-rw-r--r--fs/xfs/xfs_globals.c4
-rw-r--r--fs/xfs/xfs_ialloc.c84
-rw-r--r--fs/xfs/xfs_ialloc.h4
-rw-r--r--fs/xfs/xfs_ialloc_btree.c55
-rw-r--r--fs/xfs/xfs_ialloc_btree.h2
-rw-r--r--fs/xfs/xfs_icache.c (renamed from fs/xfs/xfs_sync.c)914
-rw-r--r--fs/xfs/xfs_icache.h (renamed from fs/xfs/xfs_sync.h)28
-rw-r--r--fs/xfs/xfs_iget.c705
-rw-r--r--fs/xfs/xfs_inode.c440
-rw-r--r--fs/xfs/xfs_inode.h12
-rw-r--r--fs/xfs/xfs_ioctl.c23
-rw-r--r--fs/xfs/xfs_iomap.c35
-rw-r--r--fs/xfs/xfs_iops.c8
-rw-r--r--fs/xfs/xfs_itable.c4
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c260
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_priv.h12
-rw-r--r--fs/xfs/xfs_log_recover.c148
-rw-r--r--fs/xfs/xfs_mount.c163
-rw-r--r--fs/xfs/xfs_mount.h13
-rw-r--r--fs/xfs/xfs_qm.c22
-rw-r--r--fs/xfs/xfs_qm_syscalls.c6
-rw-r--r--fs/xfs/xfs_rtalloc.c16
-rw-r--r--fs/xfs/xfs_sb.h7
-rw-r--r--fs/xfs/xfs_super.c148
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_sysctl.c9
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_trace.h60
-rw-r--r--fs/xfs/xfs_trans.h19
-rw-r--r--fs/xfs/xfs_trans_buf.c9
-rw-r--r--fs/xfs/xfs_vnodeops.c168
-rw-r--r--fs/xfs/xfs_vnodeops.h9
332 files changed, 17315 insertions, 8424 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index f95ae3a027f3..eaff24a19502 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -28,8 +28,8 @@ config FS_MBCACHE
28 tristate 28 tristate
29 default y if EXT2_FS=y && EXT2_FS_XATTR 29 default y if EXT2_FS=y && EXT2_FS_XATTR
30 default y if EXT3_FS=y && EXT3_FS_XATTR 30 default y if EXT3_FS=y && EXT3_FS_XATTR
31 default y if EXT4_FS=y && EXT4_FS_XATTR 31 default y if EXT4_FS=y
32 default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR 32 default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
33 33
34source "fs/reiserfs/Kconfig" 34source "fs/reiserfs/Kconfig"
35source "fs/jfs/Kconfig" 35source "fs/jfs/Kconfig"
diff --git a/fs/attr.c b/fs/attr.c
index cce7df53b694..1449adb14ef6 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -49,14 +49,15 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
49 /* Make sure a caller can chown. */ 49 /* Make sure a caller can chown. */
50 if ((ia_valid & ATTR_UID) && 50 if ((ia_valid & ATTR_UID) &&
51 (!uid_eq(current_fsuid(), inode->i_uid) || 51 (!uid_eq(current_fsuid(), inode->i_uid) ||
52 !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) 52 !uid_eq(attr->ia_uid, inode->i_uid)) &&
53 !inode_capable(inode, CAP_CHOWN))
53 return -EPERM; 54 return -EPERM;
54 55
55 /* Make sure caller can chgrp. */ 56 /* Make sure caller can chgrp. */
56 if ((ia_valid & ATTR_GID) && 57 if ((ia_valid & ATTR_GID) &&
57 (!uid_eq(current_fsuid(), inode->i_uid) || 58 (!uid_eq(current_fsuid(), inode->i_uid) ||
58 (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && 59 (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
59 !capable(CAP_CHOWN)) 60 !inode_capable(inode, CAP_CHOWN))
60 return -EPERM; 61 return -EPERM;
61 62
62 /* Make sure a caller can chmod. */ 63 /* Make sure a caller can chmod. */
@@ -65,7 +66,8 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
65 return -EPERM; 66 return -EPERM;
66 /* Also check the setgid bit! */ 67 /* Also check the setgid bit! */
67 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 68 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
68 inode->i_gid) && !capable(CAP_FSETID)) 69 inode->i_gid) &&
70 !inode_capable(inode, CAP_FSETID))
69 attr->ia_mode &= ~S_ISGID; 71 attr->ia_mode &= ~S_ISGID;
70 } 72 }
71 73
@@ -157,7 +159,8 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
157 if (ia_valid & ATTR_MODE) { 159 if (ia_valid & ATTR_MODE) {
158 umode_t mode = attr->ia_mode; 160 umode_t mode = attr->ia_mode;
159 161
160 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) 162 if (!in_group_p(inode->i_gid) &&
163 !inode_capable(inode, CAP_FSETID))
161 mode &= ~S_ISGID; 164 mode &= ~S_ISGID;
162 inode->i_mode = mode; 165 inode->i_mode = mode;
163 } 166 }
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 908e18455413..b785e7707959 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -74,8 +74,8 @@ struct autofs_info {
74 unsigned long last_used; 74 unsigned long last_used;
75 atomic_t count; 75 atomic_t count;
76 76
77 uid_t uid; 77 kuid_t uid;
78 gid_t gid; 78 kgid_t gid;
79}; 79};
80 80
81#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 81#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
@@ -89,8 +89,8 @@ struct autofs_wait_queue {
89 struct qstr name; 89 struct qstr name;
90 u32 dev; 90 u32 dev;
91 u64 ino; 91 u64 ino;
92 uid_t uid; 92 kuid_t uid;
93 gid_t gid; 93 kgid_t gid;
94 pid_t pid; 94 pid_t pid;
95 pid_t tgid; 95 pid_t tgid;
96 /* This is for status reporting upon return */ 96 /* This is for status reporting upon return */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index a16214109d31..9f68a37bb2b2 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -437,8 +437,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
437 err = 0; 437 err = 0;
438 autofs4_expire_wait(path.dentry); 438 autofs4_expire_wait(path.dentry);
439 spin_lock(&sbi->fs_lock); 439 spin_lock(&sbi->fs_lock);
440 param->requester.uid = ino->uid; 440 param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
441 param->requester.gid = ino->gid; 441 param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
442 spin_unlock(&sbi->fs_lock); 442 spin_unlock(&sbi->fs_lock);
443 } 443 }
444 path_put(&path); 444 path_put(&path);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 842d00048a65..01443ce43ee7 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -548,15 +548,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
548 548
549 spin_lock(&sbi->fs_lock); 549 spin_lock(&sbi->fs_lock);
550 ino->flags &= ~AUTOFS_INF_EXPIRING; 550 ino->flags &= ~AUTOFS_INF_EXPIRING;
551 spin_lock(&dentry->d_lock);
552 if (!ret) {
553 if ((IS_ROOT(dentry) ||
554 (autofs_type_indirect(sbi->type) &&
555 IS_ROOT(dentry->d_parent))) &&
556 !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
557 __managed_dentry_set_automount(dentry);
558 }
559 spin_unlock(&dentry->d_lock);
560 complete_all(&ino->expire_complete); 551 complete_all(&ino->expire_complete);
561 spin_unlock(&sbi->fs_lock); 552 spin_unlock(&sbi->fs_lock);
562 dput(dentry); 553 dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 8a4fed8ead30..b104726e2d0a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
36 36
37void autofs4_clean_ino(struct autofs_info *ino) 37void autofs4_clean_ino(struct autofs_info *ino)
38{ 38{
39 ino->uid = 0; 39 ino->uid = GLOBAL_ROOT_UID;
40 ino->gid = 0; 40 ino->gid = GLOBAL_ROOT_GID;
41 ino->last_used = jiffies; 41 ino->last_used = jiffies;
42} 42}
43 43
@@ -79,10 +79,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
79 return 0; 79 return 0;
80 80
81 seq_printf(m, ",fd=%d", sbi->pipefd); 81 seq_printf(m, ",fd=%d", sbi->pipefd);
82 if (root_inode->i_uid != 0) 82 if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
83 seq_printf(m, ",uid=%u", root_inode->i_uid); 83 seq_printf(m, ",uid=%u",
84 if (root_inode->i_gid != 0) 84 from_kuid_munged(&init_user_ns, root_inode->i_uid));
85 seq_printf(m, ",gid=%u", root_inode->i_gid); 85 if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
86 seq_printf(m, ",gid=%u",
87 from_kgid_munged(&init_user_ns, root_inode->i_gid));
86 seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); 88 seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
87 seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); 89 seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
88 seq_printf(m, ",minproto=%d", sbi->min_proto); 90 seq_printf(m, ",minproto=%d", sbi->min_proto);
@@ -126,7 +128,7 @@ static const match_table_t tokens = {
126 {Opt_err, NULL} 128 {Opt_err, NULL}
127}; 129};
128 130
129static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, 131static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
130 pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) 132 pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
131{ 133{
132 char *p; 134 char *p;
@@ -159,12 +161,16 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
159 case Opt_uid: 161 case Opt_uid:
160 if (match_int(args, &option)) 162 if (match_int(args, &option))
161 return 1; 163 return 1;
162 *uid = option; 164 *uid = make_kuid(current_user_ns(), option);
165 if (!uid_valid(*uid))
166 return 1;
163 break; 167 break;
164 case Opt_gid: 168 case Opt_gid:
165 if (match_int(args, &option)) 169 if (match_int(args, &option))
166 return 1; 170 return 1;
167 *gid = option; 171 *gid = make_kgid(current_user_ns(), option);
172 if (!gid_valid(*gid))
173 return 1;
168 break; 174 break;
169 case Opt_pgrp: 175 case Opt_pgrp:
170 if (match_int(args, &option)) 176 if (match_int(args, &option))
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 91b11650722e..c93447604da8 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -124,13 +124,10 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
124 * it. 124 * it.
125 */ 125 */
126 spin_lock(&sbi->lookup_lock); 126 spin_lock(&sbi->lookup_lock);
127 spin_lock(&dentry->d_lock); 127 if (!d_mountpoint(dentry) && simple_empty(dentry)) {
128 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
129 spin_unlock(&dentry->d_lock);
130 spin_unlock(&sbi->lookup_lock); 128 spin_unlock(&sbi->lookup_lock);
131 return -ENOENT; 129 return -ENOENT;
132 } 130 }
133 spin_unlock(&dentry->d_lock);
134 spin_unlock(&sbi->lookup_lock); 131 spin_unlock(&sbi->lookup_lock);
135 132
136out: 133out:
@@ -355,7 +352,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
355 status = autofs4_mount_wait(dentry); 352 status = autofs4_mount_wait(dentry);
356 if (status) 353 if (status)
357 return ERR_PTR(status); 354 return ERR_PTR(status);
358 spin_lock(&sbi->fs_lock);
359 goto done; 355 goto done;
360 } 356 }
361 357
@@ -364,8 +360,11 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
364 * having d_mountpoint() true, so there's no need to call back 360 * having d_mountpoint() true, so there's no need to call back
365 * to the daemon. 361 * to the daemon.
366 */ 362 */
367 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) 363 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
364 spin_unlock(&sbi->fs_lock);
368 goto done; 365 goto done;
366 }
367
369 if (!d_mountpoint(dentry)) { 368 if (!d_mountpoint(dentry)) {
370 /* 369 /*
371 * It's possible that user space hasn't removed directories 370 * It's possible that user space hasn't removed directories
@@ -379,15 +378,13 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
379 * require user space behave. 378 * require user space behave.
380 */ 379 */
381 if (sbi->version > 4) { 380 if (sbi->version > 4) {
382 if (have_submounts(dentry)) 381 if (have_submounts(dentry)) {
382 spin_unlock(&sbi->fs_lock);
383 goto done; 383 goto done;
384 }
384 } else { 385 } else {
385 spin_lock(&dentry->d_lock); 386 if (!simple_empty(dentry))
386 if (!list_empty(&dentry->d_subdirs)) {
387 spin_unlock(&dentry->d_lock);
388 goto done; 387 goto done;
389 }
390 spin_unlock(&dentry->d_lock);
391 } 388 }
392 ino->flags |= AUTOFS_INF_PENDING; 389 ino->flags |= AUTOFS_INF_PENDING;
393 spin_unlock(&sbi->fs_lock); 390 spin_unlock(&sbi->fs_lock);
@@ -399,28 +396,8 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
399 return ERR_PTR(status); 396 return ERR_PTR(status);
400 } 397 }
401 } 398 }
402done:
403 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
404 /*
405 * Any needed mounting has been completed and the path
406 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
407 * call ->d_automount() on rootless multi-mounts since
408 * it can lead to an incorrect ELOOP error return.
409 *
410 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
411 * symlinks as in all other cases the dentry will be covered by
412 * an actual mount so ->d_automount() won't be called during
413 * the follow.
414 */
415 spin_lock(&dentry->d_lock);
416 if ((!d_mountpoint(dentry) &&
417 !list_empty(&dentry->d_subdirs)) ||
418 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
419 __managed_dentry_clear_automount(dentry);
420 spin_unlock(&dentry->d_lock);
421 }
422 spin_unlock(&sbi->fs_lock); 399 spin_unlock(&sbi->fs_lock);
423 400done:
424 /* Mount succeeded, check if we ended up with a new dentry */ 401 /* Mount succeeded, check if we ended up with a new dentry */
425 dentry = autofs4_mountpoint_changed(path); 402 dentry = autofs4_mountpoint_changed(path);
426 if (!dentry) 403 if (!dentry)
@@ -432,6 +409,8 @@ done:
432int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) 409int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
433{ 410{
434 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 411 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
412 struct autofs_info *ino = autofs4_dentry_ino(dentry);
413 int status;
435 414
436 DPRINTK("dentry=%p %.*s", 415 DPRINTK("dentry=%p %.*s",
437 dentry, dentry->d_name.len, dentry->d_name.name); 416 dentry, dentry->d_name.len, dentry->d_name.name);
@@ -456,7 +435,32 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
456 * This dentry may be under construction so wait on mount 435 * This dentry may be under construction so wait on mount
457 * completion. 436 * completion.
458 */ 437 */
459 return autofs4_mount_wait(dentry); 438 status = autofs4_mount_wait(dentry);
439 if (status)
440 return status;
441
442 spin_lock(&sbi->fs_lock);
443 /*
444 * If the dentry has been selected for expire while we slept
445 * on the lock then it might go away. We'll deal with that in
446 * ->d_automount() and wait on a new mount if the expire
447 * succeeds or return here if it doesn't (since there's no
448 * mount to follow with a rootless multi-mount).
449 */
450 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
451 /*
452 * Any needed mounting has been completed and the path
453 * updated so check if this is a rootless multi-mount so
454 * we can avoid needless calls ->d_automount() and avoid
455 * an incorrect ELOOP error return.
456 */
457 if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
458 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
459 status = -EISDIR;
460 }
461 spin_unlock(&sbi->fs_lock);
462
463 return status;
460} 464}
461 465
462/* Lookups in the root directory */ 466/* Lookups in the root directory */
@@ -599,9 +603,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
599 603
600 spin_lock(&sbi->lookup_lock); 604 spin_lock(&sbi->lookup_lock);
601 __autofs4_add_expiring(dentry); 605 __autofs4_add_expiring(dentry);
602 spin_lock(&dentry->d_lock); 606 d_drop(dentry);
603 __d_drop(dentry);
604 spin_unlock(&dentry->d_lock);
605 spin_unlock(&sbi->lookup_lock); 607 spin_unlock(&sbi->lookup_lock);
606 608
607 return 0; 609 return 0;
@@ -672,15 +674,12 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
672 return -EACCES; 674 return -EACCES;
673 675
674 spin_lock(&sbi->lookup_lock); 676 spin_lock(&sbi->lookup_lock);
675 spin_lock(&dentry->d_lock); 677 if (!simple_empty(dentry)) {
676 if (!list_empty(&dentry->d_subdirs)) {
677 spin_unlock(&dentry->d_lock);
678 spin_unlock(&sbi->lookup_lock); 678 spin_unlock(&sbi->lookup_lock);
679 return -ENOTEMPTY; 679 return -ENOTEMPTY;
680 } 680 }
681 __autofs4_add_expiring(dentry); 681 __autofs4_add_expiring(dentry);
682 __d_drop(dentry); 682 d_drop(dentry);
683 spin_unlock(&dentry->d_lock);
684 spin_unlock(&sbi->lookup_lock); 683 spin_unlock(&sbi->lookup_lock);
685 684
686 if (sbi->version < 5) 685 if (sbi->version < 5)
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index dce436e595c1..03bc1d347d8e 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -154,6 +154,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
154 case autofs_ptype_expire_direct: 154 case autofs_ptype_expire_direct:
155 { 155 {
156 struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; 156 struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
157 struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
157 158
158 pktsz = sizeof(*packet); 159 pktsz = sizeof(*packet);
159 160
@@ -163,8 +164,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
163 packet->name[wq->name.len] = '\0'; 164 packet->name[wq->name.len] = '\0';
164 packet->dev = wq->dev; 165 packet->dev = wq->dev;
165 packet->ino = wq->ino; 166 packet->ino = wq->ino;
166 packet->uid = wq->uid; 167 packet->uid = from_kuid_munged(user_ns, wq->uid);
167 packet->gid = wq->gid; 168 packet->gid = from_kgid_munged(user_ns, wq->gid);
168 packet->pid = wq->pid; 169 packet->pid = wq->pid;
169 packet->tgid = wq->tgid; 170 packet->tgid = wq->tgid;
170 break; 171 break;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index b1342ffb3cf6..922ad460bff9 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -16,7 +16,7 @@
16#include <linux/poll.h> 16#include <linux/poll.h>
17 17
18 18
19static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin) 19static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence)
20{ 20{
21 return -EIO; 21 return -EIO;
22} 22}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 0e7a6f81ae36..6043567b95c2 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -30,7 +30,7 @@
30#include <asm/cacheflush.h> 30#include <asm/cacheflush.h>
31#include <asm/a.out-core.h> 31#include <asm/a.out-core.h>
32 32
33static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); 33static int load_aout_binary(struct linux_binprm *);
34static int load_aout_library(struct file*); 34static int load_aout_library(struct file*);
35 35
36#ifdef CONFIG_COREDUMP 36#ifdef CONFIG_COREDUMP
@@ -201,8 +201,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
201 * libraries. There is no binary dependent code anywhere else. 201 * libraries. There is no binary dependent code anywhere else.
202 */ 202 */
203 203
204static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) 204static int load_aout_binary(struct linux_binprm * bprm)
205{ 205{
206 struct pt_regs *regs = current_pt_regs();
206 struct exec ex; 207 struct exec ex;
207 unsigned long error; 208 unsigned long error;
208 unsigned long fd_offset; 209 unsigned long fd_offset;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fbd9f60bd763..0c42cdbabecf 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -44,7 +44,7 @@
44#define user_siginfo_t siginfo_t 44#define user_siginfo_t siginfo_t
45#endif 45#endif
46 46
47static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); 47static int load_elf_binary(struct linux_binprm *bprm);
48static int load_elf_library(struct file *); 48static int load_elf_library(struct file *);
49static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, 49static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
50 int, int, unsigned long); 50 int, int, unsigned long);
@@ -558,7 +558,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
558#endif 558#endif
559} 559}
560 560
561static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) 561static int load_elf_binary(struct linux_binprm *bprm)
562{ 562{
563 struct file *interpreter = NULL; /* to shut gcc up */ 563 struct file *interpreter = NULL; /* to shut gcc up */
564 unsigned long load_addr = 0, load_bias = 0; 564 unsigned long load_addr = 0, load_bias = 0;
@@ -575,6 +575,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
575 unsigned long reloc_func_desc __maybe_unused = 0; 575 unsigned long reloc_func_desc __maybe_unused = 0;
576 int executable_stack = EXSTACK_DEFAULT; 576 int executable_stack = EXSTACK_DEFAULT;
577 unsigned long def_flags = 0; 577 unsigned long def_flags = 0;
578 struct pt_regs *regs = current_pt_regs();
578 struct { 579 struct {
579 struct elfhdr elf_ex; 580 struct elfhdr elf_ex;
580 struct elfhdr interp_elf_ex; 581 struct elfhdr interp_elf_ex;
@@ -1600,8 +1601,10 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1600 info->thread = NULL; 1601 info->thread = NULL;
1601 1602
1602 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); 1603 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
1603 if (psinfo == NULL) 1604 if (psinfo == NULL) {
1605 info->psinfo.data = NULL; /* So we don't free this wrongly */
1604 return 0; 1606 return 0;
1607 }
1605 1608
1606 fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); 1609 fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1607 1610
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a46049154107..dc84732e554f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -56,7 +56,7 @@ typedef char *elf_caddr_t;
56 56
57MODULE_LICENSE("GPL"); 57MODULE_LICENSE("GPL");
58 58
59static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); 59static int load_elf_fdpic_binary(struct linux_binprm *);
60static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); 60static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *);
61static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, 61static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *,
62 struct mm_struct *, const char *); 62 struct mm_struct *, const char *);
@@ -164,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
164/* 164/*
165 * load an fdpic binary into various bits of memory 165 * load an fdpic binary into various bits of memory
166 */ 166 */
167static int load_elf_fdpic_binary(struct linux_binprm *bprm, 167static int load_elf_fdpic_binary(struct linux_binprm *bprm)
168 struct pt_regs *regs)
169{ 168{
170 struct elf_fdpic_params exec_params, interp_params; 169 struct elf_fdpic_params exec_params, interp_params;
170 struct pt_regs *regs = current_pt_regs();
171 struct elf_phdr *phdr; 171 struct elf_phdr *phdr;
172 unsigned long stack_size, entryaddr; 172 unsigned long stack_size, entryaddr;
173#ifdef ELF_FDPIC_PLAT_INIT 173#ifdef ELF_FDPIC_PLAT_INIT
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 2790c7e1912e..037a3e2b045b 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -22,7 +22,7 @@
22#define EM86_INTERP "/usr/bin/em86" 22#define EM86_INTERP "/usr/bin/em86"
23#define EM86_I_NAME "em86" 23#define EM86_I_NAME "em86"
24 24
25static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) 25static int load_em86(struct linux_binprm *bprm)
26{ 26{
27 char *interp, *i_name, *i_arg; 27 char *interp, *i_name, *i_arg;
28 struct file * file; 28 struct file * file;
@@ -42,7 +42,6 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
42 return -ENOEXEC; 42 return -ENOEXEC;
43 } 43 }
44 44
45 bprm->recursion_depth++; /* Well, the bang-shell is implicit... */
46 allow_write_access(bprm->file); 45 allow_write_access(bprm->file);
47 fput(bprm->file); 46 fput(bprm->file);
48 bprm->file = NULL; 47 bprm->file = NULL;
@@ -90,7 +89,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
90 if (retval < 0) 89 if (retval < 0)
91 return retval; 90 return retval;
92 91
93 return search_binary_handler(bprm, regs); 92 return search_binary_handler(bprm);
94} 93}
95 94
96static struct linux_binfmt em86_format = { 95static struct linux_binfmt em86_format = {
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e280352b28f9..b56371981d16 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -88,7 +88,7 @@ struct lib_info {
88static int load_flat_shared_library(int id, struct lib_info *p); 88static int load_flat_shared_library(int id, struct lib_info *p);
89#endif 89#endif
90 90
91static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); 91static int load_flat_binary(struct linux_binprm *);
92static int flat_core_dump(struct coredump_params *cprm); 92static int flat_core_dump(struct coredump_params *cprm);
93 93
94static struct linux_binfmt flat_format = { 94static struct linux_binfmt flat_format = {
@@ -858,9 +858,10 @@ out:
858 * libraries. There is no binary dependent code anywhere else. 858 * libraries. There is no binary dependent code anywhere else.
859 */ 859 */
860 860
861static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) 861static int load_flat_binary(struct linux_binprm * bprm)
862{ 862{
863 struct lib_info libinfo; 863 struct lib_info libinfo;
864 struct pt_regs *regs = current_pt_regs();
864 unsigned long p = bprm->p; 865 unsigned long p = bprm->p;
865 unsigned long stack_len; 866 unsigned long stack_len;
866 unsigned long start_addr; 867 unsigned long start_addr;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 790b3cddca67..9be335fb8a7c 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -104,7 +104,7 @@ static Node *check_file(struct linux_binprm *bprm)
104/* 104/*
105 * the loader itself 105 * the loader itself
106 */ 106 */
107static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) 107static int load_misc_binary(struct linux_binprm *bprm)
108{ 108{
109 Node *fmt; 109 Node *fmt;
110 struct file * interp_file = NULL; 110 struct file * interp_file = NULL;
@@ -117,10 +117,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
117 if (!enabled) 117 if (!enabled)
118 goto _ret; 118 goto _ret;
119 119
120 retval = -ENOEXEC;
121 if (bprm->recursion_depth > BINPRM_MAX_RECURSION)
122 goto _ret;
123
124 /* to keep locking time low, we copy the interpreter string */ 120 /* to keep locking time low, we copy the interpreter string */
125 read_lock(&entries_lock); 121 read_lock(&entries_lock);
126 fmt = check_file(bprm); 122 fmt = check_file(bprm);
@@ -197,9 +193,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
197 if (retval < 0) 193 if (retval < 0)
198 goto _error; 194 goto _error;
199 195
200 bprm->recursion_depth++; 196 retval = search_binary_handler(bprm);
201
202 retval = search_binary_handler (bprm, regs);
203 if (retval < 0) 197 if (retval < 0)
204 goto _error; 198 goto _error;
205 199
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d3b8c1f63155..1610a91637e5 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,7 +14,7 @@
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16 16
17static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) 17static int load_script(struct linux_binprm *bprm)
18{ 18{
19 const char *i_arg, *i_name; 19 const char *i_arg, *i_name;
20 char *cp; 20 char *cp;
@@ -22,15 +22,13 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
22 char interp[BINPRM_BUF_SIZE]; 22 char interp[BINPRM_BUF_SIZE];
23 int retval; 23 int retval;
24 24
25 if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || 25 if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
26 (bprm->recursion_depth > BINPRM_MAX_RECURSION))
27 return -ENOEXEC; 26 return -ENOEXEC;
28 /* 27 /*
29 * This section does the #! interpretation. 28 * This section does the #! interpretation.
30 * Sorta complicated, but hopefully it will work. -TYT 29 * Sorta complicated, but hopefully it will work. -TYT
31 */ 30 */
32 31
33 bprm->recursion_depth++;
34 allow_write_access(bprm->file); 32 allow_write_access(bprm->file);
35 fput(bprm->file); 33 fput(bprm->file);
36 bprm->file = NULL; 34 bprm->file = NULL;
@@ -95,7 +93,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
95 retval = prepare_binprm(bprm); 93 retval = prepare_binprm(bprm);
96 if (retval < 0) 94 if (retval < 0)
97 return retval; 95 return retval;
98 return search_binary_handler(bprm,regs); 96 return search_binary_handler(bprm);
99} 97}
100 98
101static struct linux_binfmt script_format = { 99static struct linux_binfmt script_format = {
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 4517aaff61b4..4e00ed68d4a6 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -35,7 +35,7 @@
35 35
36#include <linux/elf.h> 36#include <linux/elf.h>
37 37
38static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs); 38static int load_som_binary(struct linux_binprm * bprm);
39static int load_som_library(struct file *); 39static int load_som_library(struct file *);
40 40
41/* 41/*
@@ -180,13 +180,14 @@ out:
180 */ 180 */
181 181
182static int 182static int
183load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) 183load_som_binary(struct linux_binprm * bprm)
184{ 184{
185 int retval; 185 int retval;
186 unsigned int size; 186 unsigned int size;
187 unsigned long som_entry; 187 unsigned long som_entry;
188 struct som_hdr *som_ex; 188 struct som_hdr *som_ex;
189 struct som_exec_auxhdr *hpuxhdr; 189 struct som_exec_auxhdr *hpuxhdr;
190 struct pt_regs *regs = current_pt_regs();
190 191
191 /* Get the exec-header */ 192 /* Get the exec-header */
192 som_ex = (struct som_hdr *) bprm->buf; 193 som_ex = (struct som_hdr *) bprm->buf;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1a1e5e3b1eaf..172f8491a2bd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode,
70 spin_unlock(&dst->wb.list_lock); 70 spin_unlock(&dst->wb.list_lock);
71} 71}
72 72
73sector_t blkdev_max_block(struct block_device *bdev)
74{
75 sector_t retval = ~((sector_t)0);
76 loff_t sz = i_size_read(bdev->bd_inode);
77
78 if (sz) {
79 unsigned int size = block_size(bdev);
80 unsigned int sizebits = blksize_bits(size);
81 retval = (sz >> sizebits);
82 }
83 return retval;
84}
85
86/* Kill _all_ buffers and pagecache , dirty or not.. */ 73/* Kill _all_ buffers and pagecache , dirty or not.. */
87void kill_bdev(struct block_device *bdev) 74void kill_bdev(struct block_device *bdev)
88{ 75{
@@ -116,8 +103,6 @@ EXPORT_SYMBOL(invalidate_bdev);
116 103
117int set_blocksize(struct block_device *bdev, int size) 104int set_blocksize(struct block_device *bdev, int size)
118{ 105{
119 struct address_space *mapping;
120
121 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 106 /* Size must be a power of two, and between 512 and PAGE_SIZE */
122 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) 107 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
123 return -EINVAL; 108 return -EINVAL;
@@ -126,19 +111,6 @@ int set_blocksize(struct block_device *bdev, int size)
126 if (size < bdev_logical_block_size(bdev)) 111 if (size < bdev_logical_block_size(bdev))
127 return -EINVAL; 112 return -EINVAL;
128 113
129 /* Prevent starting I/O or mapping the device */
130 percpu_down_write(&bdev->bd_block_size_semaphore);
131
132 /* Check that the block device is not memory mapped */
133 mapping = bdev->bd_inode->i_mapping;
134 mutex_lock(&mapping->i_mmap_mutex);
135 if (mapping_mapped(mapping)) {
136 mutex_unlock(&mapping->i_mmap_mutex);
137 percpu_up_write(&bdev->bd_block_size_semaphore);
138 return -EBUSY;
139 }
140 mutex_unlock(&mapping->i_mmap_mutex);
141
142 /* Don't change the size if it is same as current */ 114 /* Don't change the size if it is same as current */
143 if (bdev->bd_block_size != size) { 115 if (bdev->bd_block_size != size) {
144 sync_blockdev(bdev); 116 sync_blockdev(bdev);
@@ -146,9 +118,6 @@ int set_blocksize(struct block_device *bdev, int size)
146 bdev->bd_inode->i_blkbits = blksize_bits(size); 118 bdev->bd_inode->i_blkbits = blksize_bits(size);
147 kill_bdev(bdev); 119 kill_bdev(bdev);
148 } 120 }
149
150 percpu_up_write(&bdev->bd_block_size_semaphore);
151
152 return 0; 121 return 0;
153} 122}
154 123
@@ -181,52 +150,12 @@ static int
181blkdev_get_block(struct inode *inode, sector_t iblock, 150blkdev_get_block(struct inode *inode, sector_t iblock,
182 struct buffer_head *bh, int create) 151 struct buffer_head *bh, int create)
183{ 152{
184 if (iblock >= blkdev_max_block(I_BDEV(inode))) {
185 if (create)
186 return -EIO;
187
188 /*
189 * for reads, we're just trying to fill a partial page.
190 * return a hole, they will have to call get_block again
191 * before they can fill it, and they will get -EIO at that
192 * time
193 */
194 return 0;
195 }
196 bh->b_bdev = I_BDEV(inode); 153 bh->b_bdev = I_BDEV(inode);
197 bh->b_blocknr = iblock; 154 bh->b_blocknr = iblock;
198 set_buffer_mapped(bh); 155 set_buffer_mapped(bh);
199 return 0; 156 return 0;
200} 157}
201 158
202static int
203blkdev_get_blocks(struct inode *inode, sector_t iblock,
204 struct buffer_head *bh, int create)
205{
206 sector_t end_block = blkdev_max_block(I_BDEV(inode));
207 unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
208
209 if ((iblock + max_blocks) > end_block) {
210 max_blocks = end_block - iblock;
211 if ((long)max_blocks <= 0) {
212 if (create)
213 return -EIO; /* write fully beyond EOF */
214 /*
215 * It is a read which is fully beyond EOF. We return
216 * a !buffer_mapped buffer
217 */
218 max_blocks = 0;
219 }
220 }
221
222 bh->b_bdev = I_BDEV(inode);
223 bh->b_blocknr = iblock;
224 bh->b_size = max_blocks << inode->i_blkbits;
225 if (max_blocks)
226 set_buffer_mapped(bh);
227 return 0;
228}
229
230static ssize_t 159static ssize_t
231blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 160blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
232 loff_t offset, unsigned long nr_segs) 161 loff_t offset, unsigned long nr_segs)
@@ -235,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
235 struct inode *inode = file->f_mapping->host; 164 struct inode *inode = file->f_mapping->host;
236 165
237 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, 166 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
238 nr_segs, blkdev_get_blocks, NULL, NULL, 0); 167 nr_segs, blkdev_get_block, NULL, NULL, 0);
239} 168}
240 169
241int __sync_blockdev(struct block_device *bdev, int wait) 170int __sync_blockdev(struct block_device *bdev, int wait)
@@ -392,7 +321,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
392 * for a block special file file->f_path.dentry->d_inode->i_size is zero 321 * for a block special file file->f_path.dentry->d_inode->i_size is zero
393 * so we compute the size by hand (just as in block_read/write above) 322 * so we compute the size by hand (just as in block_read/write above)
394 */ 323 */
395static loff_t block_llseek(struct file *file, loff_t offset, int origin) 324static loff_t block_llseek(struct file *file, loff_t offset, int whence)
396{ 325{
397 struct inode *bd_inode = file->f_mapping->host; 326 struct inode *bd_inode = file->f_mapping->host;
398 loff_t size; 327 loff_t size;
@@ -402,7 +331,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
402 size = i_size_read(bd_inode); 331 size = i_size_read(bd_inode);
403 332
404 retval = -EINVAL; 333 retval = -EINVAL;
405 switch (origin) { 334 switch (whence) {
406 case SEEK_END: 335 case SEEK_END:
407 offset += size; 336 offset += size;
408 break; 337 break;
@@ -459,12 +388,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
459 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); 388 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
460 if (!ei) 389 if (!ei)
461 return NULL; 390 return NULL;
462
463 if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
464 kmem_cache_free(bdev_cachep, ei);
465 return NULL;
466 }
467
468 return &ei->vfs_inode; 391 return &ei->vfs_inode;
469} 392}
470 393
@@ -473,8 +396,6 @@ static void bdev_i_callback(struct rcu_head *head)
473 struct inode *inode = container_of(head, struct inode, i_rcu); 396 struct inode *inode = container_of(head, struct inode, i_rcu);
474 struct bdev_inode *bdi = BDEV_I(inode); 397 struct bdev_inode *bdi = BDEV_I(inode);
475 398
476 percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
477
478 kmem_cache_free(bdev_cachep, bdi); 399 kmem_cache_free(bdev_cachep, bdi);
479} 400}
480 401
@@ -1593,22 +1514,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1593 return blkdev_ioctl(bdev, mode, cmd, arg); 1514 return blkdev_ioctl(bdev, mode, cmd, arg);
1594} 1515}
1595 1516
1596ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1597 unsigned long nr_segs, loff_t pos)
1598{
1599 ssize_t ret;
1600 struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
1601
1602 percpu_down_read(&bdev->bd_block_size_semaphore);
1603
1604 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
1605
1606 percpu_up_read(&bdev->bd_block_size_semaphore);
1607
1608 return ret;
1609}
1610EXPORT_SYMBOL_GPL(blkdev_aio_read);
1611
1612/* 1517/*
1613 * Write data to the block device. Only intended for the block device itself 1518 * Write data to the block device. Only intended for the block device itself
1614 * and the raw driver which basically is a fake block device. 1519 * and the raw driver which basically is a fake block device.
@@ -1620,16 +1525,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1620 unsigned long nr_segs, loff_t pos) 1525 unsigned long nr_segs, loff_t pos)
1621{ 1526{
1622 struct file *file = iocb->ki_filp; 1527 struct file *file = iocb->ki_filp;
1623 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1624 struct blk_plug plug; 1528 struct blk_plug plug;
1625 ssize_t ret; 1529 ssize_t ret;
1626 1530
1627 BUG_ON(iocb->ki_pos != pos); 1531 BUG_ON(iocb->ki_pos != pos);
1628 1532
1629 blk_start_plug(&plug); 1533 blk_start_plug(&plug);
1630
1631 percpu_down_read(&bdev->bd_block_size_semaphore);
1632
1633 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1534 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1634 if (ret > 0 || ret == -EIOCBQUEUED) { 1535 if (ret > 0 || ret == -EIOCBQUEUED) {
1635 ssize_t err; 1536 ssize_t err;
@@ -1638,62 +1539,27 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1638 if (err < 0 && ret > 0) 1539 if (err < 0 && ret > 0)
1639 ret = err; 1540 ret = err;
1640 } 1541 }
1641
1642 percpu_up_read(&bdev->bd_block_size_semaphore);
1643
1644 blk_finish_plug(&plug); 1542 blk_finish_plug(&plug);
1645
1646 return ret; 1543 return ret;
1647} 1544}
1648EXPORT_SYMBOL_GPL(blkdev_aio_write); 1545EXPORT_SYMBOL_GPL(blkdev_aio_write);
1649 1546
1650static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) 1547static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1651{ 1548 unsigned long nr_segs, loff_t pos)
1652 int ret;
1653 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1654
1655 percpu_down_read(&bdev->bd_block_size_semaphore);
1656
1657 ret = generic_file_mmap(file, vma);
1658
1659 percpu_up_read(&bdev->bd_block_size_semaphore);
1660
1661 return ret;
1662}
1663
1664static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos,
1665 struct pipe_inode_info *pipe, size_t len,
1666 unsigned int flags)
1667{
1668 ssize_t ret;
1669 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1670
1671 percpu_down_read(&bdev->bd_block_size_semaphore);
1672
1673 ret = generic_file_splice_read(file, ppos, pipe, len, flags);
1674
1675 percpu_up_read(&bdev->bd_block_size_semaphore);
1676
1677 return ret;
1678}
1679
1680static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe,
1681 struct file *file, loff_t *ppos, size_t len,
1682 unsigned int flags)
1683{ 1549{
1684 ssize_t ret; 1550 struct file *file = iocb->ki_filp;
1685 struct block_device *bdev = I_BDEV(file->f_mapping->host); 1551 struct inode *bd_inode = file->f_mapping->host;
1686 1552 loff_t size = i_size_read(bd_inode);
1687 percpu_down_read(&bdev->bd_block_size_semaphore);
1688
1689 ret = generic_file_splice_write(pipe, file, ppos, len, flags);
1690 1553
1691 percpu_up_read(&bdev->bd_block_size_semaphore); 1554 if (pos >= size)
1555 return 0;
1692 1556
1693 return ret; 1557 size -= pos;
1558 if (size < INT_MAX)
1559 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
1560 return generic_file_aio_read(iocb, iov, nr_segs, pos);
1694} 1561}
1695 1562
1696
1697/* 1563/*
1698 * Try to release a page associated with block device when the system 1564 * Try to release a page associated with block device when the system
1699 * is under memory pressure. 1565 * is under memory pressure.
@@ -1724,16 +1590,16 @@ const struct file_operations def_blk_fops = {
1724 .llseek = block_llseek, 1590 .llseek = block_llseek,
1725 .read = do_sync_read, 1591 .read = do_sync_read,
1726 .write = do_sync_write, 1592 .write = do_sync_write,
1727 .aio_read = blkdev_aio_read, 1593 .aio_read = blkdev_aio_read,
1728 .aio_write = blkdev_aio_write, 1594 .aio_write = blkdev_aio_write,
1729 .mmap = blkdev_mmap, 1595 .mmap = generic_file_mmap,
1730 .fsync = blkdev_fsync, 1596 .fsync = blkdev_fsync,
1731 .unlocked_ioctl = block_ioctl, 1597 .unlocked_ioctl = block_ioctl,
1732#ifdef CONFIG_COMPAT 1598#ifdef CONFIG_COMPAT
1733 .compat_ioctl = compat_blkdev_ioctl, 1599 .compat_ioctl = compat_blkdev_ioctl,
1734#endif 1600#endif
1735 .splice_read = blkdev_splice_read, 1601 .splice_read = generic_file_splice_read,
1736 .splice_write = blkdev_splice_write, 1602 .splice_write = generic_file_splice_write,
1737}; 1603};
1738 1604
1739int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) 1605int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d7fcdba141a2..7df3e0f0ee51 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o ulist.o qgroup.o send.o 11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0c16e3dbfd56..e15d2b0d8d3b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
121 ret = posix_acl_equiv_mode(acl, &inode->i_mode); 121 ret = posix_acl_equiv_mode(acl, &inode->i_mode);
122 if (ret < 0) 122 if (ret < 0)
123 return ret; 123 return ret;
124 if (ret == 0)
125 acl = NULL;
124 } 126 }
125 ret = 0; 127 ret = 0;
126 break; 128 break;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 208d8aa5b07e..04edf69be875 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
461 pos2 = n2, n2 = pos2->next) { 461 pos2 = n2, n2 = pos2->next) {
462 struct __prelim_ref *ref2; 462 struct __prelim_ref *ref2;
463 struct __prelim_ref *xchg; 463 struct __prelim_ref *xchg;
464 struct extent_inode_elem *eie;
464 465
465 ref2 = list_entry(pos2, struct __prelim_ref, list); 466 ref2 = list_entry(pos2, struct __prelim_ref, list);
466 467
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
472 ref1 = ref2; 473 ref1 = ref2;
473 ref2 = xchg; 474 ref2 = xchg;
474 } 475 }
475 ref1->count += ref2->count;
476 } else { 476 } else {
477 if (ref1->parent != ref2->parent) 477 if (ref1->parent != ref2->parent)
478 continue; 478 continue;
479 ref1->count += ref2->count;
480 } 479 }
480
481 eie = ref1->inode_list;
482 while (eie && eie->next)
483 eie = eie->next;
484 if (eie)
485 eie->next = ref2->inode_list;
486 else
487 ref1->inode_list = ref2->inode_list;
488 ref1->count += ref2->count;
489
481 list_del(&ref2->list); 490 list_del(&ref2->list);
482 kfree(ref2); 491 kfree(ref2);
483 } 492 }
@@ -890,8 +899,7 @@ again:
890 while (!list_empty(&prefs)) { 899 while (!list_empty(&prefs)) {
891 ref = list_first_entry(&prefs, struct __prelim_ref, list); 900 ref = list_first_entry(&prefs, struct __prelim_ref, list);
892 list_del(&ref->list); 901 list_del(&ref->list);
893 if (ref->count < 0) 902 WARN_ON(ref->count < 0);
894 WARN_ON(1);
895 if (ref->count && ref->root_id && ref->parent == 0) { 903 if (ref->count && ref->root_id && ref->parent == 0) {
896 /* no parent == root of tree */ 904 /* no parent == root of tree */
897 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); 905 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ed8ca7ca5eff..2a8c242bc4f5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,7 @@
39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
41#define BTRFS_INODE_NEEDS_FULL_SYNC 7 41#define BTRFS_INODE_NEEDS_FULL_SYNC 7
42#define BTRFS_INODE_COPY_EVERYTHING 8
42 43
43/* in memory btrfs inode */ 44/* in memory btrfs inode */
44struct btrfs_inode { 45struct btrfs_inode {
@@ -90,6 +91,9 @@ struct btrfs_inode {
90 91
91 unsigned long runtime_flags; 92 unsigned long runtime_flags;
92 93
94 /* Keep track of who's O_SYNC/fsycing currently */
95 atomic_t sync_writers;
96
93 /* full 64 bit generation number, struct vfs_inode doesn't have a big 97 /* full 64 bit generation number, struct vfs_inode doesn't have a big
94 * enough field for this. 98 * enough field for this.
95 */ 99 */
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5a3e45db642a..11d47bfb62b4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -137,7 +137,7 @@ struct btrfsic_block {
137 unsigned int never_written:1; /* block was added because it was 137 unsigned int never_written:1; /* block was added because it was
138 * referenced, not because it was 138 * referenced, not because it was
139 * written */ 139 * written */
140 unsigned int mirror_num:2; /* large enough to hold 140 unsigned int mirror_num; /* large enough to hold
141 * BTRFS_SUPER_MIRROR_MAX */ 141 * BTRFS_SUPER_MIRROR_MAX */
142 struct btrfsic_dev_state *dev_state; 142 struct btrfsic_dev_state *dev_state;
143 u64 dev_bytenr; /* key, physical byte num on disk */ 143 u64 dev_bytenr; /* key, physical byte num on disk */
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
723 } 723 }
724 724
725 num_copies = 725 num_copies =
726 btrfs_num_copies(&state->root->fs_info->mapping_tree, 726 btrfs_num_copies(state->root->fs_info,
727 next_bytenr, state->metablock_size); 727 next_bytenr, state->metablock_size);
728 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 728 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
729 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 729 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
903 } 903 }
904 904
905 num_copies = 905 num_copies =
906 btrfs_num_copies(&state->root->fs_info->mapping_tree, 906 btrfs_num_copies(state->root->fs_info,
907 next_bytenr, state->metablock_size); 907 next_bytenr, state->metablock_size);
908 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 908 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
909 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 909 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
1287 *next_blockp = NULL; 1287 *next_blockp = NULL;
1288 if (0 == *num_copiesp) { 1288 if (0 == *num_copiesp) {
1289 *num_copiesp = 1289 *num_copiesp =
1290 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1290 btrfs_num_copies(state->root->fs_info,
1291 next_bytenr, state->metablock_size); 1291 next_bytenr, state->metablock_size);
1292 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1292 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1293 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1293 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(
1489 chunk_len = num_bytes; 1489 chunk_len = num_bytes;
1490 1490
1491 num_copies = 1491 num_copies =
1492 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1492 btrfs_num_copies(state->root->fs_info,
1493 next_bytenr, state->datablock_size); 1493 next_bytenr, state->datablock_size);
1494 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1494 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1495 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1495 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1582 struct btrfs_device *device; 1582 struct btrfs_device *device;
1583 1583
1584 length = len; 1584 length = len;
1585 ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, 1585 ret = btrfs_map_block(state->root->fs_info, READ,
1586 bytenr, &length, &multi, mirror_num); 1586 bytenr, &length, &multi, mirror_num);
1587 1587
1588 if (ret) {
1589 block_ctx_out->start = 0;
1590 block_ctx_out->dev_bytenr = 0;
1591 block_ctx_out->len = 0;
1592 block_ctx_out->dev = NULL;
1593 block_ctx_out->datav = NULL;
1594 block_ctx_out->pagev = NULL;
1595 block_ctx_out->mem_to_free = NULL;
1596
1597 return ret;
1598 }
1599
1588 device = multi->stripes[0].dev; 1600 device = multi->stripes[0].dev;
1589 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); 1601 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
1590 block_ctx_out->dev_bytenr = multi->stripes[0].physical; 1602 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1594 block_ctx_out->pagev = NULL; 1606 block_ctx_out->pagev = NULL;
1595 block_ctx_out->mem_to_free = NULL; 1607 block_ctx_out->mem_to_free = NULL;
1596 1608
1597 if (0 == ret) 1609 kfree(multi);
1598 kfree(multi);
1599 if (NULL == block_ctx_out->dev) { 1610 if (NULL == block_ctx_out->dev) {
1600 ret = -ENXIO; 1611 ret = -ENXIO;
1601 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); 1612 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(
2463 } 2474 }
2464 2475
2465 num_copies = 2476 num_copies =
2466 btrfs_num_copies(&state->root->fs_info->mapping_tree, 2477 btrfs_num_copies(state->root->fs_info,
2467 next_bytenr, BTRFS_SUPER_INFO_SIZE); 2478 next_bytenr, BTRFS_SUPER_INFO_SIZE);
2468 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 2479 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2469 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 2480 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2960 struct btrfsic_block_data_ctx block_ctx; 2971 struct btrfsic_block_data_ctx block_ctx;
2961 int match = 0; 2972 int match = 0;
2962 2973
2963 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, 2974 num_copies = btrfs_num_copies(state->root->fs_info,
2964 bytenr, state->metablock_size); 2975 bytenr, state->metablock_size);
2965 2976
2966 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2977 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
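[editor's note] Every check-integrity call site above switches btrfs_num_copies() from taking &fs_info->mapping_tree to taking fs_info itself; presumably (an assumption, the diff only shows the prototype change) so the helper can also consult state beyond the chunk map, such as the dev-replace target introduced later in this commit. The calling pattern is unchanged, as in the mirror loop at the end of this hunk; condensed:

    int num_copies, mirror_num;

    num_copies = btrfs_num_copies(fs_info, bytenr, len);
    for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
            /* read or compare one mirror; bytenr and len are hypothetical inputs */
    }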
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6467aa88bee..94ab2f80e7e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
687 687
688 ret = btrfs_map_bio(root, READ, comp_bio, 688 ret = btrfs_map_bio(root, READ, comp_bio,
689 mirror_num, 0); 689 mirror_num, 0);
690 BUG_ON(ret); /* -ENOMEM */ 690 if (ret)
691 bio_endio(comp_bio, ret);
691 692
692 bio_put(comp_bio); 693 bio_put(comp_bio);
693 694
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
712 } 713 }
713 714
714 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 715 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
715 BUG_ON(ret); /* -ENOMEM */ 716 if (ret)
717 bio_endio(comp_bio, ret);
716 718
717 bio_put(comp_bio); 719 bio_put(comp_bio);
718 return 0; 720 return 0;
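[editor's note] Both compressed-read submission paths now propagate a btrfs_map_bio() failure through bio_endio() instead of crashing on BUG_ON(ret), so the error reaches the normal end_io completion path. The pattern, taken directly from the hunks above:

    ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
    if (ret)
            bio_endio(comp_bio, ret);   /* fail the bio instead of BUG_ON(ret) */

    bio_put(comp_bio);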
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdfb4c49a806..c7b67cf24bba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38 struct extent_buffer *dst_buf, 38 struct extent_buffer *dst_buf,
39 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
41 struct btrfs_path *path, int level, int slot, 41 struct btrfs_path *path, int level, int slot);
42 int tree_mod_log);
43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 42static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 struct extent_buffer *eb); 43 struct extent_buffer *eb);
45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, 44struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
776 775
777static noinline void 776static noinline void
778tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 777tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
779 struct extent_buffer *eb, 778 struct extent_buffer *eb, int slot, int atomic)
780 struct btrfs_disk_key *disk_key, int slot, int atomic)
781{ 779{
782 int ret; 780 int ret;
783 781
@@ -1140,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1140 switch (tm->op) { 1138 switch (tm->op) {
1141 case MOD_LOG_KEY_REMOVE_WHILE_FREEING: 1139 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1142 BUG_ON(tm->slot < n); 1140 BUG_ON(tm->slot < n);
1143 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1144 case MOD_LOG_KEY_REMOVE: 1141 case MOD_LOG_KEY_REMOVE:
1142 n++;
1143 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1145 btrfs_set_node_key(eb, &tm->key, tm->slot); 1144 btrfs_set_node_key(eb, &tm->key, tm->slot);
1146 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 1145 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1147 btrfs_set_node_ptr_generation(eb, tm->slot, 1146 btrfs_set_node_ptr_generation(eb, tm->slot,
1148 tm->generation); 1147 tm->generation);
1149 n++;
1150 break; 1148 break;
1151 case MOD_LOG_KEY_REPLACE: 1149 case MOD_LOG_KEY_REPLACE:
1152 BUG_ON(tm->slot >= n); 1150 BUG_ON(tm->slot >= n);
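[editor's note] The reordered cases change when n, the rewound item count, is bumped: a plain KEY_REMOVE must grow the node by one before the key is put back, REMOVE_WHILE_MOVING restores the key without changing n, and REMOVE_WHILE_FREEING additionally asserts the slot lies beyond the current count. The fallthrough lets all three share the restore code. A reduced sketch of the new control flow (field restores elided into a comment):

    switch (tm->op) {
    case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
            BUG_ON(tm->slot < n);
            /* fall through */
    case MOD_LOG_KEY_REMOVE:
            n++;                    /* the removed slot exists again */
            /* fall through */
    case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
            /* restore key, blockptr and generation at tm->slot */
            break;
    }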
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1361 u64 search_start; 1359 u64 search_start;
1362 int ret; 1360 int ret;
1363 1361
1364 if (trans->transaction != root->fs_info->running_transaction) { 1362 if (trans->transaction != root->fs_info->running_transaction)
1365 printk(KERN_CRIT "trans %llu running %llu\n", 1363 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1366 (unsigned long long)trans->transid, 1364 (unsigned long long)trans->transid,
1367 (unsigned long long) 1365 (unsigned long long)
1368 root->fs_info->running_transaction->transid); 1366 root->fs_info->running_transaction->transid);
1369 WARN_ON(1); 1367
1370 } 1368 if (trans->transid != root->fs_info->generation)
1371 if (trans->transid != root->fs_info->generation) { 1369 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1372 printk(KERN_CRIT "trans %llu running %llu\n",
1373 (unsigned long long)trans->transid, 1370 (unsigned long long)trans->transid,
1374 (unsigned long long)root->fs_info->generation); 1371 (unsigned long long)root->fs_info->generation);
1375 WARN_ON(1);
1376 }
1377 1372
1378 if (!should_cow_block(trans, root, buf)) { 1373 if (!should_cow_block(trans, root, buf)) {
1379 *cow_ret = buf; 1374 *cow_ret = buf;
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1469 if (cache_only && parent_level != 1) 1464 if (cache_only && parent_level != 1)
1470 return 0; 1465 return 0;
1471 1466
1472 if (trans->transaction != root->fs_info->running_transaction) 1467 WARN_ON(trans->transaction != root->fs_info->running_transaction);
1473 WARN_ON(1); 1468 WARN_ON(trans->transid != root->fs_info->generation);
1474 if (trans->transid != root->fs_info->generation)
1475 WARN_ON(1);
1476 1469
1477 parent_nritems = btrfs_header_nritems(parent); 1470 parent_nritems = btrfs_header_nritems(parent);
1478 blocksize = btrfs_level_size(root, parent_level - 1); 1471 blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1827 if (btrfs_header_nritems(right) == 0) { 1820 if (btrfs_header_nritems(right) == 0) {
1828 clean_tree_block(trans, root, right); 1821 clean_tree_block(trans, root, right);
1829 btrfs_tree_unlock(right); 1822 btrfs_tree_unlock(right);
1830 del_ptr(trans, root, path, level + 1, pslot + 1, 1); 1823 del_ptr(trans, root, path, level + 1, pslot + 1);
1831 root_sub_used(root, right->len); 1824 root_sub_used(root, right->len);
1832 btrfs_free_tree_block(trans, root, right, 0, 1); 1825 btrfs_free_tree_block(trans, root, right, 0, 1);
1833 free_extent_buffer_stale(right); 1826 free_extent_buffer_stale(right);
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1836 struct btrfs_disk_key right_key; 1829 struct btrfs_disk_key right_key;
1837 btrfs_node_key(right, &right_key, 0); 1830 btrfs_node_key(right, &right_key, 0);
1838 tree_mod_log_set_node_key(root->fs_info, parent, 1831 tree_mod_log_set_node_key(root->fs_info, parent,
1839 &right_key, pslot + 1, 0); 1832 pslot + 1, 0);
1840 btrfs_set_node_key(parent, &right_key, pslot + 1); 1833 btrfs_set_node_key(parent, &right_key, pslot + 1);
1841 btrfs_mark_buffer_dirty(parent); 1834 btrfs_mark_buffer_dirty(parent);
1842 } 1835 }
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1871 if (btrfs_header_nritems(mid) == 0) { 1864 if (btrfs_header_nritems(mid) == 0) {
1872 clean_tree_block(trans, root, mid); 1865 clean_tree_block(trans, root, mid);
1873 btrfs_tree_unlock(mid); 1866 btrfs_tree_unlock(mid);
1874 del_ptr(trans, root, path, level + 1, pslot, 1); 1867 del_ptr(trans, root, path, level + 1, pslot);
1875 root_sub_used(root, mid->len); 1868 root_sub_used(root, mid->len);
1876 btrfs_free_tree_block(trans, root, mid, 0, 1); 1869 btrfs_free_tree_block(trans, root, mid, 0, 1);
1877 free_extent_buffer_stale(mid); 1870 free_extent_buffer_stale(mid);
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1880 /* update the parent key to reflect our changes */ 1873 /* update the parent key to reflect our changes */
1881 struct btrfs_disk_key mid_key; 1874 struct btrfs_disk_key mid_key;
1882 btrfs_node_key(mid, &mid_key, 0); 1875 btrfs_node_key(mid, &mid_key, 0);
1883 tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, 1876 tree_mod_log_set_node_key(root->fs_info, parent,
1884 pslot, 0); 1877 pslot, 0);
1885 btrfs_set_node_key(parent, &mid_key, pslot); 1878 btrfs_set_node_key(parent, &mid_key, pslot);
1886 btrfs_mark_buffer_dirty(parent); 1879 btrfs_mark_buffer_dirty(parent);
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1980 orig_slot += left_nr; 1973 orig_slot += left_nr;
1981 btrfs_node_key(mid, &disk_key, 0); 1974 btrfs_node_key(mid, &disk_key, 0);
1982 tree_mod_log_set_node_key(root->fs_info, parent, 1975 tree_mod_log_set_node_key(root->fs_info, parent,
1983 &disk_key, pslot, 0); 1976 pslot, 0);
1984 btrfs_set_node_key(parent, &disk_key, pslot); 1977 btrfs_set_node_key(parent, &disk_key, pslot);
1985 btrfs_mark_buffer_dirty(parent); 1978 btrfs_mark_buffer_dirty(parent);
1986 if (btrfs_header_nritems(left) > orig_slot) { 1979 if (btrfs_header_nritems(left) > orig_slot) {
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2033 2026
2034 btrfs_node_key(right, &disk_key, 0); 2027 btrfs_node_key(right, &disk_key, 0);
2035 tree_mod_log_set_node_key(root->fs_info, parent, 2028 tree_mod_log_set_node_key(root->fs_info, parent,
2036 &disk_key, pslot + 1, 0); 2029 pslot + 1, 0);
2037 btrfs_set_node_key(parent, &disk_key, pslot + 1); 2030 btrfs_set_node_key(parent, &disk_key, pslot + 1);
2038 btrfs_mark_buffer_dirty(parent); 2031 btrfs_mark_buffer_dirty(parent);
2039 2032
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
2219 int no_skips = 0; 2212 int no_skips = 0;
2220 struct extent_buffer *t; 2213 struct extent_buffer *t;
2221 2214
2215 if (path->really_keep_locks)
2216 return;
2217
2222 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2218 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2223 if (!path->nodes[i]) 2219 if (!path->nodes[i])
2224 break; 2220 break;
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2266{ 2262{
2267 int i; 2263 int i;
2268 2264
2269 if (path->keep_locks) 2265 if (path->keep_locks || path->really_keep_locks)
2270 return; 2266 return;
2271 2267
2272 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2268 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2499 if (!cow) 2495 if (!cow)
2500 write_lock_level = -1; 2496 write_lock_level = -1;
2501 2497
2502 if (cow && (p->keep_locks || p->lowest_level)) 2498 if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
2503 write_lock_level = BTRFS_MAX_LEVEL; 2499 write_lock_level = BTRFS_MAX_LEVEL;
2504 2500
2505 min_write_lock_level = write_lock_level; 2501 min_write_lock_level = write_lock_level;
@@ -2568,7 +2564,10 @@ again:
2568 * must have write locks on this node and the 2564 * must have write locks on this node and the
2569 * parent 2565 * parent
2570 */ 2566 */
2571 if (level + 1 > write_lock_level) { 2567 if (level > write_lock_level ||
2568 (level + 1 > write_lock_level &&
2569 level + 1 < BTRFS_MAX_LEVEL &&
2570 p->nodes[level + 1])) {
2572 write_lock_level = level + 1; 2571 write_lock_level = level + 1;
2573 btrfs_release_path(p); 2572 btrfs_release_path(p);
2574 goto again; 2573 goto again;
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
2917 if (!path->nodes[i]) 2916 if (!path->nodes[i])
2918 break; 2917 break;
2919 t = path->nodes[i]; 2918 t = path->nodes[i];
2920 tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); 2919 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2921 btrfs_set_node_key(t, key, tslot); 2920 btrfs_set_node_key(t, key, tslot);
2922 btrfs_mark_buffer_dirty(path->nodes[i]); 2921 btrfs_mark_buffer_dirty(path->nodes[i]);
2923 if (tslot != 0) 2922 if (tslot != 0)
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3302 */ 3301 */
3303static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3302static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3304{ 3303{
3304 struct btrfs_item *start_item;
3305 struct btrfs_item *end_item;
3306 struct btrfs_map_token token;
3305 int data_len; 3307 int data_len;
3306 int nritems = btrfs_header_nritems(l); 3308 int nritems = btrfs_header_nritems(l);
3307 int end = min(nritems, start + nr) - 1; 3309 int end = min(nritems, start + nr) - 1;
3308 3310
3309 if (!nr) 3311 if (!nr)
3310 return 0; 3312 return 0;
3311 data_len = btrfs_item_end_nr(l, start); 3313 btrfs_init_map_token(&token);
3312 data_len = data_len - btrfs_item_offset_nr(l, end); 3314 start_item = btrfs_item_nr(l, start);
3315 end_item = btrfs_item_nr(l, end);
3316 data_len = btrfs_token_item_offset(l, start_item, &token) +
3317 btrfs_token_item_size(l, start_item, &token);
3318 data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3313 data_len += sizeof(struct btrfs_item) * nr; 3319 data_len += sizeof(struct btrfs_item) * nr;
3314 WARN_ON(data_len < 0); 3320 WARN_ON(data_len < 0);
3315 return data_len; 3321 return data_len;
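[editor's note] leaf_space_used() now reads the item fields through a btrfs_map_token, which caches the kmapped address of the extent-buffer page across the three reads instead of remapping for each accessor call. Arithmetically the result is the same as before: end of the first item minus offset of the last, i.e. offset(start) + size(start) - offset(end). A sketch of the token pattern:

    struct btrfs_map_token token;
    u32 off, size;

    btrfs_init_map_token(&token);                    /* token.kaddr starts out NULL */
    off = btrfs_token_item_offset(l, item, &token);  /* maps the page and caches it */
    size = btrfs_token_item_size(l, item, &token);   /* reuses the cached mapping */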
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3403 if (push_items == 0) 3409 if (push_items == 0)
3404 goto out_unlock; 3410 goto out_unlock;
3405 3411
3406 if (!empty && push_items == left_nritems) 3412 WARN_ON(!empty && push_items == left_nritems);
3407 WARN_ON(1);
3408 3413
3409 /* push left to right */ 3414 /* push left to right */
3410 right_nritems = btrfs_header_nritems(right); 3415 right_nritems = btrfs_header_nritems(right);
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3642 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3647 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3643 3648
3644 /* fixup right node */ 3649 /* fixup right node */
3645 if (push_items > right_nritems) { 3650 if (push_items > right_nritems)
3646 printk(KERN_CRIT "push items %d nr %u\n", push_items, 3651 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3647 right_nritems); 3652 right_nritems);
3648 WARN_ON(1);
3649 }
3650 3653
3651 if (push_items < right_nritems) { 3654 if (push_items < right_nritems) {
3652 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3655 push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4602,16 +4605,21 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4602 * empty a node. 4605 * empty a node.
4603 */ 4606 */
4604static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4607static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4605 struct btrfs_path *path, int level, int slot, 4608 struct btrfs_path *path, int level, int slot)
4606 int tree_mod_log)
4607{ 4609{
4608 struct extent_buffer *parent = path->nodes[level]; 4610 struct extent_buffer *parent = path->nodes[level];
4609 u32 nritems; 4611 u32 nritems;
4610 int ret; 4612 int ret;
4611 4613
4614 if (level) {
4615 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4616 MOD_LOG_KEY_REMOVE);
4617 BUG_ON(ret < 0);
4618 }
4619
4612 nritems = btrfs_header_nritems(parent); 4620 nritems = btrfs_header_nritems(parent);
4613 if (slot != nritems - 1) { 4621 if (slot != nritems - 1) {
4614 if (tree_mod_log && level) 4622 if (level)
4615 tree_mod_log_eb_move(root->fs_info, parent, slot, 4623 tree_mod_log_eb_move(root->fs_info, parent, slot,
4616 slot + 1, nritems - slot - 1); 4624 slot + 1, nritems - slot - 1);
4617 memmove_extent_buffer(parent, 4625 memmove_extent_buffer(parent,
@@ -4619,10 +4627,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4619 btrfs_node_key_ptr_offset(slot + 1), 4627 btrfs_node_key_ptr_offset(slot + 1),
4620 sizeof(struct btrfs_key_ptr) * 4628 sizeof(struct btrfs_key_ptr) *
4621 (nritems - slot - 1)); 4629 (nritems - slot - 1));
4622 } else if (tree_mod_log && level) {
4623 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4624 MOD_LOG_KEY_REMOVE);
4625 BUG_ON(ret < 0);
4626 } 4630 }
4627 4631
4628 nritems--; 4632 nritems--;
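[editor's note] del_ptr() loses its tree_mod_log parameter, and the KEY_REMOVE record is now written unconditionally for internal nodes. Before this change it was only written in the branch that skipped the memmove (deletion of the last slot), so removals from the middle of a node logged the eb_move but never the key removal itself. Condensed from the new function body:

    if (level) {
            ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
                                          MOD_LOG_KEY_REMOVE);
            BUG_ON(ret < 0);
    }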
@@ -4656,7 +4660,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4656 struct extent_buffer *leaf) 4660 struct extent_buffer *leaf)
4657{ 4661{
4658 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4662 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4659 del_ptr(trans, root, path, 1, path->slots[1], 1); 4663 del_ptr(trans, root, path, 1, path->slots[1]);
4660 4664
4661 /* 4665 /*
4662 * btrfs_free_extent is expensive, we want to make sure we 4666 * btrfs_free_extent is expensive, we want to make sure we
@@ -5123,13 +5127,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5123 right_path->search_commit_root = 1; 5127 right_path->search_commit_root = 1;
5124 right_path->skip_locking = 1; 5128 right_path->skip_locking = 1;
5125 5129
5126 spin_lock(&left_root->root_times_lock); 5130 spin_lock(&left_root->root_item_lock);
5127 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); 5131 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5128 spin_unlock(&left_root->root_times_lock); 5132 spin_unlock(&left_root->root_item_lock);
5129 5133
5130 spin_lock(&right_root->root_times_lock); 5134 spin_lock(&right_root->root_item_lock);
5131 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); 5135 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5132 spin_unlock(&right_root->root_times_lock); 5136 spin_unlock(&right_root->root_item_lock);
5133 5137
5134 trans = btrfs_join_transaction(left_root); 5138 trans = btrfs_join_transaction(left_root);
5135 if (IS_ERR(trans)) { 5139 if (IS_ERR(trans)) {
@@ -5224,15 +5228,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5224 goto out; 5228 goto out;
5225 } 5229 }
5226 5230
5227 spin_lock(&left_root->root_times_lock); 5231 spin_lock(&left_root->root_item_lock);
5228 ctransid = btrfs_root_ctransid(&left_root->root_item); 5232 ctransid = btrfs_root_ctransid(&left_root->root_item);
5229 spin_unlock(&left_root->root_times_lock); 5233 spin_unlock(&left_root->root_item_lock);
5230 if (ctransid != left_start_ctransid) 5234 if (ctransid != left_start_ctransid)
5231 left_start_ctransid = 0; 5235 left_start_ctransid = 0;
5232 5236
5233 spin_lock(&right_root->root_times_lock); 5237 spin_lock(&right_root->root_item_lock);
5234 ctransid = btrfs_root_ctransid(&right_root->root_item); 5238 ctransid = btrfs_root_ctransid(&right_root->root_item);
5235 spin_unlock(&right_root->root_times_lock); 5239 spin_unlock(&right_root->root_item_lock);
5236 if (ctransid != right_start_ctransid) 5240 if (ctransid != right_start_ctransid)
5237 right_start_ctransid = 0; 5241 right_start_ctransid = 0;
5238 5242
@@ -5496,6 +5500,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5496 return btrfs_next_old_leaf(root, path, 0); 5500 return btrfs_next_old_leaf(root, path, 0);
5497} 5501}
5498 5502
5503/* Release the path up to but not including the given level */
5504static void btrfs_release_level(struct btrfs_path *path, int level)
5505{
5506 int i;
5507
5508 for (i = 0; i < level; i++) {
5509 path->slots[i] = 0;
5510 if (!path->nodes[i])
5511 continue;
5512 if (path->locks[i]) {
5513 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
5514 path->locks[i] = 0;
5515 }
5516 free_extent_buffer(path->nodes[i]);
5517 path->nodes[i] = NULL;
5518 }
5519}
5520
5521/*
5522 * This function assumes 2 things
5523 *
5524 * 1) You are using path->keep_locks
5525 * 2) You are not inserting items.
5526 *
5531 * If either of these is not true, do not use this function. If you need a next
5532 * leaf with either of these not being true, this function can easily be
5529 * adapted to do that, but at the moment these are the limitations.
5530 */
5531int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
5532 struct btrfs_root *root, struct btrfs_path *path,
5533 int del)
5534{
5535 struct extent_buffer *b;
5536 struct btrfs_key key;
5537 u32 nritems;
5538 int level = 1;
5539 int slot;
5540 int ret = 1;
5541 int write_lock_level = BTRFS_MAX_LEVEL;
5542 int ins_len = del ? -1 : 0;
5543
5544 WARN_ON(!(path->keep_locks || path->really_keep_locks));
5545
5546 nritems = btrfs_header_nritems(path->nodes[0]);
5547 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5548
5549 while (path->nodes[level]) {
5550 nritems = btrfs_header_nritems(path->nodes[level]);
5551 if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
5552search:
5553 btrfs_release_path(path);
5554 ret = btrfs_search_slot(trans, root, &key, path,
5555 ins_len, 1);
5556 if (ret < 0)
5557 goto out;
5558 level = 1;
5559 continue;
5560 }
5561
5562 if (path->slots[level] >= nritems - 1) {
5563 level++;
5564 continue;
5565 }
5566
5567 btrfs_release_level(path, level);
5568 break;
5569 }
5570
5571 if (!path->nodes[level]) {
5572 ret = 1;
5573 goto out;
5574 }
5575
5576 path->slots[level]++;
5577 b = path->nodes[level];
5578
5579 while (b) {
5580 level = btrfs_header_level(b);
5581
5582 if (!should_cow_block(trans, root, b))
5583 goto cow_done;
5584
5585 btrfs_set_path_blocking(path);
5586 ret = btrfs_cow_block(trans, root, b,
5587 path->nodes[level + 1],
5588 path->slots[level + 1], &b);
5589 if (ret)
5590 goto out;
5591cow_done:
5592 path->nodes[level] = b;
5593 btrfs_clear_path_blocking(path, NULL, 0);
5594 if (level != 0) {
5595 ret = setup_nodes_for_search(trans, root, path, b,
5596 level, ins_len,
5597 &write_lock_level);
5598 if (ret == -EAGAIN)
5599 goto search;
5600 if (ret)
5601 goto out;
5602
5603 b = path->nodes[level];
5604 slot = path->slots[level];
5605
5606 ret = read_block_for_search(trans, root, path,
5607 &b, level, slot, &key, 0);
5608 if (ret == -EAGAIN)
5609 goto search;
5610 if (ret)
5611 goto out;
5612 level = btrfs_header_level(b);
5613 if (!btrfs_try_tree_write_lock(b)) {
5614 btrfs_set_path_blocking(path);
5615 btrfs_tree_lock(b);
5616 btrfs_clear_path_blocking(path, b,
5617 BTRFS_WRITE_LOCK);
5618 }
5619 path->locks[level] = BTRFS_WRITE_LOCK;
5620 path->nodes[level] = b;
5621 path->slots[level] = 0;
5622 } else {
5623 path->slots[level] = 0;
5624 ret = 0;
5625 break;
5626 }
5627 }
5628
5629out:
5630 if (ret)
5631 btrfs_release_path(path);
5632
5633 return ret;
5634}
5635
5499int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 5636int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5500 u64 time_seq) 5637 u64 time_seq)
5501{ 5638{
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c72ead869507..547b7b05727f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
48 48
49#define BTRFS_MAGIC "_BHRfS_M" 49#define BTRFS_MAGIC "_BHRfS_M"
50 50
51#define BTRFS_MAX_MIRRORS 2 51#define BTRFS_MAX_MIRRORS 3
52 52
53#define BTRFS_MAX_LEVEL 8 53#define BTRFS_MAX_LEVEL 8
54 54
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
142 142
143#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 143#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
144 144
145#define BTRFS_DEV_REPLACE_DEVID 0
146
145/* 147/*
146 * the max metadata block size. This limit is somewhat artificial, 148 * the max metadata block size. This limit is somewhat artificial,
147 * but the memmove costs go through the roof for larger blocks. 149 * but the memmove costs go through the roof for larger blocks.
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
172/* four bytes for CRC32 */ 174/* four bytes for CRC32 */
173#define BTRFS_EMPTY_DIR_SIZE 0 175#define BTRFS_EMPTY_DIR_SIZE 0
174 176
 177/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
178#define REQ_GET_READ_MIRRORS (1 << 30)
179
175#define BTRFS_FT_UNKNOWN 0 180#define BTRFS_FT_UNKNOWN 0
176#define BTRFS_FT_REG_FILE 1 181#define BTRFS_FT_REG_FILE 1
177#define BTRFS_FT_DIR 2 182#define BTRFS_FT_DIR 2
@@ -413,7 +418,7 @@ struct btrfs_root_backup {
413 __le64 bytes_used; 418 __le64 bytes_used;
414 __le64 num_devices; 419 __le64 num_devices;
415 /* future */ 420 /* future */
416 __le64 unsed_64[4]; 421 __le64 unused_64[4];
417 422
418 u8 tree_root_level; 423 u8 tree_root_level;
419 u8 chunk_root_level; 424 u8 chunk_root_level;
@@ -571,6 +576,7 @@ struct btrfs_path {
571 unsigned int skip_locking:1; 576 unsigned int skip_locking:1;
572 unsigned int leave_spinning:1; 577 unsigned int leave_spinning:1;
573 unsigned int search_commit_root:1; 578 unsigned int search_commit_root:1;
579 unsigned int really_keep_locks:1;
574}; 580};
575 581
576/* 582/*
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {
885 __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; 891 __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
886} __attribute__ ((__packed__)); 892} __attribute__ ((__packed__));
887 893
894#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
895#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
896#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
897#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
898#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
899#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
900#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
901
902struct btrfs_dev_replace {
903 u64 replace_state; /* see #define above */
904 u64 time_started; /* seconds since 1-Jan-1970 */
905 u64 time_stopped; /* seconds since 1-Jan-1970 */
906 atomic64_t num_write_errors;
907 atomic64_t num_uncorrectable_read_errors;
908
909 u64 cursor_left;
910 u64 committed_cursor_left;
911 u64 cursor_left_last_write_of_item;
912 u64 cursor_right;
913
914 u64 cont_reading_from_srcdev_mode; /* see #define above */
915
916 int is_valid;
917 int item_needs_writeback;
918 struct btrfs_device *srcdev;
919 struct btrfs_device *tgtdev;
920
921 pid_t lock_owner;
922 atomic_t nesting_level;
923 struct mutex lock_finishing_cancel_unmount;
924 struct mutex lock_management_lock;
925 struct mutex lock;
926
927 struct btrfs_scrub_progress scrub_progress;
928};
929
930struct btrfs_dev_replace_item {
931 /*
932 * grow this item struct at the end for future enhancements and keep
933 * the existing values unchanged
934 */
935 __le64 src_devid;
936 __le64 cursor_left;
937 __le64 cursor_right;
938 __le64 cont_reading_from_srcdev_mode;
939
940 __le64 replace_state;
941 __le64 time_started;
942 __le64 time_stopped;
943 __le64 num_write_errors;
944 __le64 num_uncorrectable_read_errors;
945} __attribute__ ((__packed__));
946
888/* different types of block groups (and chunks) */ 947/* different types of block groups (and chunks) */
889#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) 948#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
890#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) 949#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info {
1333 struct btrfs_workers generic_worker; 1392 struct btrfs_workers generic_worker;
1334 struct btrfs_workers workers; 1393 struct btrfs_workers workers;
1335 struct btrfs_workers delalloc_workers; 1394 struct btrfs_workers delalloc_workers;
1395 struct btrfs_workers flush_workers;
1336 struct btrfs_workers endio_workers; 1396 struct btrfs_workers endio_workers;
1337 struct btrfs_workers endio_meta_workers; 1397 struct btrfs_workers endio_meta_workers;
1338 struct btrfs_workers endio_meta_write_workers; 1398 struct btrfs_workers endio_meta_write_workers;
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info {
1429 struct rw_semaphore scrub_super_lock; 1489 struct rw_semaphore scrub_super_lock;
1430 int scrub_workers_refcnt; 1490 int scrub_workers_refcnt;
1431 struct btrfs_workers scrub_workers; 1491 struct btrfs_workers scrub_workers;
1492 struct btrfs_workers scrub_wr_completion_workers;
1493 struct btrfs_workers scrub_nocow_workers;
1432 1494
1433#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1495#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1434 u32 check_integrity_print_mask; 1496 u32 check_integrity_print_mask;
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info {
1470 int backup_root_index; 1532 int backup_root_index;
1471 1533
1472 int num_tolerated_disk_barrier_failures; 1534 int num_tolerated_disk_barrier_failures;
1535
1536 /* device replace state */
1537 struct btrfs_dev_replace dev_replace;
1538
1539 atomic_t mutually_exclusive_operation_running;
1473}; 1540};
1474 1541
1475/* 1542/*
@@ -1579,7 +1646,7 @@ struct btrfs_root {
1579 1646
1580 int force_cow; 1647 int force_cow;
1581 1648
1582 spinlock_t root_times_lock; 1649 spinlock_t root_item_lock;
1583}; 1650};
1584 1651
1585struct btrfs_ioctl_defrag_range_args { 1652struct btrfs_ioctl_defrag_range_args {
@@ -1723,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
1723#define BTRFS_DEV_STATS_KEY 249 1790#define BTRFS_DEV_STATS_KEY 249
1724 1791
1725/* 1792/*
 1793 * Persistently stores the device replace state in the device tree.
1794 * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
1795 */
1796#define BTRFS_DEV_REPLACE_KEY 250
1797
1798/*
1726 * string items are for debugging. They just store a short string of 1799 * string items are for debugging. They just store a short string of
1727 * data in the FS 1800 * data in the FS
1728 */ 1801 */
@@ -1787,7 +1860,7 @@ struct btrfs_map_token {
1787 1860
1788static inline void btrfs_init_map_token (struct btrfs_map_token *token) 1861static inline void btrfs_init_map_token (struct btrfs_map_token *token)
1789{ 1862{
1790 memset(token, 0, sizeof(*token)); 1863 token->kaddr = NULL;
1791} 1864}
1792 1865
1793/* some macros to generate set/get funcs for the struct fields. This 1866/* some macros to generate set/get funcs for the struct fields. This
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
2755BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, 2828BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
2756 rsv_excl, 64); 2829 rsv_excl, 64);
2757 2830
2831/* btrfs_dev_replace_item */
2832BTRFS_SETGET_FUNCS(dev_replace_src_devid,
2833 struct btrfs_dev_replace_item, src_devid, 64);
2834BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
2835 struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
2836 64);
2837BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
2838 replace_state, 64);
2839BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
2840 time_started, 64);
2841BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
2842 time_stopped, 64);
2843BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
2844 num_write_errors, 64);
2845BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
2846 struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
2847 64);
2848BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
2849 cursor_left, 64);
2850BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
2851 cursor_right, 64);
2852
2853BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
2854 struct btrfs_dev_replace_item, src_devid, 64);
2855BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
2856 struct btrfs_dev_replace_item,
2857 cont_reading_from_srcdev_mode, 64);
2858BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
2859 struct btrfs_dev_replace_item, replace_state, 64);
2860BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
2861 struct btrfs_dev_replace_item, time_started, 64);
2862BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
2863 struct btrfs_dev_replace_item, time_stopped, 64);
2864BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
2865 struct btrfs_dev_replace_item, num_write_errors, 64);
2866BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
2867 struct btrfs_dev_replace_item,
2868 num_uncorrectable_read_errors, 64);
2869BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
2870 struct btrfs_dev_replace_item, cursor_left, 64);
2871BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
2872 struct btrfs_dev_replace_item, cursor_right, 64);
2873
2758static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2874static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2759{ 2875{
2760 return sb->s_fs_info; 2876 return sb->s_fs_info;
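[editor's note] The BTRFS_SETGET_FUNCS and BTRFS_SETGET_STACK_FUNCS invocations above generate endian-safe accessors for every field of btrfs_dev_replace_item: one set reads and writes through an extent_buffer, the other operates on an in-stack copy. The new dev-replace.c later in this diff uses them exactly like any other item accessor:

    ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
    dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
    btrfs_set_dev_replace_cursor_right(eb, ptr, dev_replace->cursor_right);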
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
2900u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 3016u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2901u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3017u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2902void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 3018void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
3019
3020enum btrfs_reserve_flush_enum {
3021 /* If we are in the transaction, we can't flush anything. */
3022 BTRFS_RESERVE_NO_FLUSH,
3023 /*
3024 * Flushing delalloc may cause a deadlock somewhere; in that
3025 * case, use FLUSH_LIMIT
3026 */
3027 BTRFS_RESERVE_FLUSH_LIMIT,
3028 BTRFS_RESERVE_FLUSH_ALL,
3029};
3030
2903int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 3031int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2904void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 3032void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2905void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3033void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
2919void btrfs_free_block_rsv(struct btrfs_root *root, 3047void btrfs_free_block_rsv(struct btrfs_root *root,
2920 struct btrfs_block_rsv *rsv); 3048 struct btrfs_block_rsv *rsv);
2921int btrfs_block_rsv_add(struct btrfs_root *root, 3049int btrfs_block_rsv_add(struct btrfs_root *root,
2922 struct btrfs_block_rsv *block_rsv, 3050 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
2923 u64 num_bytes); 3051 enum btrfs_reserve_flush_enum flush);
2924int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2925 struct btrfs_block_rsv *block_rsv,
2926 u64 num_bytes);
2927int btrfs_block_rsv_check(struct btrfs_root *root, 3052int btrfs_block_rsv_check(struct btrfs_root *root,
2928 struct btrfs_block_rsv *block_rsv, int min_factor); 3053 struct btrfs_block_rsv *block_rsv, int min_factor);
2929int btrfs_block_rsv_refill(struct btrfs_root *root, 3054int btrfs_block_rsv_refill(struct btrfs_root *root,
2930 struct btrfs_block_rsv *block_rsv, 3055 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
2931 u64 min_reserved); 3056 enum btrfs_reserve_flush_enum flush);
2932int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2933 struct btrfs_block_rsv *block_rsv,
2934 u64 min_reserved);
2935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3057int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2936 struct btrfs_block_rsv *dst_rsv, 3058 struct btrfs_block_rsv *dst_rsv,
2937 u64 num_bytes); 3059 u64 num_bytes);
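[editor's note] The _noflush variants of btrfs_block_rsv_add() and btrfs_block_rsv_refill() are folded into the base functions, which now take a btrfs_reserve_flush_enum telling the reservation code how aggressively it may flush. Callers that used the noflush variants pass BTRFS_RESERVE_NO_FLUSH, as delayed-inode.c does later in this diff:

    /* old: ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); */
    ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
                              BTRFS_RESERVE_NO_FLUSH);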
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2955int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 3077int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2956int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3078int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2957 struct btrfs_fs_info *fs_info); 3079 struct btrfs_fs_info *fs_info);
3080int __get_raid_index(u64 flags);
2958/* ctree.c */ 3081/* ctree.c */
2959int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3082int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2960 int level, int *slot); 3083 int level, int *slot);
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
3065} 3188}
3066 3189
3067int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 3190int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
3191int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
3192 struct btrfs_root *root, struct btrfs_path *path,
3193 int del);
3068int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 3194int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
3069 u64 time_seq); 3195 u64 time_seq);
3070static inline int btrfs_next_old_item(struct btrfs_root *root, 3196static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
3157 struct btrfs_root *root); 3283 struct btrfs_root *root);
3158 3284
3159/* dir-item.c */ 3285/* dir-item.c */
3286int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
3287 const char *name, int name_len);
3160int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 3288int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
3161 struct btrfs_root *root, const char *name, 3289 struct btrfs_root *root, const char *name,
3162 int name_len, struct inode *dir, 3290 int name_len, struct inode *dir,
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
3256 struct btrfs_root *root, 3384 struct btrfs_root *root,
3257 struct btrfs_path *path, u64 objectid, 3385 struct btrfs_path *path, u64 objectid,
3258 u64 bytenr, int mod); 3386 u64 bytenr, int mod);
3387u64 btrfs_file_extent_length(struct btrfs_path *path);
3259int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 3388int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
3260 struct btrfs_root *root, 3389 struct btrfs_root *root,
3261 struct btrfs_ordered_sum *sums); 3390 struct btrfs_ordered_sum *sums);
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
3271int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 3400int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
3272 struct list_head *list, int search_commit); 3401 struct list_head *list, int search_commit);
3273/* inode.c */ 3402/* inode.c */
3403struct btrfs_delalloc_work {
3404 struct inode *inode;
3405 int wait;
3406 int delay_iput;
3407 struct completion completion;
3408 struct list_head list;
3409 struct btrfs_work work;
3410};
3411
3412struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
3413 int wait, int delay_iput);
3414void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
3415
3274struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 3416struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
3275 size_t pg_offset, u64 start, u64 len, 3417 size_t pg_offset, u64 start, u64 len,
3276 int create); 3418 int create);
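[editor's note] btrfs_delalloc_work packages a per-inode flush request for the new flush_workers pool declared earlier in this header; the embedded completion lets a submitter queue several inodes and then wait on each in turn. A hypothetical submit-and-wait sketch; only the struct and the two helpers come from this diff, and the NULL-on-failure return is an assumption:

    struct btrfs_delalloc_work *work;

    work = btrfs_alloc_delalloc_work(inode, 1 /* wait */, 0 /* delay_iput */);
    if (!work)
            return -ENOMEM;    /* assumption: allocation failure returns NULL */
    /* hypothetical: queue work->work on fs_info->flush_workers here */
    btrfs_wait_and_free_delalloc_work(work);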
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
3370 struct btrfs_ioctl_space_info *space); 3512 struct btrfs_ioctl_space_info *space);
3371 3513
3372/* file.c */ 3514/* file.c */
3515int btrfs_auto_defrag_init(void);
3516void btrfs_auto_defrag_exit(void);
3373int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3517int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
3374 struct inode *inode); 3518 struct inode *inode);
3375int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3519int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
3520void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
3376int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3521int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
3377void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 3522void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
3378 int skip_pinned); 3523 int skip_pinned);
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
3519 struct btrfs_pending_snapshot *pending); 3664 struct btrfs_pending_snapshot *pending);
3520 3665
3521/* scrub.c */ 3666/* scrub.c */
3522int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 3667int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3523 struct btrfs_scrub_progress *progress, int readonly); 3668 u64 end, struct btrfs_scrub_progress *progress,
3669 int readonly, int is_dev_replace);
3524void btrfs_scrub_pause(struct btrfs_root *root); 3670void btrfs_scrub_pause(struct btrfs_root *root);
3525void btrfs_scrub_pause_super(struct btrfs_root *root); 3671void btrfs_scrub_pause_super(struct btrfs_root *root);
3526void btrfs_scrub_continue(struct btrfs_root *root); 3672void btrfs_scrub_continue(struct btrfs_root *root);
3527void btrfs_scrub_continue_super(struct btrfs_root *root); 3673void btrfs_scrub_continue_super(struct btrfs_root *root);
3528int __btrfs_scrub_cancel(struct btrfs_fs_info *info); 3674int btrfs_scrub_cancel(struct btrfs_fs_info *info);
3529int btrfs_scrub_cancel(struct btrfs_root *root); 3675int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
3530int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); 3676 struct btrfs_device *dev);
3531int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); 3677int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
3532int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 3678int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3533 struct btrfs_scrub_progress *progress); 3679 struct btrfs_scrub_progress *progress);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 478f66bdc57b..34836036f01b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(
651 */ 651 */
652 if (!src_rsv || (!trans->bytes_reserved && 652 if (!src_rsv || (!trans->bytes_reserved &&
653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
655 BTRFS_RESERVE_NO_FLUSH);
655 /* 656 /*
656 * Since we're under a transaction reserve_metadata_bytes could 657 * Since we're under a transaction reserve_metadata_bytes could
657 * try to commit the transaction which will make it return 658 * try to commit the transaction which will make it return
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
686 * reserve something strictly for us. If not be a pain and try 687 * reserve something strictly for us. If not be a pain and try
687 * to steal from the delalloc block rsv. 688 * to steal from the delalloc block rsv.
688 */ 689 */
689 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 690 ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
691 BTRFS_RESERVE_NO_FLUSH);
690 if (!ret) 692 if (!ret)
691 goto out; 693 goto out;
692 694
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1255 struct btrfs_delayed_node *delayed_node = NULL; 1257 struct btrfs_delayed_node *delayed_node = NULL;
1256 struct btrfs_root *root; 1258 struct btrfs_root *root;
1257 struct btrfs_block_rsv *block_rsv; 1259 struct btrfs_block_rsv *block_rsv;
1258 unsigned long nr = 0;
1259 int need_requeue = 0; 1260 int need_requeue = 0;
1260 int ret; 1261 int ret;
1261 1262
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1316 delayed_node); 1317 delayed_node);
1317 mutex_unlock(&delayed_node->mutex); 1318 mutex_unlock(&delayed_node->mutex);
1318 1319
1319 nr = trans->blocks_used;
1320
1321 trans->block_rsv = block_rsv; 1320 trans->block_rsv = block_rsv;
1322 btrfs_end_transaction_dmeta(trans, root); 1321 btrfs_end_transaction_dmeta(trans, root);
1323 __btrfs_btree_balance_dirty(root, nr); 1322 btrfs_btree_balance_dirty_nodelay(root);
1324free_path: 1323free_path:
1325 btrfs_free_path(path); 1324 btrfs_free_path(path);
1326out: 1325out:
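[editor's note] Everything from here on is the new dev-replace machinery: an item in the device tree records the replace state (the BTRFS_DEV_REPLACE_ITEM_STATE_* values defined in ctree.h above) so an interrupted replace can be resumed after a remount, and btrfs_init_dev_replace() below rebuilds the in-memory struct btrfs_dev_replace from that item at mount time. A condensed sketch of the state handling it performs:

    switch (dev_replace->replace_state) {
    case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
    case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
    case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
            /* nothing in flight: no source or target device to look up */
            break;
    case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
    case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
            /* a replace was interrupted: re-resolve srcdev and tgtdev */
            break;
    }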
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000000..66dbc8dbddf7
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/slab.h>
21#include <linux/buffer_head.h>
22#include <linux/blkdev.h>
23#include <linux/random.h>
24#include <linux/iocontext.h>
25#include <linux/capability.h>
26#include <linux/kthread.h>
27#include <linux/math64.h>
28#include <asm/div64.h>
29#include "compat.h"
30#include "ctree.h"
31#include "extent_map.h"
32#include "disk-io.h"
33#include "transaction.h"
34#include "print-tree.h"
35#include "volumes.h"
36#include "async-thread.h"
37#include "check-integrity.h"
38#include "rcu-string.h"
39#include "dev-replace.h"
40
41static u64 btrfs_get_seconds_since_1970(void);
42static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
43 int scrub_ret);
44static void btrfs_dev_replace_update_device_in_mapping_tree(
45 struct btrfs_fs_info *fs_info,
46 struct btrfs_device *srcdev,
47 struct btrfs_device *tgtdev);
48static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
49 char *srcdev_name,
50 struct btrfs_device **device);
51static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
52static int btrfs_dev_replace_kthread(void *data);
53static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
54
55
56int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
57{
58 struct btrfs_key key;
59 struct btrfs_root *dev_root = fs_info->dev_root;
60 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
61 struct extent_buffer *eb;
62 int slot;
63 int ret = 0;
64 struct btrfs_path *path = NULL;
65 int item_size;
66 struct btrfs_dev_replace_item *ptr;
67 u64 src_devid;
68
69 path = btrfs_alloc_path();
70 if (!path) {
71 ret = -ENOMEM;
72 goto out;
73 }
74
75 key.objectid = 0;
76 key.type = BTRFS_DEV_REPLACE_KEY;
77 key.offset = 0;
78 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
79 if (ret) {
80no_valid_dev_replace_entry_found:
81 ret = 0;
82 dev_replace->replace_state =
83 BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
84 dev_replace->cont_reading_from_srcdev_mode =
85 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
86 dev_replace->replace_state = 0;
87 dev_replace->time_started = 0;
88 dev_replace->time_stopped = 0;
89 atomic64_set(&dev_replace->num_write_errors, 0);
90 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
91 dev_replace->cursor_left = 0;
92 dev_replace->committed_cursor_left = 0;
93 dev_replace->cursor_left_last_write_of_item = 0;
94 dev_replace->cursor_right = 0;
95 dev_replace->srcdev = NULL;
96 dev_replace->tgtdev = NULL;
97 dev_replace->is_valid = 0;
98 dev_replace->item_needs_writeback = 0;
99 goto out;
100 }
101 slot = path->slots[0];
102 eb = path->nodes[0];
103 item_size = btrfs_item_size_nr(eb, slot);
104 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
105
106 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
107 pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
108 goto no_valid_dev_replace_entry_found;
109 }
110
111 src_devid = btrfs_dev_replace_src_devid(eb, ptr);
112 dev_replace->cont_reading_from_srcdev_mode =
113 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
114 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
115 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
116 dev_replace->time_stopped =
117 btrfs_dev_replace_time_stopped(eb, ptr);
118 atomic64_set(&dev_replace->num_write_errors,
119 btrfs_dev_replace_num_write_errors(eb, ptr));
120 atomic64_set(&dev_replace->num_uncorrectable_read_errors,
121 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
122 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
123 dev_replace->committed_cursor_left = dev_replace->cursor_left;
124 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
125 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
126 dev_replace->is_valid = 1;
127
128 dev_replace->item_needs_writeback = 0;
129 switch (dev_replace->replace_state) {
130 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
131 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
132 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
133 dev_replace->srcdev = NULL;
134 dev_replace->tgtdev = NULL;
135 break;
136 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
137 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
138 dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
139 NULL, NULL);
140 dev_replace->tgtdev = btrfs_find_device(fs_info,
141 BTRFS_DEV_REPLACE_DEVID,
142 NULL, NULL);
143 /*
144 * allow 'btrfs dev replace_cancel' if src/tgt device is
145 * missing
146 */
147 if (!dev_replace->srcdev &&
148 !btrfs_test_opt(dev_root, DEGRADED)) {
149 ret = -EIO;
150 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
151 (unsigned long long)src_devid);
152 }
153 if (!dev_replace->tgtdev &&
154 !btrfs_test_opt(dev_root, DEGRADED)) {
155 ret = -EIO;
156 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
157 (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
158 }
159 if (dev_replace->tgtdev) {
160 if (dev_replace->srcdev) {
161 dev_replace->tgtdev->total_bytes =
162 dev_replace->srcdev->total_bytes;
163 dev_replace->tgtdev->disk_total_bytes =
164 dev_replace->srcdev->disk_total_bytes;
165 dev_replace->tgtdev->bytes_used =
166 dev_replace->srcdev->bytes_used;
167 }
168 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
169 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
170 dev_replace->tgtdev);
171 }
172 break;
173 }
174
175out:
176 if (path)
177 btrfs_free_path(path);
178 return ret;
179}
180
181/*
182 * called from commit_transaction. Writes changed device replace state to
183 * disk.
184 */
185int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
186 struct btrfs_fs_info *fs_info)
187{
188 int ret;
189 struct btrfs_root *dev_root = fs_info->dev_root;
190 struct btrfs_path *path;
191 struct btrfs_key key;
192 struct extent_buffer *eb;
193 struct btrfs_dev_replace_item *ptr;
194 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
195
196 btrfs_dev_replace_lock(dev_replace);
197 if (!dev_replace->is_valid ||
198 !dev_replace->item_needs_writeback) {
199 btrfs_dev_replace_unlock(dev_replace);
200 return 0;
201 }
202 btrfs_dev_replace_unlock(dev_replace);
203
204 key.objectid = 0;
205 key.type = BTRFS_DEV_REPLACE_KEY;
206 key.offset = 0;
207
208 path = btrfs_alloc_path();
209 if (!path) {
210 ret = -ENOMEM;
211 goto out;
212 }
213 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
214 if (ret < 0) {
215 pr_warn("btrfs: error %d while searching for dev_replace item!\n",
216 ret);
217 goto out;
218 }
219
220 if (ret == 0 &&
221 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
222 /*
 223 * need to delete the old one and insert a new one.
224 * Since no attempt is made to recover any old state, if the
225 * dev_replace state is 'running', the data on the target
226 * drive is lost.
227 * It would be possible to recover the state: just make sure
228 * that the beginning of the item is never changed and always
229 * contains all the essential information. Then read this
230 * minimal set of information and use it as a base for the
231 * new state.
232 */
233 ret = btrfs_del_item(trans, dev_root, path);
234 if (ret != 0) {
235 pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
236 ret);
237 goto out;
238 }
239 ret = 1;
240 }
241
242 if (ret == 1) {
243 /* need to insert a new item */
244 btrfs_release_path(path);
245 ret = btrfs_insert_empty_item(trans, dev_root, path,
246 &key, sizeof(*ptr));
247 if (ret < 0) {
248 pr_warn("btrfs: insert dev_replace item failed %d!\n",
249 ret);
250 goto out;
251 }
252 }
253
254 eb = path->nodes[0];
255 ptr = btrfs_item_ptr(eb, path->slots[0],
256 struct btrfs_dev_replace_item);
257
258 btrfs_dev_replace_lock(dev_replace);
259 if (dev_replace->srcdev)
260 btrfs_set_dev_replace_src_devid(eb, ptr,
261 dev_replace->srcdev->devid);
262 else
263 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
264 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
265 dev_replace->cont_reading_from_srcdev_mode);
266 btrfs_set_dev_replace_replace_state(eb, ptr,
267 dev_replace->replace_state);
268 btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
269 btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
270 btrfs_set_dev_replace_num_write_errors(eb, ptr,
271 atomic64_read(&dev_replace->num_write_errors));
272 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
273 atomic64_read(&dev_replace->num_uncorrectable_read_errors));
274 dev_replace->cursor_left_last_write_of_item =
275 dev_replace->cursor_left;
276 btrfs_set_dev_replace_cursor_left(eb, ptr,
277 dev_replace->cursor_left_last_write_of_item);
278 btrfs_set_dev_replace_cursor_right(eb, ptr,
279 dev_replace->cursor_right);
280 dev_replace->item_needs_writeback = 0;
281 btrfs_dev_replace_unlock(dev_replace);
282
283 btrfs_mark_buffer_dirty(eb);
284
285out:
286 btrfs_free_path(path);
287
288 return ret;
289}
290
291void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
292{
293 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
294
295 dev_replace->committed_cursor_left =
296 dev_replace->cursor_left_last_write_of_item;
297}
298
299static u64 btrfs_get_seconds_since_1970(void)
300{
301 struct timespec t = CURRENT_TIME_SEC;
302
303 return t.tv_sec;
304}
305
306int btrfs_dev_replace_start(struct btrfs_root *root,
307 struct btrfs_ioctl_dev_replace_args *args)
308{
309 struct btrfs_trans_handle *trans;
310 struct btrfs_fs_info *fs_info = root->fs_info;
311 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
312 int ret;
313 struct btrfs_device *tgt_device = NULL;
314 struct btrfs_device *src_device = NULL;
315
316 switch (args->start.cont_reading_from_srcdev_mode) {
317 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
318 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
319 break;
320 default:
321 return -EINVAL;
322 }
323
324 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
325 args->start.tgtdev_name[0] == '\0')
326 return -EINVAL;
327
328 mutex_lock(&fs_info->volume_mutex);
329 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
330 &tgt_device);
331 if (ret) {
332 pr_err("btrfs: target device %s is invalid!\n",
333 args->start.tgtdev_name);
334 mutex_unlock(&fs_info->volume_mutex);
335 return -EINVAL;
336 }
337
338 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
339 args->start.srcdev_name,
340 &src_device);
341 mutex_unlock(&fs_info->volume_mutex);
342 if (ret) {
343 ret = -EINVAL;
344 goto leave_no_lock;
345 }
346
347 if (tgt_device->total_bytes < src_device->total_bytes) {
348 pr_err("btrfs: target device is smaller than source device!\n");
349 ret = -EINVAL;
350 goto leave_no_lock;
351 }
352
353 btrfs_dev_replace_lock(dev_replace);
354 switch (dev_replace->replace_state) {
355 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
356 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
357 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
358 break;
359 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
360 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
361 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
362 goto leave;
363 }
364
365 dev_replace->cont_reading_from_srcdev_mode =
366 args->start.cont_reading_from_srcdev_mode;
367 WARN_ON(!src_device);
368 dev_replace->srcdev = src_device;
369 WARN_ON(!tgt_device);
370 dev_replace->tgtdev = tgt_device;
371
372 printk_in_rcu(KERN_INFO
373 "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
374 src_device->missing ? "<missing disk>" :
375 rcu_str_deref(src_device->name),
376 src_device->devid,
377 rcu_str_deref(tgt_device->name));
378
379 tgt_device->total_bytes = src_device->total_bytes;
380 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
381 tgt_device->bytes_used = src_device->bytes_used;
382
383 /*
384 * from now on, the writes to the srcdev are all duplicated to
385 * go to the tgtdev as well (refer to btrfs_map_block()).
386 */
387 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
388 dev_replace->time_started = btrfs_get_seconds_since_1970();
389 dev_replace->cursor_left = 0;
390 dev_replace->committed_cursor_left = 0;
391 dev_replace->cursor_left_last_write_of_item = 0;
392 dev_replace->cursor_right = 0;
393 dev_replace->is_valid = 1;
394 dev_replace->item_needs_writeback = 1;
395 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
396 btrfs_dev_replace_unlock(dev_replace);
397
398 btrfs_wait_ordered_extents(root, 0);
399
400 /* force writing the updated state information to disk */
401 trans = btrfs_start_transaction(root, 0);
402 if (IS_ERR(trans)) {
403 ret = PTR_ERR(trans);
404 btrfs_dev_replace_lock(dev_replace);
405 goto leave;
406 }
407
408 ret = btrfs_commit_transaction(trans, root);
409 WARN_ON(ret);
410
411 /* the disk copy procedure reuses the scrub code */
412 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
413 src_device->total_bytes,
414 &dev_replace->scrub_progress, 0, 1);
415
416 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
417 WARN_ON(ret);
418
419 return 0;
420
421leave:
422 dev_replace->srcdev = NULL;
423 dev_replace->tgtdev = NULL;
424 btrfs_dev_replace_unlock(dev_replace);
425leave_no_lock:
426 if (tgt_device)
427 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
428 return ret;
429}
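/*
 * Illustration (not part of the original file): userspace reaches
 * btrfs_dev_replace_start() via the dev_replace ioctl, typically through
 * btrfs-progs, e.g.:
 *
 *     btrfs replace start /dev/sdb /dev/sdc /mnt
 *     btrfs replace status /mnt
 *
 * The source device may be named by path or by devid, the target only
 * by path, matching the argument checks at the top of this function.
 */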
430
431static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
432 int scrub_ret)
433{
434 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
435 struct btrfs_device *tgt_device;
436 struct btrfs_device *src_device;
437 struct btrfs_root *root = fs_info->tree_root;
438 u8 uuid_tmp[BTRFS_UUID_SIZE];
439 struct btrfs_trans_handle *trans;
440 int ret = 0;
441
442 /* don't allow cancel or unmount to disturb the finishing procedure */
443 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
444
445 btrfs_dev_replace_lock(dev_replace);
446 /* was the operation canceled, or is it finished? */
447 if (dev_replace->replace_state !=
448 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
449 btrfs_dev_replace_unlock(dev_replace);
450 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
451 return 0;
452 }
453
454 tgt_device = dev_replace->tgtdev;
455 src_device = dev_replace->srcdev;
456 btrfs_dev_replace_unlock(dev_replace);
457
458 /* replace old device with new one in mapping tree */
459 if (!scrub_ret)
460 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
461 src_device,
462 tgt_device);
463
464 /*
465 * flush all outstanding I/O and inode extent mappings before the
466 * copy operation is declared as being finished
467 */
468 btrfs_start_delalloc_inodes(root, 0);
469 btrfs_wait_ordered_extents(root, 0);
470
471 trans = btrfs_start_transaction(root, 0);
472 if (IS_ERR(trans)) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return PTR_ERR(trans);
475 }
476 ret = btrfs_commit_transaction(trans, root);
477 WARN_ON(ret);
478
479 /* keep away write_all_supers() during the finishing procedure */
480 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
481 btrfs_dev_replace_lock(dev_replace);
482 dev_replace->replace_state =
483 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
484 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
485 dev_replace->tgtdev = NULL;
486 dev_replace->srcdev = NULL;
487 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
488 dev_replace->item_needs_writeback = 1;
489
490 if (scrub_ret) {
491 printk_in_rcu(KERN_ERR
492 "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
493 src_device->missing ? "<missing disk>" :
494 rcu_str_deref(src_device->name),
495 src_device->devid,
496 rcu_str_deref(tgt_device->name), scrub_ret);
497 btrfs_dev_replace_unlock(dev_replace);
498 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
499 if (tgt_device)
500 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
501 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
502
503 return 0;
504 }
505
506 printk_in_rcu(KERN_INFO
507 "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
508 src_device->missing ? "<missing disk>" :
509 rcu_str_deref(src_device->name),
510 src_device->devid,
511 rcu_str_deref(tgt_device->name));
512 tgt_device->is_tgtdev_for_dev_replace = 0;
513 tgt_device->devid = src_device->devid;
514 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
515 tgt_device->bytes_used = src_device->bytes_used;
516 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
517 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
518 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
519 tgt_device->total_bytes = src_device->total_bytes;
520 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
521 tgt_device->bytes_used = src_device->bytes_used;
522 if (fs_info->sb->s_bdev == src_device->bdev)
523 fs_info->sb->s_bdev = tgt_device->bdev;
524 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
525 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
526 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
527
528 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
529 if (src_device->bdev) {
530 /* zero out the old super */
531 btrfs_scratch_superblock(src_device);
532 }
533 /*
534 * this is again a consistent state where no dev_replace procedure
535 * is running, the target device is part of the filesystem, the
536 * source device is no longer part of the filesystem and its first
537 * superblock is scratched out so that it is no longer marked as
538 * belonging to this filesystem.
539 */
540 btrfs_dev_replace_unlock(dev_replace);
541 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
542
543 /* write back the superblocks */
544 trans = btrfs_start_transaction(root, 0);
545 if (!IS_ERR(trans))
546 btrfs_commit_transaction(trans, root);
547
548 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
549
550 return 0;
551}
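/*
 * Worked example of the swap above (illustrative, assuming
 * BTRFS_DEV_REPLACE_DEVID is the reserved devid 0): if the source was
 * devid 1, the target enters this function as devid 0 and leaves it as
 * devid 1 carrying the source's uuid, while the source ends up holding
 * devid 0 and the target's old uuid until it is removed. Seen from the
 * chunk tree and the superblocks, the replacement is thus invisible.
 */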
552
553static void btrfs_dev_replace_update_device_in_mapping_tree(
554 struct btrfs_fs_info *fs_info,
555 struct btrfs_device *srcdev,
556 struct btrfs_device *tgtdev)
557{
558 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
559 struct extent_map *em;
560 struct map_lookup *map;
561 u64 start = 0;
562 int i;
563
564 write_lock(&em_tree->lock);
565 do {
566 em = lookup_extent_mapping(em_tree, start, (u64)-1);
567 if (!em)
568 break;
569 map = (struct map_lookup *)em->bdev;
570 for (i = 0; i < map->num_stripes; i++)
571 if (srcdev == map->stripes[i].dev)
572 map->stripes[i].dev = tgtdev;
573 start = em->start + em->len;
574 free_extent_map(em);
575 } while (start);
576 write_unlock(&em_tree->lock);
577}
578
579static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
580 char *srcdev_name,
581 struct btrfs_device **device)
582{
583 int ret;
584
585 if (srcdevid) {
586 ret = 0;
587 *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
588 NULL);
589 if (!*device)
590 ret = -ENOENT;
591 } else {
592 ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
593 device);
594 }
595 return ret;
596}
597
598void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
599 struct btrfs_ioctl_dev_replace_args *args)
600{
601 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
602
603 btrfs_dev_replace_lock(dev_replace);
604 /* even if !dev_replace->is_valid, the values are good enough for
605 * the replace_status ioctl */
606 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
607 args->status.replace_state = dev_replace->replace_state;
608 args->status.time_started = dev_replace->time_started;
609 args->status.time_stopped = dev_replace->time_stopped;
610 args->status.num_write_errors =
611 atomic64_read(&dev_replace->num_write_errors);
612 args->status.num_uncorrectable_read_errors =
613 atomic64_read(&dev_replace->num_uncorrectable_read_errors);
614 switch (dev_replace->replace_state) {
615 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
616 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
617 args->status.progress_1000 = 0;
618 break;
619 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
620 args->status.progress_1000 = 1000;
621 break;
622 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
623 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
624 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
625 div64_u64(dev_replace->srcdev->total_bytes, 1000));
626 break;
627 }
628 btrfs_dev_replace_unlock(dev_replace);
629}
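/*
 * Illustrative arithmetic for the progress calculation above (not part
 * of the original file): with srcdev->total_bytes = 1099511627776 (1 TiB)
 * and cursor_left = 274877906944 (256 GiB),
 *
 *     progress_1000 = 274877906944 / (1099511627776 / 1000) = 250
 *
 * i.e. 25.0%; the resume kthread later does do_div(progress, 10) and
 * prints "@25%".
 */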
630
631int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
632 struct btrfs_ioctl_dev_replace_args *args)
633{
634 args->result = __btrfs_dev_replace_cancel(fs_info);
635 return 0;
636}
637
638static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
639{
640 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
641 struct btrfs_device *tgt_device = NULL;
642 struct btrfs_trans_handle *trans;
643 struct btrfs_root *root = fs_info->tree_root;
644 u64 result;
645 int ret;
646
647 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
648 btrfs_dev_replace_lock(dev_replace);
649 switch (dev_replace->replace_state) {
650 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
651 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
652 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
653 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
654 btrfs_dev_replace_unlock(dev_replace);
655 goto leave;
656 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
657 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
658 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
659 tgt_device = dev_replace->tgtdev;
660 dev_replace->tgtdev = NULL;
661 dev_replace->srcdev = NULL;
662 break;
663 }
664 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
665 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
666 dev_replace->item_needs_writeback = 1;
667 btrfs_dev_replace_unlock(dev_replace);
668 btrfs_scrub_cancel(fs_info);
669
670 trans = btrfs_start_transaction(root, 0);
671 if (IS_ERR(trans)) {
672 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
673 return PTR_ERR(trans);
674 }
675 ret = btrfs_commit_transaction(trans, root);
676 WARN_ON(ret);
677 if (tgt_device)
678 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
679
680leave:
681 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
682 return result;
683}
684
685void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
686{
687 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
688
689 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
690 btrfs_dev_replace_lock(dev_replace);
691 switch (dev_replace->replace_state) {
692 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
693 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
694 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
695 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
696 break;
697 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
698 dev_replace->replace_state =
699 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
700 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
701 dev_replace->item_needs_writeback = 1;
702 pr_info("btrfs: suspending dev_replace for unmount\n");
703 break;
704 }
705
706 btrfs_dev_replace_unlock(dev_replace);
707 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
708}
709
710/* resume dev_replace procedure that was interrupted by unmount */
711int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
712{
713 struct task_struct *task;
714 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
715
716 btrfs_dev_replace_lock(dev_replace);
717 switch (dev_replace->replace_state) {
718 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
719 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
720 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
721 btrfs_dev_replace_unlock(dev_replace);
722 return 0;
723 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
724 break;
725 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
726 dev_replace->replace_state =
727 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
728 break;
729 }
730 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
731 pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
732 "btrfs: you may cancel the operation after 'mount -o degraded'\n");
733 btrfs_dev_replace_unlock(dev_replace);
734 return 0;
735 }
736 btrfs_dev_replace_unlock(dev_replace);
737
738 WARN_ON(atomic_xchg(
739 &fs_info->mutually_exclusive_operation_running, 1));
740 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
741 return PTR_RET(task);
742}
743
744static int btrfs_dev_replace_kthread(void *data)
745{
746 struct btrfs_fs_info *fs_info = data;
747 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
748 struct btrfs_ioctl_dev_replace_args *status_args;
749 u64 progress;
750
751 status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
752 if (status_args) {
753 btrfs_dev_replace_status(fs_info, status_args);
754 progress = status_args->status.progress_1000;
755 kfree(status_args);
756 do_div(progress, 10);
757 printk_in_rcu(KERN_INFO
758 "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
759 dev_replace->srcdev->missing ? "<missing disk>" :
760 rcu_str_deref(dev_replace->srcdev->name),
761 dev_replace->srcdev->devid,
762 dev_replace->tgtdev ?
763 rcu_str_deref(dev_replace->tgtdev->name) :
764 "<missing target disk>",
765 (unsigned int)progress);
766 }
767 btrfs_dev_replace_continue_on_mount(fs_info);
768 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
769
770 return 0;
771}
772
773static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
774{
775 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
776 int ret;
777
778 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
779 dev_replace->committed_cursor_left,
780 dev_replace->srcdev->total_bytes,
781 &dev_replace->scrub_progress, 0, 1);
782 ret = btrfs_dev_replace_finishing(fs_info, ret);
783 WARN_ON(ret);
784 return 0;
785}
786
787int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
788{
789 if (!dev_replace->is_valid)
790 return 0;
791
792 switch (dev_replace->replace_state) {
793 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
794 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
795 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
796 return 0;
797 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
798 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
799 /*
800 * return true even if tgtdev is missing (this is
801 * something that can happen if the dev_replace
802 * procedure is suspended by an umount and then
803 * the tgtdev went missing, or "btrfs dev scan" was
804 * not called and the filesystem is remounted
805 * in degraded state). This does not stop the
806 * dev_replace procedure. It needs to be canceled
807 * manually if cancellation is wanted.
808 */
809 break;
810 }
811 return 1;
812}
813
814void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
815{
816 /* the beginning is just an optimization for the typical case */
817 if (atomic_read(&dev_replace->nesting_level) == 0) {
818acquire_lock:
819 /* this is not a nested case where the same thread
820 * is trying to acquire the same lock twice */
821 mutex_lock(&dev_replace->lock);
822 mutex_lock(&dev_replace->lock_management_lock);
823 dev_replace->lock_owner = current->pid;
824 atomic_inc(&dev_replace->nesting_level);
825 mutex_unlock(&dev_replace->lock_management_lock);
826 return;
827 }
828
829 mutex_lock(&dev_replace->lock_management_lock);
830 if (atomic_read(&dev_replace->nesting_level) > 0 &&
831 dev_replace->lock_owner == current->pid) {
832 WARN_ON(!mutex_is_locked(&dev_replace->lock));
833 atomic_inc(&dev_replace->nesting_level);
834 mutex_unlock(&dev_replace->lock_management_lock);
835 return;
836 }
837
838 mutex_unlock(&dev_replace->lock_management_lock);
839 goto acquire_lock;
840}
841
842void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
843{
844 WARN_ON(!mutex_is_locked(&dev_replace->lock));
845 mutex_lock(&dev_replace->lock_management_lock);
846 WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
847 WARN_ON(dev_replace->lock_owner != current->pid);
848 atomic_dec(&dev_replace->nesting_level);
849 if (atomic_read(&dev_replace->nesting_level) == 0) {
850 dev_replace->lock_owner = 0;
851 mutex_unlock(&dev_replace->lock_management_lock);
852 mutex_unlock(&dev_replace->lock);
853 } else {
854 mutex_unlock(&dev_replace->lock_management_lock);
855 }
856}
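
The lock/unlock pair above is a hand-rolled recursive mutex keyed on the owner's pid: the management_lock only guards the owner/nesting bookkeeping, while contention happens on the outer lock. A minimal userspace sketch of the same pattern, using pthreads in place of kernel mutexes (the rlock names and the demo in main() are illustrative, not btrfs API):

#include <pthread.h>
#include <stdio.h>

struct rlock {
	pthread_mutex_t lock;            /* the lock callers contend on */
	pthread_mutex_t management_lock; /* guards owner + nesting_level */
	pthread_t owner;
	int nesting_level;
};

static void rlock_acquire(struct rlock *r)
{
	pthread_mutex_lock(&r->management_lock);
	if (r->nesting_level > 0 && pthread_equal(r->owner, pthread_self())) {
		/* nested acquisition by the current owner: bump the count */
		r->nesting_level++;
		pthread_mutex_unlock(&r->management_lock);
		return;
	}
	pthread_mutex_unlock(&r->management_lock);

	/* not a nested case: take the real lock, then record ownership */
	pthread_mutex_lock(&r->lock);
	pthread_mutex_lock(&r->management_lock);
	r->owner = pthread_self();
	r->nesting_level = 1;
	pthread_mutex_unlock(&r->management_lock);
}

static void rlock_release(struct rlock *r)
{
	pthread_mutex_lock(&r->management_lock);
	if (--r->nesting_level == 0) {
		/* last unlock by the owner releases the real lock */
		pthread_mutex_unlock(&r->management_lock);
		pthread_mutex_unlock(&r->lock);
		return;
	}
	pthread_mutex_unlock(&r->management_lock);
}

int main(void)
{
	struct rlock r = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.management_lock = PTHREAD_MUTEX_INITIALIZER,
		.nesting_level = 0,
	};

	rlock_acquire(&r);
	rlock_acquire(&r);	/* nested: must not deadlock */
	rlock_release(&r);
	rlock_release(&r);
	printf("nested lock/unlock completed\n");
	return 0;
}

Unlike the kernel version there is no fast path for the uncontended case, but the ownership bookkeeping is the same.
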
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..20035cbbf021
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_DEV_REPLACE__)
20#define __BTRFS_DEV_REPLACE__
21
22struct btrfs_ioctl_dev_replace_args;
23
24int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
25int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
26 struct btrfs_fs_info *fs_info);
27void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
28int btrfs_dev_replace_start(struct btrfs_root *root,
29 struct btrfs_ioctl_dev_replace_args *args);
30void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
31 struct btrfs_ioctl_dev_replace_args *args);
32int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
33 struct btrfs_ioctl_dev_replace_args *args);
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
39
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{
42 atomic64_inc(stat_value);
43}
44#endif
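
For orientation, a condensed sketch of how the ioctl dispatcher added to fs/btrfs/ioctl.c in this same series (see the diffstat) consumes the API declared above. Only the btrfs_dev_replace_* calls and the BTRFS_IOCTL_DEV_REPLACE_CMD_* constants come from the patch set; the function name and the omitted permission/copy_to_user handling are illustrative:

static long demo_dev_replace_ioctl(struct btrfs_root *root,
				   struct btrfs_ioctl_dev_replace_args *args)
{
	int ret;

	switch (args->cmd) {
	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
		ret = btrfs_dev_replace_start(root, args);
		break;
	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
		btrfs_dev_replace_status(root->fs_info, args);
		ret = 0;
		break;
	case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
		ret = btrfs_dev_replace_cancel(root->fs_info, args);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret;
}
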
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696f..502c2158167c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
213 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
214} 214}
215 215
216int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
217 const char *name, int name_len)
218{
219 int ret;
220 struct btrfs_key key;
221 struct btrfs_dir_item *di;
222 int data_size;
223 struct extent_buffer *leaf;
224 int slot;
225 struct btrfs_path *path;
226
227
228 path = btrfs_alloc_path();
229 if (!path)
230 return -ENOMEM;
231
232 key.objectid = dir;
233 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
234 key.offset = btrfs_name_hash(name, name_len);
235
236 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
237
238 /* return back any errors */
239 if (ret < 0)
240 goto out;
241
242 /* nothing found, we're safe */
243 if (ret > 0) {
244 ret = 0;
245 goto out;
246 }
247
248 /* we found an item, look for our name in the item */
249 di = btrfs_match_dir_item_name(root, path, name, name_len);
250 if (di) {
251 /* our exact name was found */
252 ret = -EEXIST;
253 goto out;
254 }
255
256 /*
257 * see if there is room in the item to insert this
258 * name
259 */
260 data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
261 leaf = path->nodes[0];
262 slot = path->slots[0];
263 if (data_size + btrfs_item_size_nr(leaf, slot) +
264 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
265 ret = -EOVERFLOW;
266 } else {
267 /* plenty of insertion room */
268 ret = 0;
269 }
270out:
271 btrfs_free_path(path);
272 return ret;
273}
274
216/* 275/*
217 * lookup a directory item based on index. 'dir' is the objectid 276 * lookup a directory item based on index. 'dir' is the objectid
218 * we're searching in, and 'mod' tells us if you plan on deleting the 277 * we're searching in, and 'mod' tells us if you plan on deleting the
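
The -EOVERFLOW branch of the new btrfs_check_dir_item_collision() boils down to leaf-space arithmetic. A standalone sketch of just that check; the numeric sizes are assumptions for a 4 KiB leaf (30-byte btrfs_dir_item, 25-byte btrfs_item, 101-byte leaf header), not values taken from this patch:

#include <stdio.h>

/* assumed on-disk sizes, see above */
#define DIR_ITEM_SIZE	30		/* sizeof(struct btrfs_dir_item) */
#define ITEM_SIZE	25		/* sizeof(struct btrfs_item) */
#define LEAF_DATA_SIZE	(4096 - 101)	/* BTRFS_LEAF_DATA_SIZE, 4K leaf */

/* mirrors the check at the end of btrfs_check_dir_item_collision() */
static int name_fits(int existing_item_size, int name_len)
{
	int data_size = DIR_ITEM_SIZE + name_len + ITEM_SIZE;

	return data_size + existing_item_size + ITEM_SIZE <= LEAF_DATA_SIZE;
}

int main(void)
{
	/* hypothetical item already holding names with the same hash */
	printf("fits: %d\n", name_fits(2000, 255));	/* 1: room left  */
	printf("fits: %d\n", name_fits(3700, 255));	/* 0: -EOVERFLOW */
	return 0;
}
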
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7cda51995c1e..a8f652dc940b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,7 @@
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h"
48 49
49#ifdef CONFIG_X86 50#ifdef CONFIG_X86
50#include <asm/cpufeature.h> 51#include <asm/cpufeature.h>
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
387 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) 388 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
388 break; 389 break;
389 390
390 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 391 num_copies = btrfs_num_copies(root->fs_info,
391 eb->start, eb->len); 392 eb->start, eb->len);
392 if (num_copies == 1) 393 if (num_copies == 1)
393 break; 394 break;
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
852 int mirror_num, unsigned long bio_flags, 853 int mirror_num, unsigned long bio_flags,
853 u64 bio_offset) 854 u64 bio_offset)
854{ 855{
856 int ret;
857
855 /* 858 /*
856 * when we're called for a write, we're already in the async 859 * when we're called for a write, we're already in the async
857 * submission context. Just jump into btrfs_map_bio 860 * submission context. Just jump into btrfs_map_bio
858 */ 861 */
859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 862 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
863 if (ret)
864 bio_endio(bio, ret);
865 return ret;
860} 866}
861 867
862static int check_async_write(struct inode *inode, unsigned long bio_flags) 868static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
878 int ret; 884 int ret;
879 885
880 if (!(rw & REQ_WRITE)) { 886 if (!(rw & REQ_WRITE)) {
881
882 /* 887 /*
883 * called for a read, do the setup so that checksum validation 888 * called for a read, do the setup so that checksum validation
884 * can happen in the async kernel threads 889 * can happen in the async kernel threads
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
886 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 891 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
887 bio, 1); 892 bio, 1);
888 if (ret) 893 if (ret)
889 return ret; 894 goto out_w_error;
890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 895 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
891 mirror_num, 0); 896 mirror_num, 0);
892 } else if (!async) { 897 } else if (!async) {
893 ret = btree_csum_one_bio(bio); 898 ret = btree_csum_one_bio(bio);
894 if (ret) 899 if (ret)
895 return ret; 900 goto out_w_error;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 901 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0); 902 mirror_num, 0);
903 } else {
904 /*
905 * kthread helpers are used to submit writes so that
906 * checksumming can happen in parallel across all CPUs
907 */
908 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
909 inode, rw, bio, mirror_num, 0,
910 bio_offset,
911 __btree_submit_bio_start,
912 __btree_submit_bio_done);
898 } 913 }
899 914
900 /* 915 if (ret) {
901 * kthread helpers are used to submit writes so that checksumming 916out_w_error:
902 * can happen in parallel across all CPUs 917 bio_endio(bio, ret);
903 */ 918 }
904 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 919 return ret;
905 inode, rw, bio, mirror_num, 0,
906 bio_offset,
907 __btree_submit_bio_start,
908 __btree_submit_bio_done);
909} 920}
910 921
911#ifdef CONFIG_MIGRATION 922#ifdef CONFIG_MIGRATION
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
990 1001
991static int btree_set_page_dirty(struct page *page) 1002static int btree_set_page_dirty(struct page *page)
992{ 1003{
1004#ifdef DEBUG
993 struct extent_buffer *eb; 1005 struct extent_buffer *eb;
994 1006
995 BUG_ON(!PagePrivate(page)); 1007 BUG_ON(!PagePrivate(page));
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
998 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 1010 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
999 BUG_ON(!atomic_read(&eb->refs)); 1011 BUG_ON(!atomic_read(&eb->refs));
1000 btrfs_assert_tree_locked(eb); 1012 btrfs_assert_tree_locked(eb);
1013#endif
1001 return __set_page_dirty_nobuffers(page); 1014 return __set_page_dirty_nobuffers(page);
1002} 1015}
1003 1016
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1129 root->fs_info->dirty_metadata_bytes); 1142 root->fs_info->dirty_metadata_bytes);
1130 } 1143 }
1131 spin_unlock(&root->fs_info->delalloc_lock); 1144 spin_unlock(&root->fs_info->delalloc_lock);
1132 }
1133 1145
1134 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1135 btrfs_set_lock_blocking(buf); 1147 btrfs_set_lock_blocking(buf);
1136 clear_extent_buffer_dirty(buf); 1148 clear_extent_buffer_dirty(buf);
1149 }
1137 } 1150 }
1138} 1151}
1139 1152
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1193 root->root_key.objectid = objectid; 1206 root->root_key.objectid = objectid;
1194 root->anon_dev = 0; 1207 root->anon_dev = 0;
1195 1208
1196 spin_lock_init(&root->root_times_lock); 1209 spin_lock_init(&root->root_item_lock);
1197} 1210}
1198 1211
1199static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1212static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,
2131 init_rwsem(&fs_info->extent_commit_sem); 2144 init_rwsem(&fs_info->extent_commit_sem);
2132 init_rwsem(&fs_info->cleanup_work_sem); 2145 init_rwsem(&fs_info->cleanup_work_sem);
2133 init_rwsem(&fs_info->subvol_sem); 2146 init_rwsem(&fs_info->subvol_sem);
2147 fs_info->dev_replace.lock_owner = 0;
2148 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2149 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2150 mutex_init(&fs_info->dev_replace.lock_management_lock);
2151 mutex_init(&fs_info->dev_replace.lock);
2134 2152
2135 spin_lock_init(&fs_info->qgroup_lock); 2153 spin_lock_init(&fs_info->qgroup_lock);
2136 fs_info->qgroup_tree = RB_ROOT; 2154 fs_info->qgroup_tree = RB_ROOT;
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,
2279 fs_info->thread_pool_size, 2297 fs_info->thread_pool_size,
2280 &fs_info->generic_worker); 2298 &fs_info->generic_worker);
2281 2299
2300 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
2301 fs_info->thread_pool_size,
2302 &fs_info->generic_worker);
2303
2282 btrfs_init_workers(&fs_info->submit_workers, "submit", 2304 btrfs_init_workers(&fs_info->submit_workers, "submit",
2283 min_t(u64, fs_devices->num_devices, 2305 min_t(u64, fs_devices->num_devices,
2284 fs_info->thread_pool_size), 2306 fs_info->thread_pool_size),
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,
2350 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2372 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2351 ret |= btrfs_start_workers(&fs_info->caching_workers); 2373 ret |= btrfs_start_workers(&fs_info->caching_workers);
2352 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2374 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2375 ret |= btrfs_start_workers(&fs_info->flush_workers);
2353 if (ret) { 2376 if (ret) {
2354 err = -ENOMEM; 2377 err = -ENOMEM;
2355 goto fail_sb_buffer; 2378 goto fail_sb_buffer;
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,
2418 goto fail_tree_roots; 2441 goto fail_tree_roots;
2419 } 2442 }
2420 2443
2421 btrfs_close_extra_devices(fs_devices); 2444 /*
2445 * keep the device that is marked to be the target device for the
2446 * dev_replace procedure
2447 */
2448 btrfs_close_extra_devices(fs_info, fs_devices, 0);
2422 2449
2423 if (!fs_devices->latest_bdev) { 2450 if (!fs_devices->latest_bdev) {
2424 printk(KERN_CRIT "btrfs: failed to read devices on %s\n", 2451 printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2490,6 +2517,14 @@ retry_root_backup:
2490 goto fail_block_groups; 2517 goto fail_block_groups;
2491 } 2518 }
2492 2519
2520 ret = btrfs_init_dev_replace(fs_info);
2521 if (ret) {
2522 pr_err("btrfs: failed to init dev_replace: %d\n", ret);
2523 goto fail_block_groups;
2524 }
2525
2526 btrfs_close_extra_devices(fs_info, fs_devices, 1);
2527
2493 ret = btrfs_init_space_info(fs_info); 2528 ret = btrfs_init_space_info(fs_info);
2494 if (ret) { 2529 if (ret) {
2495 printk(KERN_ERR "Failed to initial space info: %d\n", ret); 2530 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2503,6 +2538,13 @@ retry_root_backup:
2503 } 2538 }
2504 fs_info->num_tolerated_disk_barrier_failures = 2539 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 2540 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2541 if (fs_info->fs_devices->missing_devices >
2542 fs_info->num_tolerated_disk_barrier_failures &&
2543 !(sb->s_flags & MS_RDONLY)) {
2544 printk(KERN_WARNING
2545 "Btrfs: too many missing devices, writeable mount is not allowed\n");
2546 goto fail_block_groups;
2547 }
2506 2548
2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2549 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2508 "btrfs-cleaner"); 2550 "btrfs-cleaner");
@@ -2631,6 +2673,13 @@ retry_root_backup:
2631 return ret; 2673 return ret;
2632 } 2674 }
2633 2675
2676 ret = btrfs_resume_dev_replace_async(fs_info);
2677 if (ret) {
2678 pr_warn("btrfs: failed to resume dev_replace\n");
2679 close_ctree(tree_root);
2680 return ret;
2681 }
2682
2634 return 0; 2683 return 0;
2635 2684
2636fail_qgroup: 2685fail_qgroup:
@@ -2667,6 +2716,7 @@ fail_sb_buffer:
2667 btrfs_stop_workers(&fs_info->submit_workers); 2716 btrfs_stop_workers(&fs_info->submit_workers);
2668 btrfs_stop_workers(&fs_info->delayed_workers); 2717 btrfs_stop_workers(&fs_info->delayed_workers);
2669 btrfs_stop_workers(&fs_info->caching_workers); 2718 btrfs_stop_workers(&fs_info->caching_workers);
2719 btrfs_stop_workers(&fs_info->flush_workers);
2670fail_alloc: 2720fail_alloc:
2671fail_iput: 2721fail_iput:
2672 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2722 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
3270 smp_mb(); 3320 smp_mb();
3271 3321
3272 /* pause restriper - we want to resume on mount */ 3322 /* pause restriper - we want to resume on mount */
3273 btrfs_pause_balance(root->fs_info); 3323 btrfs_pause_balance(fs_info);
3324
3325 btrfs_dev_replace_suspend_for_unmount(fs_info);
3274 3326
3275 btrfs_scrub_cancel(root); 3327 btrfs_scrub_cancel(fs_info);
3276 3328
3277 /* wait for any defraggers to finish */ 3329 /* wait for any defraggers to finish */
3278 wait_event(fs_info->transaction_wait, 3330 wait_event(fs_info->transaction_wait,
3279 (atomic_read(&fs_info->defrag_running) == 0)); 3331 (atomic_read(&fs_info->defrag_running) == 0));
3280 3332
3281 /* clear out the rbtree of defraggable inodes */ 3333 /* clear out the rbtree of defraggable inodes */
3282 btrfs_run_defrag_inodes(fs_info); 3334 btrfs_cleanup_defrag_inodes(fs_info);
3283 3335
3284 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3336 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3285 ret = btrfs_commit_super(root); 3337 ret = btrfs_commit_super(root);
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
3339 btrfs_stop_workers(&fs_info->delayed_workers); 3391 btrfs_stop_workers(&fs_info->delayed_workers);
3340 btrfs_stop_workers(&fs_info->caching_workers); 3392 btrfs_stop_workers(&fs_info->caching_workers);
3341 btrfs_stop_workers(&fs_info->readahead_workers); 3393 btrfs_stop_workers(&fs_info->readahead_workers);
3394 btrfs_stop_workers(&fs_info->flush_workers);
3342 3395
3343#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3396#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3344 if (btrfs_test_opt(root, CHECK_INTEGRITY)) 3397 if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3383 int was_dirty; 3436 int was_dirty;
3384 3437
3385 btrfs_assert_tree_locked(buf); 3438 btrfs_assert_tree_locked(buf);
3386 if (transid != root->fs_info->generation) { 3439 if (transid != root->fs_info->generation)
3387 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 3440 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
3388 "found %llu running %llu\n", 3441 "found %llu running %llu\n",
3389 (unsigned long long)buf->start, 3442 (unsigned long long)buf->start,
3390 (unsigned long long)transid, 3443 (unsigned long long)transid,
3391 (unsigned long long)root->fs_info->generation); 3444 (unsigned long long)root->fs_info->generation);
3392 WARN_ON(1);
3393 }
3394 was_dirty = set_extent_buffer_dirty(buf); 3445 was_dirty = set_extent_buffer_dirty(buf);
3395 if (!was_dirty) { 3446 if (!was_dirty) {
3396 spin_lock(&root->fs_info->delalloc_lock); 3447 spin_lock(&root->fs_info->delalloc_lock);
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3399 } 3450 }
3400} 3451}
3401 3452
3402void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3454 int flush_delayed)
3403{ 3455{
3404 /* 3456 /*
3405 * looks as though older kernels can get into trouble with 3457 * looks as though older kernels can get into trouble with
@@ -3411,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3411 if (current->flags & PF_MEMALLOC) 3463 if (current->flags & PF_MEMALLOC)
3412 return; 3464 return;
3413 3465
3414 btrfs_balance_delayed_items(root); 3466 if (flush_delayed)
3467 btrfs_balance_delayed_items(root);
3415 3468
3416 num_dirty = root->fs_info->dirty_metadata_bytes; 3469 num_dirty = root->fs_info->dirty_metadata_bytes;
3417 3470
3418 if (num_dirty > thresh) { 3471 if (num_dirty > thresh) {
3419 balance_dirty_pages_ratelimited_nr( 3472 balance_dirty_pages_ratelimited(
3420 root->fs_info->btree_inode->i_mapping, 1); 3473 root->fs_info->btree_inode->i_mapping);
3421 } 3474 }
3422 return; 3475 return;
3423} 3476}
3424 3477
3425void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3478void btrfs_btree_balance_dirty(struct btrfs_root *root)
3426{ 3479{
3427 /* 3480 __btrfs_btree_balance_dirty(root, 1);
3428 * looks as though older kernels can get into trouble with 3481}
3429 * this code, they end up stuck in balance_dirty_pages forever
3430 */
3431 u64 num_dirty;
3432 unsigned long thresh = 32 * 1024 * 1024;
3433
3434 if (current->flags & PF_MEMALLOC)
3435 return;
3436
3437 num_dirty = root->fs_info->dirty_metadata_bytes;
3438 3482
3439 if (num_dirty > thresh) { 3483void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
3440 balance_dirty_pages_ratelimited_nr( 3484{
3441 root->fs_info->btree_inode->i_mapping, 1); 3485 __btrfs_btree_balance_dirty(root, 0);
3442 }
3443 return;
3444} 3486}
3445 3487
3446int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 3488int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2025a9132c16..305c33efb0e3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
63 struct btrfs_key *location); 63 struct btrfs_key *location);
64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
65void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 65void btrfs_btree_balance_dirty(struct btrfs_root *root);
66void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 66void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
68void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 68void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d3e2c17d8d1..521e9d4424f6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36#include "math.h"
36 37
37#undef SCRAMBLE_DELAYED_REFS 38#undef SCRAMBLE_DELAYED_REFS
38 39
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
649 rcu_read_unlock(); 650 rcu_read_unlock();
650} 651}
651 652
652static u64 div_factor(u64 num, int factor)
653{
654 if (factor == 10)
655 return num;
656 num *= factor;
657 do_div(num, 10);
658 return num;
659}
660
661static u64 div_factor_fine(u64 num, int factor)
662{
663 if (factor == 100)
664 return num;
665 num *= factor;
666 do_div(num, 100);
667 return num;
668}
669
670u64 btrfs_find_block_group(struct btrfs_root *root, 653u64 btrfs_find_block_group(struct btrfs_root *root,
671 u64 search_start, u64 search_hint, int owner) 654 u64 search_start, u64 search_hint, int owner)
672{ 655{
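
The two helpers deleted in the hunk above move into the new fs/btrfs/math.h (44 lines in the diffstat). Reconstructed from the deleted lines, they presumably land there as static inlines along these lines:

#include <asm/div64.h>

static inline u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

static inline u64 div_factor_fine(u64 num, int factor)
{
	if (factor == 100)
		return num;
	num *= factor;
	do_div(num, 100);
	return num;
}
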
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1835 1818
1836 1819
1837 /* Tell the block device(s) that the sectors can be discarded */ 1820 /* Tell the block device(s) that the sectors can be discarded */
1838 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1821 ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1839 bytenr, &num_bytes, &bbio, 0); 1822 bytenr, &num_bytes, &bbio, 0);
1840 /* Error condition is -ENOMEM */ 1823 /* Error condition is -ENOMEM */
1841 if (!ret) { 1824 if (!ret) {
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2314 kfree(extent_op); 2297 kfree(extent_op);
2315 2298
2316 if (ret) { 2299 if (ret) {
2300 list_del_init(&locked_ref->cluster);
2301 mutex_unlock(&locked_ref->mutex);
2302
2317 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); 2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2318 spin_lock(&delayed_refs->lock); 2304 spin_lock(&delayed_refs->lock);
2319 return ret; 2305 return ret;
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2356 count++; 2342 count++;
2357 2343
2358 if (ret) { 2344 if (ret) {
2345 if (locked_ref) {
2346 list_del_init(&locked_ref->cluster);
2347 mutex_unlock(&locked_ref->mutex);
2348 }
2359 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); 2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2360 spin_lock(&delayed_refs->lock); 2350 spin_lock(&delayed_refs->lock);
2361 return ret; 2351 return ret;
@@ -3661,7 +3651,7 @@ out:
3661 3651
3662static int can_overcommit(struct btrfs_root *root, 3652static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes, 3653 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush) 3654 enum btrfs_reserve_flush_enum flush)
3665{ 3655{
3666 u64 profile = btrfs_get_alloc_profile(root, 0); 3656 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail; 3657 u64 avail;
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,
3685 avail >>= 1; 3675 avail >>= 1;
3686 3676
3687 /* 3677 /*
3688 * If we aren't flushing don't let us overcommit too much, say 3678 * If we aren't flushing all things, let us overcommit up to
3689 * 1/8th of the space. If we can flush, let it overcommit up to 3679 * 1/2th of the space. If we can flush, don't let us overcommit
3690 * 1/2 of the space. 3680 * too much, let it overcommit up to 1/8 of the space.
3691 */ 3681 */
3692 if (flush) 3682 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3693 avail >>= 3; 3683 avail >>= 3;
3694 else 3684 else
3695 avail >>= 1; 3685 avail >>= 1;
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,
3699 return 0; 3689 return 0;
3700} 3690}
3701 3691
3692static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3693 unsigned long nr_pages,
3694 enum wb_reason reason)
3695{
3696 if (!writeback_in_progress(sb->s_bdi) &&
3697 down_read_trylock(&sb->s_umount)) {
3698 writeback_inodes_sb_nr(sb, nr_pages, reason);
3699 up_read(&sb->s_umount);
3700 return 1;
3701 }
3702
3703 return 0;
3704}
3705
3702/* 3706/*
3703 * shrink metadata reservation for delalloc 3707 * shrink metadata reservation for delalloc
3704 */ 3708 */
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3713 long time_left; 3717 long time_left;
3714 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3718 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3715 int loops = 0; 3719 int loops = 0;
3720 enum btrfs_reserve_flush_enum flush;
3716 3721
3717 trans = (struct btrfs_trans_handle *)current->journal_info; 3722 trans = (struct btrfs_trans_handle *)current->journal_info;
3718 block_rsv = &root->fs_info->delalloc_block_rsv; 3723 block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3730 while (delalloc_bytes && loops < 3) { 3735 while (delalloc_bytes && loops < 3) {
3731 max_reclaim = min(delalloc_bytes, to_reclaim); 3736 max_reclaim = min(delalloc_bytes, to_reclaim);
3732 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3737 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3738 writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
3734 WB_REASON_FS_FREE_SPACE); 3739 nr_pages,
3740 WB_REASON_FS_FREE_SPACE);
3735 3741
3736 /* 3742 /*
3737 * We need to wait for the async pages to actually start before 3743 * We need to wait for the async pages to actually start before
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3740 wait_event(root->fs_info->async_submit_wait, 3746 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages)); 3747 !atomic_read(&root->fs_info->async_delalloc_pages));
3742 3748
3749 if (!trans)
3750 flush = BTRFS_RESERVE_FLUSH_ALL;
3751 else
3752 flush = BTRFS_RESERVE_NO_FLUSH;
3743 spin_lock(&space_info->lock); 3753 spin_lock(&space_info->lock);
3744 if (can_overcommit(root, space_info, orig, !trans)) { 3754 if (can_overcommit(root, space_info, orig, flush)) {
3745 spin_unlock(&space_info->lock); 3755 spin_unlock(&space_info->lock);
3746 break; 3756 break;
3747 } 3757 }
@@ -3888,7 +3898,7 @@ static int flush_space(struct btrfs_root *root,
3888 * @root - the root we're allocating for 3898 * @root - the root we're allocating for
3889 * @block_rsv - the block_rsv we're allocating for 3899 * @block_rsv - the block_rsv we're allocating for
3890 * @orig_bytes - the number of bytes we want 3900 * @orig_bytes - the number of bytes we want
3891 * @flush - wether or not we can flush to make our reservation 3901 * @flush - whether or not we can flush to make our reservation
3892 * 3902 *
3893 * This will reserve orgi_bytes number of bytes from the space info associated 3903 * This will reserve orgi_bytes number of bytes from the space info associated
3894 * with the block_rsv. If there is not enough space it will make an attempt to 3904 * with the block_rsv. If there is not enough space it will make an attempt to
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,
3899 */ 3909 */
3900static int reserve_metadata_bytes(struct btrfs_root *root, 3910static int reserve_metadata_bytes(struct btrfs_root *root,
3901 struct btrfs_block_rsv *block_rsv, 3911 struct btrfs_block_rsv *block_rsv,
3902 u64 orig_bytes, int flush) 3912 u64 orig_bytes,
3913 enum btrfs_reserve_flush_enum flush)
3903{ 3914{
3904 struct btrfs_space_info *space_info = block_rsv->space_info; 3915 struct btrfs_space_info *space_info = block_rsv->space_info;
3905 u64 used; 3916 u64 used;
@@ -3912,10 +3923,11 @@ again:
3912 ret = 0; 3923 ret = 0;
3913 spin_lock(&space_info->lock); 3924 spin_lock(&space_info->lock);
3914 /* 3925 /*
3915 * We only want to wait if somebody other than us is flushing and we are 3926 * We only want to wait if somebody other than us is flushing and we
3916 * actually alloed to flush. 3927 * are actually allowed to flush all things.
3917 */ 3928 */
3918 while (flush && !flushing && space_info->flush) { 3929 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
3930 space_info->flush) {
3919 spin_unlock(&space_info->lock); 3931 spin_unlock(&space_info->lock);
3920 /* 3932 /*
3921 * If we have a trans handle we can't wait because the flusher 3933 * If we have a trans handle we can't wait because the flusher
@@ -3981,23 +3993,40 @@ again:
3981 * Couldn't make our reservation, save our place so while we're trying 3993 * Couldn't make our reservation, save our place so while we're trying
3982 * to reclaim space we can actually use it instead of somebody else 3994 * to reclaim space we can actually use it instead of somebody else
3983 * stealing it from us. 3995 * stealing it from us.
3996 *
3997 * We make the other tasks wait for the flush only when we can flush
3998 * all things.
3984 */ 3999 */
3985 if (ret && flush) { 4000 if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
3986 flushing = true; 4001 flushing = true;
3987 space_info->flush = 1; 4002 space_info->flush = 1;
3988 } 4003 }
3989 4004
3990 spin_unlock(&space_info->lock); 4005 spin_unlock(&space_info->lock);
3991 4006
3992 if (!ret || !flush) 4007 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
3993 goto out; 4008 goto out;
3994 4009
3995 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4010 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3996 flush_state); 4011 flush_state);
3997 flush_state++; 4012 flush_state++;
4013
4014 /*
4015 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4016 * would happen. So skip delalloc flush.
4017 */
4018 if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4019 (flush_state == FLUSH_DELALLOC ||
4020 flush_state == FLUSH_DELALLOC_WAIT))
4021 flush_state = ALLOC_CHUNK;
4022
3998 if (!ret) 4023 if (!ret)
3999 goto again; 4024 goto again;
4000 else if (flush_state <= COMMIT_TRANS) 4025 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4026 flush_state < COMMIT_TRANS)
4027 goto again;
4028 else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4029 flush_state <= COMMIT_TRANS)
4001 goto again; 4030 goto again;
4002 4031
4003out: 4032out:
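
To make the retry ladder above easier to follow: BTRFS_RESERVE_FLUSH_LIMIT callers hop over the two delalloc stages (flushing delalloc they may themselves be involved in could deadlock, as the new comment says) and stop short of COMMIT_TRANS. A compact model of just that progression; the enum values mirror the kernel's flush_state and btrfs_reserve_flush_enum, the helper names are illustrative:

enum flush_state {
	FLUSH_DELAYED_ITEMS_NR = 1,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

enum reserve_flush { NO_FLUSH, FLUSH_LIMIT, FLUSH_ALL };

/* advance the ladder one step, skipping delalloc for FLUSH_LIMIT */
static int next_flush_state(enum reserve_flush flush, int state)
{
	state++;
	if (flush == FLUSH_LIMIT &&
	    (state == FLUSH_DELALLOC || state == FLUSH_DELALLOC_WAIT))
		state = ALLOC_CHUNK;
	return state;
}

/* may we loop back to "again:"? FLUSH_LIMIT stops before COMMIT_TRANS,
 * and NO_FLUSH callers bail out before ever reaching this check */
static int may_retry(enum reserve_flush flush, int state)
{
	if (flush == FLUSH_LIMIT)
		return state < COMMIT_TRANS;
	return flush == FLUSH_ALL && state <= COMMIT_TRANS;
}
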
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
4148 kfree(rsv); 4177 kfree(rsv);
4149} 4178}
4150 4179
4151static inline int __block_rsv_add(struct btrfs_root *root, 4180int btrfs_block_rsv_add(struct btrfs_root *root,
4152 struct btrfs_block_rsv *block_rsv, 4181 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4153 u64 num_bytes, int flush) 4182 enum btrfs_reserve_flush_enum flush)
4154{ 4183{
4155 int ret; 4184 int ret;
4156 4185
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
4166 return ret; 4195 return ret;
4167} 4196}
4168 4197
4169int btrfs_block_rsv_add(struct btrfs_root *root,
4170 struct btrfs_block_rsv *block_rsv,
4171 u64 num_bytes)
4172{
4173 return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174}
4175
4176int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177 struct btrfs_block_rsv *block_rsv,
4178 u64 num_bytes)
4179{
4180 return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181}
4182
4183int btrfs_block_rsv_check(struct btrfs_root *root, 4198int btrfs_block_rsv_check(struct btrfs_root *root,
4184 struct btrfs_block_rsv *block_rsv, int min_factor) 4199 struct btrfs_block_rsv *block_rsv, int min_factor)
4185{ 4200{
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
4198 return ret; 4213 return ret;
4199} 4214}
4200 4215
4201static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, 4216int btrfs_block_rsv_refill(struct btrfs_root *root,
4202 struct btrfs_block_rsv *block_rsv, 4217 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4203 u64 min_reserved, int flush) 4218 enum btrfs_reserve_flush_enum flush)
4204{ 4219{
4205 u64 num_bytes = 0; 4220 u64 num_bytes = 0;
4206 int ret = -ENOSPC; 4221 int ret = -ENOSPC;
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4228 return ret; 4243 return ret;
4229} 4244}
4230 4245
4231int btrfs_block_rsv_refill(struct btrfs_root *root,
4232 struct btrfs_block_rsv *block_rsv,
4233 u64 min_reserved)
4234{
4235 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236}
4237
4238int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239 struct btrfs_block_rsv *block_rsv,
4240 u64 min_reserved)
4241{
4242 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243}
4244
4245int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4246int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246 struct btrfs_block_rsv *dst_rsv, 4247 struct btrfs_block_rsv *dst_rsv,
4247 u64 num_bytes) 4248 u64 num_bytes)
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4532 u64 csum_bytes; 4533 u64 csum_bytes;
4533 unsigned nr_extents = 0; 4534 unsigned nr_extents = 0;
4534 int extra_reserve = 0; 4535 int extra_reserve = 0;
4535 int flush = 1; 4536 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4536 int ret; 4537 int ret;
4538 bool delalloc_lock = true;
4537 4539
4538 /* Need to be holding the i_mutex here if we aren't free space cache */ 4540 /* If we are a free space inode we need to not flush since we will be in
4539 if (btrfs_is_free_space_inode(inode)) 4541 * the middle of a transaction commit. We also don't need the delalloc
4540 flush = 0; 4542 * mutex since we won't race with anybody. We need this mostly to make
4543 * lockdep shut its filthy mouth.
4544 */
4545 if (btrfs_is_free_space_inode(inode)) {
4546 flush = BTRFS_RESERVE_NO_FLUSH;
4547 delalloc_lock = false;
4548 }
4541 4549
4542 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4550 if (flush != BTRFS_RESERVE_NO_FLUSH &&
4551 btrfs_transaction_in_commit(root->fs_info))
4543 schedule_timeout(1); 4552 schedule_timeout(1);
4544 4553
4545 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4554 if (delalloc_lock)
4555 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4556
4546 num_bytes = ALIGN(num_bytes, root->sectorsize); 4557 num_bytes = ALIGN(num_bytes, root->sectorsize);
4547 4558
4548 spin_lock(&BTRFS_I(inode)->lock); 4559 spin_lock(&BTRFS_I(inode)->lock);
@@ -4572,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4572 ret = btrfs_qgroup_reserve(root, num_bytes + 4583 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize); 4584 nr_extents * root->leafsize);
4574 if (ret) { 4585 if (ret) {
4575 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4586 spin_lock(&BTRFS_I(inode)->lock);
4587 calc_csum_metadata_size(inode, num_bytes, 0);
4588 spin_unlock(&BTRFS_I(inode)->lock);
4589 if (delalloc_lock)
4590 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4576 return ret; 4591 return ret;
4577 } 4592 }
4578 } 4593 }
@@ -4607,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4607 btrfs_ino(inode), 4622 btrfs_ino(inode),
4608 to_free, 0); 4623 to_free, 0);
4609 } 4624 }
4610 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4625 if (root->fs_info->quota_enabled) {
4626 btrfs_qgroup_free(root, num_bytes +
4627 nr_extents * root->leafsize);
4628 }
4629 if (delalloc_lock)
4630 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4611 return ret; 4631 return ret;
4612 } 4632 }
4613 4633
@@ -4619,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4619 } 4639 }
4620 BTRFS_I(inode)->reserved_extents += nr_extents; 4640 BTRFS_I(inode)->reserved_extents += nr_extents;
4621 spin_unlock(&BTRFS_I(inode)->lock); 4641 spin_unlock(&BTRFS_I(inode)->lock);
4622 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4642
4643 if (delalloc_lock)
4644 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4623 4645
4624 if (to_reserve) 4646 if (to_reserve)
4625 trace_btrfs_space_reservation(root->fs_info,"delalloc", 4647 trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@ -4969,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4969{ 4991{
4970 struct btrfs_fs_info *fs_info = root->fs_info; 4992 struct btrfs_fs_info *fs_info = root->fs_info;
4971 struct btrfs_block_group_cache *cache = NULL; 4993 struct btrfs_block_group_cache *cache = NULL;
4994 struct btrfs_space_info *space_info;
4995 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4972 u64 len; 4996 u64 len;
4997 bool readonly;
4973 4998
4974 while (start <= end) { 4999 while (start <= end) {
5000 readonly = false;
4975 if (!cache || 5001 if (!cache ||
4976 start >= cache->key.objectid + cache->key.offset) { 5002 start >= cache->key.objectid + cache->key.offset) {
4977 if (cache) 5003 if (cache)
@@ -4989,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4989 } 5015 }
4990 5016
4991 start += len; 5017 start += len;
5018 space_info = cache->space_info;
4992 5019
4993 spin_lock(&cache->space_info->lock); 5020 spin_lock(&space_info->lock);
4994 spin_lock(&cache->lock); 5021 spin_lock(&cache->lock);
4995 cache->pinned -= len; 5022 cache->pinned -= len;
4996 cache->space_info->bytes_pinned -= len; 5023 space_info->bytes_pinned -= len;
4997 if (cache->ro) 5024 if (cache->ro) {
4998 cache->space_info->bytes_readonly += len; 5025 space_info->bytes_readonly += len;
5026 readonly = true;
5027 }
4999 spin_unlock(&cache->lock); 5028 spin_unlock(&cache->lock);
5000 spin_unlock(&cache->space_info->lock); 5029 if (!readonly && global_rsv->space_info == space_info) {
5030 spin_lock(&global_rsv->lock);
5031 if (!global_rsv->full) {
5032 len = min(len, global_rsv->size -
5033 global_rsv->reserved);
5034 global_rsv->reserved += len;
5035 space_info->bytes_may_use += len;
5036 if (global_rsv->reserved >= global_rsv->size)
5037 global_rsv->full = 1;
5038 }
5039 spin_unlock(&global_rsv->lock);
5040 }
5041 spin_unlock(&space_info->lock);
5001 } 5042 }
5002 5043
5003 if (cache) 5044 if (cache)
@@ -5466,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5466 return 0; 5507 return 0;
5467} 5508}
5468 5509
5469static int __get_block_group_index(u64 flags) 5510int __get_raid_index(u64 flags)
5470{ 5511{
5471 int index; 5512 int index;
5472 5513
@@ -5486,7 +5527,7 @@ static int __get_block_group_index(u64 flags)
5486 5527
5487static int get_block_group_index(struct btrfs_block_group_cache *cache) 5528static int get_block_group_index(struct btrfs_block_group_cache *cache)
5488{ 5529{
5489 return __get_block_group_index(cache->flags); 5530 return __get_raid_index(cache->flags);
5490} 5531}
5491 5532
5492enum btrfs_loop_type { 5533enum btrfs_loop_type {
@@ -6269,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6269 block_rsv = get_block_rsv(trans, root); 6310 block_rsv = get_block_rsv(trans, root);
6270 6311
6271 if (block_rsv->size == 0) { 6312 if (block_rsv->size == 0) {
6272 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6313 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6314 BTRFS_RESERVE_NO_FLUSH);
6273 /* 6315 /*
6274 * If we couldn't reserve metadata bytes try and use some from 6316 * If we couldn't reserve metadata bytes try and use some from
6275 * the global reserve. 6317 * the global reserve.
@@ -6292,11 +6334,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6292 static DEFINE_RATELIMIT_STATE(_rs, 6334 static DEFINE_RATELIMIT_STATE(_rs,
6293 DEFAULT_RATELIMIT_INTERVAL, 6335 DEFAULT_RATELIMIT_INTERVAL,
6294 /*DEFAULT_RATELIMIT_BURST*/ 2); 6336 /*DEFAULT_RATELIMIT_BURST*/ 2);
6295 if (__ratelimit(&_rs)) { 6337 if (__ratelimit(&_rs))
6296 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); 6338 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
6297 WARN_ON(1); 6339 ret);
6298 } 6340 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6299 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6341 BTRFS_RESERVE_NO_FLUSH);
6300 if (!ret) { 6342 if (!ret) {
6301 return block_rsv; 6343 return block_rsv;
6302 } else if (ret && block_rsv != global_rsv) { 6344 } else if (ret && block_rsv != global_rsv) {
@@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7427 */ 7469 */
7428 target = get_restripe_target(root->fs_info, block_group->flags); 7470 target = get_restripe_target(root->fs_info, block_group->flags);
7429 if (target) { 7471 if (target) {
7430 index = __get_block_group_index(extended_to_chunk(target)); 7472 index = __get_raid_index(extended_to_chunk(target));
7431 } else { 7473 } else {
7432 /* 7474 /*
7433 * this is just a balance, so if we were marked as full 7475 * this is just a balance, so if we were marked as full
@@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7461 * check to make sure we can actually find a chunk with enough 7503 * check to make sure we can actually find a chunk with enough
7462 * space to fit our block group in. 7504 * space to fit our block group in.
7463 */ 7505 */
7464 if (device->total_bytes > device->bytes_used + min_free) { 7506 if (device->total_bytes > device->bytes_used + min_free &&
7507 !device->is_tgtdev_for_dev_replace) {
7465 ret = find_free_dev_extent(device, min_free, 7508 ret = find_free_dev_extent(device, min_free,
7466 &dev_offset, NULL); 7509 &dev_offset, NULL);
7467 if (!ret) 7510 if (!ret)
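
The extent-tree.c hunks above share one theme: the old int `flush` arguments and the `_noflush` wrapper functions are folded into a single `enum btrfs_reserve_flush_enum`, so every reservation call site states its flushing policy explicitly. A minimal userspace sketch of the pattern; the enum values and refill logic below are illustrative, not the kernel's definitions:

    #include <stdio.h>

    enum reserve_flush {
        RESERVE_NO_FLUSH,     /* caller may hold locks: fail fast */
        RESERVE_FLUSH_LIMIT,  /* flush a bounded amount of dirty state */
        RESERVE_FLUSH_ALL,    /* flush as much as it takes */
    };

    struct block_rsv {
        unsigned long long size;
        unsigned long long reserved;
    };

    static int rsv_refill(struct block_rsv *rsv, unsigned long long bytes,
                          enum reserve_flush flush)
    {
        if (rsv->reserved + bytes <= rsv->size) {
            rsv->reserved += bytes;
            return 0;
        }
        if (flush == RESERVE_NO_FLUSH)
            return -1;  /* deadlock-safe path: never block here */
        /* a real implementation would write back dirty metadata here,
         * doing less work for FLUSH_LIMIT than for FLUSH_ALL */
        return -1;
    }

    int main(void)
    {
        struct block_rsv rsv = { .size = 4096, .reserved = 4096 };

        /* a full reserve plus NO_FLUSH fails fast instead of blocking */
        printf("%d\n", rsv_refill(&rsv, 512, RESERVE_NO_FLUSH));
        return 0;
    }

One enum parameter also removes the need for per-policy wrapper names like the deleted btrfs_block_rsv_refill_noflush().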
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 472873a94d96..1b319df29eee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
341{ 341{
342 struct rb_node *node; 342 struct rb_node *node;
343 343
344 if (end < start) { 344 if (end < start)
345 printk(KERN_ERR "btrfs end < start %llu %llu\n", 345 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
346 (unsigned long long)end, 346 (unsigned long long)end,
347 (unsigned long long)start); 347 (unsigned long long)start);
348 WARN_ON(1);
349 }
350 state->start = start; 348 state->start = start;
351 state->end = end; 349 state->end = end;
352 350
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
1919 * the standard behavior is to write all copies in a raid setup. here we only 1917 * the standard behavior is to write all copies in a raid setup. here we only
1920 * want to write the one bad copy. so we do the mapping for ourselves and issue 1918 * want to write the one bad copy. so we do the mapping for ourselves and issue
1921 * submit_bio directly. 1919 * submit_bio directly.
1922 * to avoid any synchonization issues, wait for the data after writing, which 1920 * to avoid any synchronization issues, wait for the data after writing, which
1923 * actually prevents the read that triggered the error from finishing. 1921 * actually prevents the read that triggered the error from finishing.
1924 * currently, there can be no more than two copies of every data bit. thus, 1922 * currently, there can be no more than two copies of every data bit. thus,
1925 * exactly one rewrite is required. 1923 * exactly one rewrite is required.
1926 */ 1924 */
1927int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 1925int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1928 u64 length, u64 logical, struct page *page, 1926 u64 length, u64 logical, struct page *page,
1929 int mirror_num) 1927 int mirror_num)
1930{ 1928{
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1946 bio->bi_size = 0; 1944 bio->bi_size = 0;
1947 map_length = length; 1945 map_length = length;
1948 1946
1949 ret = btrfs_map_block(map_tree, WRITE, logical, 1947 ret = btrfs_map_block(fs_info, WRITE, logical,
1950 &map_length, &bbio, mirror_num); 1948 &map_length, &bbio, mirror_num);
1951 if (ret) { 1949 if (ret) {
1952 bio_put(bio); 1950 bio_put(bio);
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1984int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1982int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1985 int mirror_num) 1983 int mirror_num)
1986{ 1984{
1987 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
1988 u64 start = eb->start; 1985 u64 start = eb->start;
1989 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1986 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1990 int ret = 0; 1987 int ret = 0;
1991 1988
1992 for (i = 0; i < num_pages; i++) { 1989 for (i = 0; i < num_pages; i++) {
1993 struct page *p = extent_buffer_page(eb, i); 1990 struct page *p = extent_buffer_page(eb, i);
1994 ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, 1991 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
1995 start, p, mirror_num); 1992 start, p, mirror_num);
1996 if (ret) 1993 if (ret)
1997 break; 1994 break;
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
2010 u64 private; 2007 u64 private;
2011 u64 private_failure; 2008 u64 private_failure;
2012 struct io_failure_record *failrec; 2009 struct io_failure_record *failrec;
2013 struct btrfs_mapping_tree *map_tree; 2010 struct btrfs_fs_info *fs_info;
2014 struct extent_state *state; 2011 struct extent_state *state;
2015 int num_copies; 2012 int num_copies;
2016 int did_repair = 0; 2013 int did_repair = 0;
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
2046 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2043 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2047 2044
2048 if (state && state->start == failrec->start) { 2045 if (state && state->start == failrec->start) {
2049 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 2046 fs_info = BTRFS_I(inode)->root->fs_info;
2050 num_copies = btrfs_num_copies(map_tree, failrec->logical, 2047 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2051 failrec->len); 2048 failrec->len);
2052 if (num_copies > 1) { 2049 if (num_copies > 1) {
2053 ret = repair_io_failure(map_tree, start, failrec->len, 2050 ret = repair_io_failure(fs_info, start, failrec->len,
2054 failrec->logical, page, 2051 failrec->logical, page,
2055 failrec->failed_mirror); 2052 failrec->failed_mirror);
2056 did_repair = !ret; 2053 did_repair = !ret;
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2159 * clean_io_failure() clean all those errors at once. 2156 * clean_io_failure() clean all those errors at once.
2160 */ 2157 */
2161 } 2158 }
2162 num_copies = btrfs_num_copies( 2159 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2163 &BTRFS_I(inode)->root->fs_info->mapping_tree, 2160 failrec->logical, failrec->len);
2164 failrec->logical, failrec->len);
2165 if (num_copies == 1) { 2161 if (num_copies == 1) {
2166 /* 2162 /*
2167 * we only have a single copy of the data, so don't bother with 2163 * we only have a single copy of the data, so don't bother with
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2466 return bio; 2462 return bio;
2467} 2463}
2468 2464
2469/*
2470 * Since writes are async, they will only return -ENOMEM.
2471 * Reads can return the full range of I/O error conditions.
2472 */
2473static int __must_check submit_one_bio(int rw, struct bio *bio, 2465static int __must_check submit_one_bio(int rw, struct bio *bio,
2474 int mirror_num, unsigned long bio_flags) 2466 int mirror_num, unsigned long bio_flags)
2475{ 2467{
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4721 } 4713 }
4722 4714
4723 if (start + min_len > eb->len) { 4715 if (start + min_len > eb->len) {
4724 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4716 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4725 "wanted %lu %lu\n", (unsigned long long)eb->start, 4717 "wanted %lu %lu\n", (unsigned long long)eb->start,
4726 eb->len, start, min_len); 4718 eb->len, start, min_len);
4727 WARN_ON(1);
4728 return -EINVAL; 4719 return -EINVAL;
4729 } 4720 }
4730 4721
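
A second pattern in the extent_io.c hunks: a printk() immediately followed by WARN_ON(1) becomes a single WARN(1, ...), which emits the message and the stack trace together. Roughly, the macro shape being relied on looks like the toy below; the kernel's real WARN() lives in include/asm-generic/bug.h and does considerably more:

    #include <stdio.h>

    /* toy stand-in: evaluate the condition, report if true, and hand the
     * condition back so the macro can sit inside an if () */
    #define WARN(cond, ...) ({                    \
        int __w = !!(cond);                       \
        if (__w)                                  \
            fprintf(stderr, __VA_ARGS__);         \
        __w;                                      \
    })

    int main(void)
    {
        unsigned long long start = 100, end = 50;

        if (WARN(end < start, "end < start %llu %llu\n", end, start))
            return 1;
        return 0;
    }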
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 711d12b80028..2eacfabd3263 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -337,9 +337,9 @@ struct bio *
337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
338 gfp_t gfp_flags); 338 gfp_t gfp_flags);
339 339
340struct btrfs_mapping_tree; 340struct btrfs_fs_info;
341 341
342int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 342int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
343 u64 length, u64 logical, struct page *page, 343 u64 length, u64 logical, struct page *page,
344 int mirror_num); 344 int mirror_num);
345int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 345int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8cbc8d5c7f7..f169d6b11d7f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
49struct extent_map *alloc_extent_map(void) 49struct extent_map *alloc_extent_map(void)
50{ 50{
51 struct extent_map *em; 51 struct extent_map *em;
52 em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); 52 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
53 if (!em) 53 if (!em)
54 return NULL; 54 return NULL;
55 em->in_tree = 0; 55 em->in_tree = 0;
@@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
198 merge = rb_entry(rb, struct extent_map, rb_node); 198 merge = rb_entry(rb, struct extent_map, rb_node);
199 if (rb && mergable_maps(merge, em)) { 199 if (rb && mergable_maps(merge, em)) {
200 em->start = merge->start; 200 em->start = merge->start;
201 em->orig_start = merge->orig_start;
201 em->len += merge->len; 202 em->len += merge->len;
202 em->block_len += merge->block_len; 203 em->block_len += merge->block_len;
203 em->block_start = merge->block_start; 204 em->block_start = merge->block_start;
204 merge->in_tree = 0; 205 merge->in_tree = 0;
205 if (merge->generation > em->generation) { 206 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
206 em->mod_start = em->start; 207 em->mod_start = merge->mod_start;
207 em->mod_len = em->len; 208 em->generation = max(em->generation, merge->generation);
208 em->generation = merge->generation; 209 list_move(&em->list, &tree->modified_extents);
209 list_move(&em->list, &tree->modified_extents);
210 }
211 210
212 list_del_init(&merge->list); 211 list_del_init(&merge->list);
213 rb_erase(&merge->rb_node, &tree->map); 212 rb_erase(&merge->rb_node, &tree->map);
@@ -223,23 +222,19 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
223 em->block_len += merge->len; 222 em->block_len += merge->len;
224 rb_erase(&merge->rb_node, &tree->map); 223 rb_erase(&merge->rb_node, &tree->map);
225 merge->in_tree = 0; 224 merge->in_tree = 0;
226 if (merge->generation > em->generation) { 225 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
227 em->mod_len = em->len; 226 em->generation = max(em->generation, merge->generation);
228 em->generation = merge->generation;
229 list_move(&em->list, &tree->modified_extents);
230 }
231 list_del_init(&merge->list); 227 list_del_init(&merge->list);
232 free_extent_map(merge); 228 free_extent_map(merge);
233 } 229 }
234} 230}
235 231
236/** 232/**
237 * unpint_extent_cache - unpin an extent from the cache 233 * unpin_extent_cache - unpin an extent from the cache
238 * @tree: tree to unpin the extent in 234 * @tree: tree to unpin the extent in
239 * @start: logical offset in the file 235 * @start: logical offset in the file
240 * @len: length of the extent 236 * @len: length of the extent
241 * @gen: generation that this extent has been modified in 237 * @gen: generation that this extent has been modified in
242 * @prealloc: if this is set we need to clear the prealloc flag
243 * 238 *
244 * Called after an extent has been written to disk properly. Set the generation 239 * Called after an extent has been written to disk properly. Set the generation
245 * to the generation that actually added the file item to the inode so we know 240 * to the generation that actually added the file item to the inode so we know
@@ -266,9 +261,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
266 em->mod_start = em->start; 261 em->mod_start = em->start;
267 em->mod_len = em->len; 262 em->mod_len = em->len;
268 263
269 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 264 if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
270 prealloc = true; 265 prealloc = true;
271 clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); 266 clear_bit(EXTENT_FLAG_FILLING, &em->flags);
272 } 267 }
273 268
274 try_merge_map(tree, em); 269 try_merge_map(tree, em);
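
The try_merge_map() hunks drop the conditional generation copy and always recompute the modified range: when `em` absorbs a neighbour, mod_start/mod_len must be stretched to cover both maps and the newer generation wins. The arithmetic in isolation, as a userspace sketch (field names mirror the kernel struct; the rbtree plumbing is omitted):

    struct em {
        unsigned long long start, len;
        unsigned long long mod_start, mod_len;
        unsigned long long generation;
    };

    /* em absorbs the map immediately before it */
    static void merge_with_prev(struct em *em, const struct em *prev)
    {
        em->start = prev->start;
        em->len += prev->len;
        /* old mod end stays put; mod_start drops to the neighbour's */
        em->mod_len = (em->mod_len + em->mod_start) - prev->mod_start;
        em->mod_start = prev->mod_start;
        em->generation = em->generation > prev->generation ?
                         em->generation : prev->generation;
    }

    /* em absorbs the map immediately after it */
    static void merge_with_next(struct em *em, const struct em *next)
    {
        em->len += next->len;
        /* mod_start stays put; mod end rises to the neighbour's */
        em->mod_len = (next->mod_start + next->mod_len) - em->mod_start;
        em->generation = em->generation > next->generation ?
                         em->generation : next->generation;
    }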
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 679225555f7b..922943ce29e8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -14,6 +14,7 @@
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ 16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
17#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
17 18
18struct extent_map { 19struct extent_map {
19 struct rb_node rb_node; 20 struct rb_node rb_node;
@@ -24,6 +25,7 @@ struct extent_map {
24 u64 mod_start; 25 u64 mod_start;
25 u64 mod_len; 26 u64 mod_len;
26 u64 orig_start; 27 u64 orig_start;
28 u64 orig_block_len;
27 u64 block_start; 29 u64 block_start;
28 u64 block_len; 30 u64 block_len;
29 u64 generation; 31 u64 generation;
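
EXTENT_FLAG_FILLING and the other EXTENT_FLAG_* values are bit positions in em->flags, driven through the kernel's set_bit()/test_bit()/clear_bit(). The equivalent plain-C bit operations, for reference (illustrative helpers, not kernel code):

    #define EXTENT_FLAG_FILLING 5  /* filling in a preallocated extent */

    static void set_flag(unsigned long *flags, int bit)
    {
        *flags |= 1UL << bit;
    }

    static int test_flag(const unsigned long *flags, int bit)
    {
        return !!(*flags & (1UL << bit));
    }

    static void clear_flag(unsigned long *flags, int bit)
    {
        *flags &= ~(1UL << bit);
    }

Judging by the unpin_extent_cache() hunk above, splitting FILLING out of PREALLOC means unpinning clears only the transient filling state while the map keeps its preallocated marking.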
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ad08e4e4a15..bd38cef42358 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -133,7 +133,6 @@ fail:
133 return ERR_PTR(ret); 133 return ERR_PTR(ret);
134} 134}
135 135
136
137int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, 136int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
138 struct btrfs_root *root, 137 struct btrfs_root *root,
139 struct btrfs_path *path, u64 objectid, 138 struct btrfs_path *path, u64 objectid,
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
151 return ret; 150 return ret;
152} 151}
153 152
153u64 btrfs_file_extent_length(struct btrfs_path *path)
154{
155 int extent_type;
156 struct btrfs_file_extent_item *fi;
157 u64 len;
158
159 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
160 struct btrfs_file_extent_item);
161 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
162
163 if (extent_type == BTRFS_FILE_EXTENT_REG ||
164 extent_type == BTRFS_FILE_EXTENT_PREALLOC)
165 len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
166 else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
167 len = btrfs_file_extent_inline_len(path->nodes[0], fi);
168 else
169 BUG();
170
171 return len;
172}
154 173
155static int __btrfs_lookup_bio_sums(struct btrfs_root *root, 174static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
156 struct inode *inode, struct bio *bio, 175 struct inode *inode, struct bio *bio,
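
btrfs_file_extent_length() dispatches on the extent type: regular and preallocated extents report num_bytes, inline extents report their inline length, and anything else is treated as corruption via BUG(). The same dispatch as a self-contained userspace model (type and field names here are illustrative):

    #include <stdlib.h>

    enum fext_type { FEXT_REG, FEXT_PREALLOC, FEXT_INLINE };

    struct fext_item {
        enum fext_type type;
        unsigned long long num_bytes;   /* on-disk extents */
        unsigned long long inline_len;  /* inline extents */
    };

    static unsigned long long fext_length(const struct fext_item *fi)
    {
        switch (fi->type) {
        case FEXT_REG:
        case FEXT_PREALLOC:
            return fi->num_bytes;
        case FEXT_INLINE:
            return fi->inline_len;
        default:
            abort();  /* mirrors the BUG(): unknown item type */
        }
    }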
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9ab1bed88116..77061bf43edb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h" 42#include "volumes.h"
43 43
44static struct kmem_cache *btrfs_inode_defrag_cachep;
44/* 45/*
45 * when auto defrag is enabled we 46 * when auto defrag is enabled we
46 * queue up these defrag structs to remember which 47 * queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
90 * If an existing record is found the defrag item you 91 * If an existing record is found the defrag item you
91 * pass in is freed 92 * pass in is freed
92 */ 93 */
93static void __btrfs_add_inode_defrag(struct inode *inode, 94static int __btrfs_add_inode_defrag(struct inode *inode,
94 struct inode_defrag *defrag) 95 struct inode_defrag *defrag)
95{ 96{
96 struct btrfs_root *root = BTRFS_I(inode)->root; 97 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
118 entry->transid = defrag->transid; 119 entry->transid = defrag->transid;
119 if (defrag->last_offset > entry->last_offset) 120 if (defrag->last_offset > entry->last_offset)
120 entry->last_offset = defrag->last_offset; 121 entry->last_offset = defrag->last_offset;
121 goto exists; 122 return -EEXIST;
122 } 123 }
123 } 124 }
124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
125 rb_link_node(&defrag->rb_node, parent, p); 126 rb_link_node(&defrag->rb_node, parent, p);
126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
127 return; 128 return 0;
129}
128 130
129exists: 131static inline int __need_auto_defrag(struct btrfs_root *root)
130 kfree(defrag); 132{
131 return; 133 if (!btrfs_test_opt(root, AUTO_DEFRAG))
134 return 0;
135
136 if (btrfs_fs_closing(root->fs_info))
137 return 0;
132 138
139 return 1;
133} 140}
134 141
135/* 142/*
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
142 struct btrfs_root *root = BTRFS_I(inode)->root; 149 struct btrfs_root *root = BTRFS_I(inode)->root;
143 struct inode_defrag *defrag; 150 struct inode_defrag *defrag;
144 u64 transid; 151 u64 transid;
152 int ret;
145 153
146 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 154 if (!__need_auto_defrag(root))
147 return 0;
148
149 if (btrfs_fs_closing(root->fs_info))
150 return 0; 155 return 0;
151 156
152 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
157 else 162 else
158 transid = BTRFS_I(inode)->root->last_trans; 163 transid = BTRFS_I(inode)->root->last_trans;
159 164
160 defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
161 if (!defrag) 166 if (!defrag)
162 return -ENOMEM; 167 return -ENOMEM;
163 168
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
166 defrag->root = root->root_key.objectid; 171 defrag->root = root->root_key.objectid;
167 172
168 spin_lock(&root->fs_info->defrag_inodes_lock); 173 spin_lock(&root->fs_info->defrag_inodes_lock);
169 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
170 __btrfs_add_inode_defrag(inode, defrag); 175 /*
171 else 176 * If we set IN_DEFRAG flag and evict the inode from memory,
172 kfree(defrag); 177 * and then re-read this inode, this new inode doesn't have
178 * IN_DEFRAG flag. In that case, we may find the existing defrag.
179 */
180 ret = __btrfs_add_inode_defrag(inode, defrag);
181 if (ret)
182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
183 } else {
184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
185 }
173 spin_unlock(&root->fs_info->defrag_inodes_lock); 186 spin_unlock(&root->fs_info->defrag_inodes_lock);
174 return 0; 187 return 0;
175} 188}
176 189
177/* 190/*
178 * must be called with the defrag_inodes lock held 191 * Requeue the defrag object. If there is a defrag object that points to
192 * the same inode in the tree, we will merge them together (by
193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
179 */ 194 */
180struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 195void btrfs_requeue_inode_defrag(struct inode *inode,
181 u64 root, u64 ino, 196 struct inode_defrag *defrag)
182 struct rb_node **next) 197{
198 struct btrfs_root *root = BTRFS_I(inode)->root;
199 int ret;
200
201 if (!__need_auto_defrag(root))
202 goto out;
203
204 /*
205 * Here we don't check the IN_DEFRAG flag, because we need to merge
206 * them together.
207 */
208 spin_lock(&root->fs_info->defrag_inodes_lock);
209 ret = __btrfs_add_inode_defrag(inode, defrag);
210 spin_unlock(&root->fs_info->defrag_inodes_lock);
211 if (ret)
212 goto out;
213 return;
214out:
215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
216}
217
218/*
219 * pick the defragable inode that we want, if it doesn't exist, we will get
220 * the next one.
221 */
222static struct inode_defrag *
223btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
183{ 224{
184 struct inode_defrag *entry = NULL; 225 struct inode_defrag *entry = NULL;
185 struct inode_defrag tmp; 226 struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
190 tmp.ino = ino; 231 tmp.ino = ino;
191 tmp.root = root; 232 tmp.root = root;
192 233
193 p = info->defrag_inodes.rb_node; 234 spin_lock(&fs_info->defrag_inodes_lock);
235 p = fs_info->defrag_inodes.rb_node;
194 while (p) { 236 while (p) {
195 parent = p; 237 parent = p;
196 entry = rb_entry(parent, struct inode_defrag, rb_node); 238 entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
201 else if (ret > 0) 243 else if (ret > 0)
202 p = parent->rb_right; 244 p = parent->rb_right;
203 else 245 else
204 return entry; 246 goto out;
205 } 247 }
206 248
207 if (next) { 249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
208 while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 250 parent = rb_next(parent);
209 parent = rb_next(parent); 251 if (parent)
210 entry = rb_entry(parent, struct inode_defrag, rb_node); 252 entry = rb_entry(parent, struct inode_defrag, rb_node);
211 } 253 else
212 *next = parent; 254 entry = NULL;
213 } 255 }
214 return NULL; 256out:
257 if (entry)
258 rb_erase(parent, &fs_info->defrag_inodes);
259 spin_unlock(&fs_info->defrag_inodes_lock);
260 return entry;
215} 261}
216 262
217/* 263void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
218 * run through the list of inodes in the FS that need
219 * defragging
220 */
221int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
222{ 264{
223 struct inode_defrag *defrag; 265 struct inode_defrag *defrag;
266 struct rb_node *node;
267
268 spin_lock(&fs_info->defrag_inodes_lock);
269 node = rb_first(&fs_info->defrag_inodes);
270 while (node) {
271 rb_erase(node, &fs_info->defrag_inodes);
272 defrag = rb_entry(node, struct inode_defrag, rb_node);
273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
274
275 if (need_resched()) {
276 spin_unlock(&fs_info->defrag_inodes_lock);
277 cond_resched();
278 spin_lock(&fs_info->defrag_inodes_lock);
279 }
280
281 node = rb_first(&fs_info->defrag_inodes);
282 }
283 spin_unlock(&fs_info->defrag_inodes_lock);
284}
285
286#define BTRFS_DEFRAG_BATCH 1024
287
288static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
289 struct inode_defrag *defrag)
290{
224 struct btrfs_root *inode_root; 291 struct btrfs_root *inode_root;
225 struct inode *inode; 292 struct inode *inode;
226 struct rb_node *n;
227 struct btrfs_key key; 293 struct btrfs_key key;
228 struct btrfs_ioctl_defrag_range_args range; 294 struct btrfs_ioctl_defrag_range_args range;
229 u64 first_ino = 0;
230 u64 root_objectid = 0;
231 int num_defrag; 295 int num_defrag;
232 int defrag_batch = 1024;
233 296
297 /* get the inode */
298 key.objectid = defrag->root;
299 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
300 key.offset = (u64)-1;
301 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
302 if (IS_ERR(inode_root)) {
303 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
304 return PTR_ERR(inode_root);
305 }
306
307 key.objectid = defrag->ino;
308 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
309 key.offset = 0;
310 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
311 if (IS_ERR(inode)) {
312 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
313 return PTR_ERR(inode);
314 }
315
316 /* do a chunk of defrag */
317 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
234 memset(&range, 0, sizeof(range)); 318 memset(&range, 0, sizeof(range));
235 range.len = (u64)-1; 319 range.len = (u64)-1;
320 range.start = defrag->last_offset;
321
322 sb_start_write(fs_info->sb);
323 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
324 BTRFS_DEFRAG_BATCH);
325 sb_end_write(fs_info->sb);
326 /*
327 * if we filled the whole defrag batch, there
328 * must be more work to do. Queue this defrag
329 * again
330 */
331 if (num_defrag == BTRFS_DEFRAG_BATCH) {
332 defrag->last_offset = range.start;
333 btrfs_requeue_inode_defrag(inode, defrag);
334 } else if (defrag->last_offset && !defrag->cycled) {
335 /*
336 * we didn't fill our defrag batch, but
337 * we didn't start at zero. Make sure we loop
338 * around to the start of the file.
339 */
340 defrag->last_offset = 0;
341 defrag->cycled = 1;
342 btrfs_requeue_inode_defrag(inode, defrag);
343 } else {
344 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
345 }
346
347 iput(inode);
348 return 0;
349}
350
351/*
352 * run through the list of inodes in the FS that need
353 * defragging
354 */
355int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
356{
357 struct inode_defrag *defrag;
358 u64 first_ino = 0;
359 u64 root_objectid = 0;
236 360
237 atomic_inc(&fs_info->defrag_running); 361 atomic_inc(&fs_info->defrag_running);
238 spin_lock(&fs_info->defrag_inodes_lock);
239 while(1) { 362 while(1) {
240 n = NULL; 363 if (!__need_auto_defrag(fs_info->tree_root))
364 break;
241 365
242 /* find an inode to defrag */ 366 /* find an inode to defrag */
243 defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 367 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
244 first_ino, &n); 368 first_ino);
245 if (!defrag) { 369 if (!defrag) {
246 if (n) { 370 if (root_objectid || first_ino) {
247 defrag = rb_entry(n, struct inode_defrag,
248 rb_node);
249 } else if (root_objectid || first_ino) {
250 root_objectid = 0; 371 root_objectid = 0;
251 first_ino = 0; 372 first_ino = 0;
252 continue; 373 continue;
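
The rework above converts the defrag queue to a pick-then-process scheme: btrfs_pick_defrag_inode() erases the chosen record from the rbtree while defrag_inodes_lock is held, and the caller runs the actual defrag with no spinlock held, requeueing only if work remains. The locking shape, sketched with a pthread mutex and a singly linked list standing in for the spinlock and rbtree:

    #include <pthread.h>
    #include <stddef.h>

    struct defrag {
        struct defrag *next;
        unsigned long long ino;
    };

    static pthread_mutex_t defrag_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct defrag *defrag_queue;

    static struct defrag *pick_defrag(void)
    {
        struct defrag *d;

        pthread_mutex_lock(&defrag_lock);
        d = defrag_queue;          /* erase under the lock ... */
        if (d)
            defrag_queue = d->next;
        pthread_mutex_unlock(&defrag_lock);
        return d;  /* ... caller processes it with no lock held */
    }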
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
255 } 376 }
256 } 377 }
257 378
258 /* remove it from the rbtree */
259 first_ino = defrag->ino + 1; 379 first_ino = defrag->ino + 1;
260 root_objectid = defrag->root; 380 root_objectid = defrag->root;
261 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
262
263 if (btrfs_fs_closing(fs_info))
264 goto next_free;
265
266 spin_unlock(&fs_info->defrag_inodes_lock);
267
268 /* get the inode */
269 key.objectid = defrag->root;
270 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
271 key.offset = (u64)-1;
272 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
273 if (IS_ERR(inode_root))
274 goto next;
275
276 key.objectid = defrag->ino;
277 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
278 key.offset = 0;
279
280 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
281 if (IS_ERR(inode))
282 goto next;
283 381
284 /* do a chunk of defrag */ 382 __btrfs_run_defrag_inode(fs_info, defrag);
285 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
286 range.start = defrag->last_offset;
287 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
288 defrag_batch);
289 /*
290 * if we filled the whole defrag batch, there
291 * must be more work to do. Queue this defrag
292 * again
293 */
294 if (num_defrag == defrag_batch) {
295 defrag->last_offset = range.start;
296 __btrfs_add_inode_defrag(inode, defrag);
297 /*
298 * we don't want to kfree defrag, we added it back to
299 * the rbtree
300 */
301 defrag = NULL;
302 } else if (defrag->last_offset && !defrag->cycled) {
303 /*
304 * we didn't fill our defrag batch, but
305 * we didn't start at zero. Make sure we loop
306 * around to the start of the file.
307 */
308 defrag->last_offset = 0;
309 defrag->cycled = 1;
310 __btrfs_add_inode_defrag(inode, defrag);
311 defrag = NULL;
312 }
313
314 iput(inode);
315next:
316 spin_lock(&fs_info->defrag_inodes_lock);
317next_free:
318 kfree(defrag);
319 } 383 }
320 spin_unlock(&fs_info->defrag_inodes_lock);
321
322 atomic_dec(&fs_info->defrag_running); 384 atomic_dec(&fs_info->defrag_running);
323 385
324 /* 386 /*
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
526 split->block_len = em->block_len; 588 split->block_len = em->block_len;
527 else 589 else
528 split->block_len = split->len; 590 split->block_len = split->len;
591 split->orig_block_len = max(split->block_len,
592 em->orig_block_len);
529 split->generation = gen; 593 split->generation = gen;
530 split->bdev = em->bdev; 594 split->bdev = em->bdev;
531 split->flags = flags; 595 split->flags = flags;
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
547 split->flags = flags; 611 split->flags = flags;
548 split->compress_type = em->compress_type; 612 split->compress_type = em->compress_type;
549 split->generation = gen; 613 split->generation = gen;
614 split->orig_block_len = max(em->block_len,
615 em->orig_block_len);
550 616
551 if (compressed) { 617 if (compressed) {
552 split->block_len = em->block_len; 618 split->block_len = em->block_len;
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
555 } else { 621 } else {
556 split->block_len = split->len; 622 split->block_len = split->len;
557 split->block_start = em->block_start + diff; 623 split->block_start = em->block_start + diff;
558 split->orig_start = split->start; 624 split->orig_start = em->orig_start;
559 } 625 }
560 626
561 ret = add_extent_mapping(em_tree, split); 627 ret = add_extent_mapping(em_tree, split);
@@ -1346,10 +1412,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1346 1412
1347 cond_resched(); 1413 cond_resched();
1348 1414
1349 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1415 balance_dirty_pages_ratelimited(inode->i_mapping);
1350 dirty_pages);
1351 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1416 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1352 btrfs_btree_balance_dirty(root, 1); 1417 btrfs_btree_balance_dirty(root);
1353 1418
1354 pos += copied; 1419 pos += copied;
1355 num_written += copied; 1420 num_written += copied;
@@ -1398,6 +1463,24 @@ out:
1398 return written ? written : err; 1463 return written ? written : err;
1399} 1464}
1400 1465
1466static void update_time_for_write(struct inode *inode)
1467{
1468 struct timespec now;
1469
1470 if (IS_NOCMTIME(inode))
1471 return;
1472
1473 now = current_fs_time(inode->i_sb);
1474 if (!timespec_equal(&inode->i_mtime, &now))
1475 inode->i_mtime = now;
1476
1477 if (!timespec_equal(&inode->i_ctime, &now))
1478 inode->i_ctime = now;
1479
1480 if (IS_I_VERSION(inode))
1481 inode_inc_iversion(inode);
1482}
1483
1401static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1484static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1402 const struct iovec *iov, 1485 const struct iovec *iov,
1403 unsigned long nr_segs, loff_t pos) 1486 unsigned long nr_segs, loff_t pos)
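
update_time_for_write() exists because, as the comment in the next hunk explains, the inode update is already paid for by the extent reservation, so a separate file_update_time() transaction would be wasted; timestamps are assigned only when they actually differ from the current time. The compare-before-assign shape in userspace terms (illustrative, using POSIX timespec):

    #include <stdbool.h>
    #include <time.h>

    static bool ts_equal(const struct timespec *a, const struct timespec *b)
    {
        return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
    }

    /* bump mtime/ctime only when they differ from "now", so an unchanged
     * timestamp never dirties the inode */
    static void update_times(struct timespec *mtime, struct timespec *ctime_,
                             const struct timespec *now)
    {
        if (!ts_equal(mtime, now))
            *mtime = *now;
        if (!ts_equal(ctime_, now))
            *ctime_ = *now;
    }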
@@ -1410,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1410 ssize_t num_written = 0; 1493 ssize_t num_written = 0;
1411 ssize_t err = 0; 1494 ssize_t err = 0;
1412 size_t count, ocount; 1495 size_t count, ocount;
1496 bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1413 1497
1414 sb_start_write(inode->i_sb); 1498 sb_start_write(inode->i_sb);
1415 1499
@@ -1452,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1452 goto out; 1536 goto out;
1453 } 1537 }
1454 1538
1455 err = file_update_time(file); 1539 /*
1456 if (err) { 1540 * We reserve space for updating the inode when we reserve space for the
1457 mutex_unlock(&inode->i_mutex); 1541 * extent we are going to write, so we will enospc out there. We don't
1458 goto out; 1542 * need to start yet another transaction to update the inode as we will
1459 } 1543 * update the inode when we finish writing whatever data we write.
1544 */
1545 update_time_for_write(inode);
1460 1546
1461 start_pos = round_down(pos, root->sectorsize); 1547 start_pos = round_down(pos, root->sectorsize);
1462 if (start_pos > i_size_read(inode)) { 1548 if (start_pos > i_size_read(inode)) {
@@ -1467,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1467 } 1553 }
1468 } 1554 }
1469 1555
1556 if (sync)
1557 atomic_inc(&BTRFS_I(inode)->sync_writers);
1558
1470 if (unlikely(file->f_flags & O_DIRECT)) { 1559 if (unlikely(file->f_flags & O_DIRECT)) {
1471 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1560 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1472 pos, ppos, count, ocount); 1561 pos, ppos, count, ocount);
@@ -1493,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1493 * this will either be one more than the running transaction 1582 * this will either be one more than the running transaction
1494 * or the generation used for the next transaction if there isn't 1583 * or the generation used for the next transaction if there isn't
1495 * one running right now. 1584 * one running right now.
1585 *
1586 * We also have to set last_sub_trans to the current log transid,
1587 * otherwise subsequent syncs to a file that's been synced in this
1588 * transaction will appear to have already occurred.
1496 */ 1589 */
1497 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1590 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1591 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1498 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1592 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1499 err = generic_write_sync(file, pos, num_written); 1593 err = generic_write_sync(file, pos, num_written);
1500 if (err < 0 && num_written > 0) 1594 if (err < 0 && num_written > 0)
1501 num_written = err; 1595 num_written = err;
1502 } 1596 }
1503out: 1597out:
1598 if (sync)
1599 atomic_dec(&BTRFS_I(inode)->sync_writers);
1504 sb_end_write(inode->i_sb); 1600 sb_end_write(inode->i_sb);
1505 current->backing_dev_info = NULL; 1601 current->backing_dev_info = NULL;
1506 return num_written ? num_written : err; 1602 return num_written ? num_written : err;
@@ -1551,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1551 * out of the ->i_mutex. If so, we can flush the dirty pages by 1647 * out of the ->i_mutex. If so, we can flush the dirty pages by
1552 * multiple tasks, and improve performance. 1648 * multiple tasks, and improve performance.
1553 */ 1649 */
1650 atomic_inc(&BTRFS_I(inode)->sync_writers);
1554 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1651 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1652 atomic_dec(&BTRFS_I(inode)->sync_writers);
1555 if (ret) 1653 if (ret)
1556 return ret; 1654 return ret;
1557 1655
@@ -1562,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1562 * range being left. 1660 * range being left.
1563 */ 1661 */
1564 atomic_inc(&root->log_batch); 1662 atomic_inc(&root->log_batch);
1565 btrfs_wait_ordered_range(inode, start, end); 1663 btrfs_wait_ordered_range(inode, start, end - start + 1);
1566 atomic_inc(&root->log_batch); 1664 atomic_inc(&root->log_batch);
1567 1665
1568 /* 1666 /*
@@ -1768,6 +1866,7 @@ out:
1768 1866
1769 hole_em->block_start = EXTENT_MAP_HOLE; 1867 hole_em->block_start = EXTENT_MAP_HOLE;
1770 hole_em->block_len = 0; 1868 hole_em->block_len = 0;
1869 hole_em->orig_block_len = 0;
1771 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 1870 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1772 hole_em->compress_type = BTRFS_COMPRESS_NONE; 1871 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1773 hole_em->generation = trans->transid; 1872 hole_em->generation = trans->transid;
@@ -1797,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1797 struct btrfs_path *path; 1896 struct btrfs_path *path;
1798 struct btrfs_block_rsv *rsv; 1897 struct btrfs_block_rsv *rsv;
1799 struct btrfs_trans_handle *trans; 1898 struct btrfs_trans_handle *trans;
1800 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1899 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
1801 u64 lockstart = (offset + mask) & ~mask; 1900 u64 lockend = round_down(offset + len,
1802 u64 lockend = ((offset + len) & ~mask) - 1; 1901 BTRFS_I(inode)->root->sectorsize) - 1;
1803 u64 cur_offset = lockstart; 1902 u64 cur_offset = lockstart;
1804 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 1903 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1805 u64 drop_end; 1904 u64 drop_end;
1806 unsigned long nr;
1807 int ret = 0; 1905 int ret = 0;
1808 int err = 0; 1906 int err = 0;
1809 bool same_page = (offset >> PAGE_CACHE_SHIFT) == 1907 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
1810 ((offset + len) >> PAGE_CACHE_SHIFT); 1908 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
1811 1909
1812 btrfs_wait_ordered_range(inode, offset, len); 1910 btrfs_wait_ordered_range(inode, offset, len);
1813 1911
1814 mutex_lock(&inode->i_mutex); 1912 mutex_lock(&inode->i_mutex);
1815 if (offset >= inode->i_size) { 1913 /*
1816 mutex_unlock(&inode->i_mutex); 1914 * We needn't truncate any page which is beyond the end of the file
1817 return 0; 1915 * because we are sure there is no data there.
1818 } 1916 */
1819
1820 /* 1917 /*
1821 * Only do this if we are in the same page and we aren't doing the 1918 * Only do this if we are in the same page and we aren't doing the
1822 * entire page. 1919 * entire page.
1823 */ 1920 */
1824 if (same_page && len < PAGE_CACHE_SIZE) { 1921 if (same_page && len < PAGE_CACHE_SIZE) {
1825 ret = btrfs_truncate_page(inode, offset, len, 0); 1922 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
1923 ret = btrfs_truncate_page(inode, offset, len, 0);
1826 mutex_unlock(&inode->i_mutex); 1924 mutex_unlock(&inode->i_mutex);
1827 return ret; 1925 return ret;
1828 } 1926 }
1829 1927
1830 /* zero back part of the first page */ 1928 /* zero back part of the first page */
1831 ret = btrfs_truncate_page(inode, offset, 0, 0); 1929 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1832 if (ret) { 1930 ret = btrfs_truncate_page(inode, offset, 0, 0);
1833 mutex_unlock(&inode->i_mutex); 1931 if (ret) {
1834 return ret; 1932 mutex_unlock(&inode->i_mutex);
1933 return ret;
1934 }
1835 } 1935 }
1836 1936
1837 /* zero the front end of the last page */ 1937 /* zero the front end of the last page */
1838 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1938 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1839 if (ret) { 1939 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1840 mutex_unlock(&inode->i_mutex); 1940 if (ret) {
1841 return ret; 1941 mutex_unlock(&inode->i_mutex);
1942 return ret;
1943 }
1842 } 1944 }
1843 1945
1844 if (lockend < lockstart) { 1946 if (lockend < lockstart) {
@@ -1931,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1931 break; 2033 break;
1932 } 2034 }
1933 2035
1934 nr = trans->blocks_used;
1935 btrfs_end_transaction(trans, root); 2036 btrfs_end_transaction(trans, root);
1936 btrfs_btree_balance_dirty(root, nr); 2037 btrfs_btree_balance_dirty(root);
1937 2038
1938 trans = btrfs_start_transaction(root, 3); 2039 trans = btrfs_start_transaction(root, 3);
1939 if (IS_ERR(trans)) { 2040 if (IS_ERR(trans)) {
@@ -1964,11 +2065,13 @@ out_trans:
1964 if (!trans) 2065 if (!trans)
1965 goto out_free; 2066 goto out_free;
1966 2067
2068 inode_inc_iversion(inode);
2069 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2070
1967 trans->block_rsv = &root->fs_info->trans_block_rsv; 2071 trans->block_rsv = &root->fs_info->trans_block_rsv;
1968 ret = btrfs_update_inode(trans, root, inode); 2072 ret = btrfs_update_inode(trans, root, inode);
1969 nr = trans->blocks_used;
1970 btrfs_end_transaction(trans, root); 2073 btrfs_end_transaction(trans, root);
1971 btrfs_btree_balance_dirty(root, nr); 2074 btrfs_btree_balance_dirty(root);
1972out_free: 2075out_free:
1973 btrfs_free_path(path); 2076 btrfs_free_path(path);
1974 btrfs_free_block_rsv(root, rsv); 2077 btrfs_free_block_rsv(root, rsv);
@@ -1992,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,
1992 u64 alloc_end; 2095 u64 alloc_end;
1993 u64 alloc_hint = 0; 2096 u64 alloc_hint = 0;
1994 u64 locked_end; 2097 u64 locked_end;
1995 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1996 struct extent_map *em; 2098 struct extent_map *em;
2099 int blocksize = BTRFS_I(inode)->root->sectorsize;
1997 int ret; 2100 int ret;
1998 2101
1999 alloc_start = offset & ~mask; 2102 alloc_start = round_down(offset, blocksize);
2000 alloc_end = (offset + len + mask) & ~mask; 2103 alloc_end = round_up(offset + len, blocksize);
2001 2104
2002 /* Make sure we aren't being given some crap mode */ 2105 /* Make sure we aren't being given some crap mode */
2003 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2106 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2010,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2010 * Make sure we have enough space before we do the 2113 * Make sure we have enough space before we do the
2011 * allocation. 2114 * allocation.
2012 */ 2115 */
2013 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); 2116 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
2014 if (ret) 2117 if (ret)
2015 return ret; 2118 return ret;
2016 2119
@@ -2078,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2078 } 2181 }
2079 last_byte = min(extent_map_end(em), alloc_end); 2182 last_byte = min(extent_map_end(em), alloc_end);
2080 actual_end = min_t(u64, extent_map_end(em), offset + len); 2183 actual_end = min_t(u64, extent_map_end(em), offset + len);
2081 last_byte = (last_byte + mask) & ~mask; 2184 last_byte = ALIGN(last_byte, blocksize);
2082 2185
2083 if (em->block_start == EXTENT_MAP_HOLE || 2186 if (em->block_start == EXTENT_MAP_HOLE ||
2084 (cur_offset >= inode->i_size && 2187 (cur_offset >= inode->i_size &&
@@ -2117,11 +2220,11 @@ static long btrfs_fallocate(struct file *file, int mode,
2117out: 2220out:
2118 mutex_unlock(&inode->i_mutex); 2221 mutex_unlock(&inode->i_mutex);
2119 /* Let go of our reservation. */ 2222 /* Let go of our reservation. */
2120 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); 2223 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2121 return ret; 2224 return ret;
2122} 2225}
2123 2226
2124static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) 2227static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2125{ 2228{
2126 struct btrfs_root *root = BTRFS_I(inode)->root; 2229 struct btrfs_root *root = BTRFS_I(inode)->root;
2127 struct extent_map *em; 2230 struct extent_map *em;
@@ -2155,7 +2258,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2155 * before the position we want in case there is outstanding delalloc 2258 * before the position we want in case there is outstanding delalloc
2156 * going on here. 2259 * going on here.
2157 */ 2260 */
2158 if (origin == SEEK_HOLE && start != 0) { 2261 if (whence == SEEK_HOLE && start != 0) {
2159 if (start <= root->sectorsize) 2262 if (start <= root->sectorsize)
2160 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, 2263 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
2161 root->sectorsize, 0); 2264 root->sectorsize, 0);
@@ -2189,13 +2292,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2189 } 2292 }
2190 } 2293 }
2191 2294
2192 if (origin == SEEK_HOLE) { 2295 if (whence == SEEK_HOLE) {
2193 *offset = start; 2296 *offset = start;
2194 free_extent_map(em); 2297 free_extent_map(em);
2195 break; 2298 break;
2196 } 2299 }
2197 } else { 2300 } else {
2198 if (origin == SEEK_DATA) { 2301 if (whence == SEEK_DATA) {
2199 if (em->block_start == EXTENT_MAP_DELALLOC) { 2302 if (em->block_start == EXTENT_MAP_DELALLOC) {
2200 if (start >= inode->i_size) { 2303 if (start >= inode->i_size) {
2201 free_extent_map(em); 2304 free_extent_map(em);
@@ -2232,16 +2335,16 @@ out:
2232 return ret; 2335 return ret;
2233} 2336}
2234 2337
2235static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) 2338static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2236{ 2339{
2237 struct inode *inode = file->f_mapping->host; 2340 struct inode *inode = file->f_mapping->host;
2238 int ret; 2341 int ret;
2239 2342
2240 mutex_lock(&inode->i_mutex); 2343 mutex_lock(&inode->i_mutex);
2241 switch (origin) { 2344 switch (whence) {
2242 case SEEK_END: 2345 case SEEK_END:
2243 case SEEK_CUR: 2346 case SEEK_CUR:
2244 offset = generic_file_llseek(file, offset, origin); 2347 offset = generic_file_llseek(file, offset, whence);
2245 goto out; 2348 goto out;
2246 case SEEK_DATA: 2349 case SEEK_DATA:
2247 case SEEK_HOLE: 2350 case SEEK_HOLE:
@@ -2250,7 +2353,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
2250 return -ENXIO; 2353 return -ENXIO;
2251 } 2354 }
2252 2355
2253 ret = find_desired_extent(inode, &offset, origin); 2356 ret = find_desired_extent(inode, &offset, whence);
2254 if (ret) { 2357 if (ret) {
2255 mutex_unlock(&inode->i_mutex); 2358 mutex_unlock(&inode->i_mutex);
2256 return ret; 2359 return ret;
@@ -2293,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {
2293 .compat_ioctl = btrfs_ioctl, 2396 .compat_ioctl = btrfs_ioctl,
2294#endif 2397#endif
2295}; 2398};
2399
2400void btrfs_auto_defrag_exit(void)
2401{
2402 if (btrfs_inode_defrag_cachep)
2403 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2404}
2405
2406int btrfs_auto_defrag_init(void)
2407{
2408 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2409 sizeof(struct inode_defrag), 0,
2410 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2411 NULL);
2412 if (!btrfs_inode_defrag_cachep)
2413 return -ENOMEM;
2414
2415 return 0;
2416}
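
Across the file.c changes, open-coded mask arithmetic such as `(offset + mask) & ~mask` gives way to round_up()/round_down() on the sector size. For a power-of-two alignment the two forms are identical; a self-checking sketch:

    #include <assert.h>

    /* power-of-two alignment, matching the masks on the old lines above */
    static unsigned long long rnd_down(unsigned long long x,
                                       unsigned long long a)
    {
        return x & ~(a - 1);
    }

    static unsigned long long rnd_up(unsigned long long x,
                                     unsigned long long a)
    {
        return (x + a - 1) & ~(a - 1);
    }

    int main(void)
    {
        assert(rnd_down(5000, 4096) == 4096);
        assert(rnd_up(5000, 4096) == 8192);
        assert(rnd_up(8192, 4096) == 8192);  /* already aligned */
        return 0;
    }

The named helpers also make the off-by-one in the fallocate hunk visible, where the old code reserved `alloc_end - alloc_start + 1` bytes against an already-aligned, exclusive-end range.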
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1027b854b90c..59ea2e4349c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
307 307
308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
309{ 309{
310 WARN_ON(io_ctl->cur);
311 BUG_ON(io_ctl->index >= io_ctl->num_pages); 310 BUG_ON(io_ctl->index >= io_ctl->num_pages);
312 io_ctl->page = io_ctl->pages[io_ctl->index++]; 311 io_ctl->page = io_ctl->pages[io_ctl->index++];
313 io_ctl->cur = kmap(io_ctl->page); 312 io_ctl->cur = kmap(io_ctl->page);
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1250 * if previous extent entry covers the offset, 1249 * if previous extent entry covers the offset,
1251 * we should return it instead of the bitmap entry 1250 * we should return it instead of the bitmap entry
1252 */ 1251 */
1253 n = &entry->offset_index; 1252 n = rb_prev(&entry->offset_index);
1254 while (1) { 1253 if (n) {
1255 n = rb_prev(n);
1256 if (!n)
1257 break;
1258 prev = rb_entry(n, struct btrfs_free_space, 1254 prev = rb_entry(n, struct btrfs_free_space,
1259 offset_index); 1255 offset_index);
1260 if (!prev->bitmap) { 1256 if (!prev->bitmap &&
1261 if (prev->offset + prev->bytes > offset) 1257 prev->offset + prev->bytes > offset)
1262 entry = prev; 1258 entry = prev;
1263 break;
1264 }
1265 } 1259 }
1266 } 1260 }
1267 return entry; 1261 return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1287 } 1281 }
1288 1282
1289 if (entry->bitmap) { 1283 if (entry->bitmap) {
1290 n = &entry->offset_index; 1284 n = rb_prev(&entry->offset_index);
1291 while (1) { 1285 if (n) {
1292 n = rb_prev(n);
1293 if (!n)
1294 break;
1295 prev = rb_entry(n, struct btrfs_free_space, 1286 prev = rb_entry(n, struct btrfs_free_space,
1296 offset_index); 1287 offset_index);
1297 if (!prev->bitmap) { 1288 if (!prev->bitmap &&
1298 if (prev->offset + prev->bytes > offset) 1289 prev->offset + prev->bytes > offset)
1299 return prev; 1290 return prev;
1300 break;
1301 }
1302 } 1291 }
1303 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1292 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1304 return entry; 1293 return entry;
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1364 u64 bitmap_bytes; 1353 u64 bitmap_bytes;
1365 u64 extent_bytes; 1354 u64 extent_bytes;
1366 u64 size = block_group->key.offset; 1355 u64 size = block_group->key.offset;
1367 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1368 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1369 1358
1370 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1359 BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1650 * some block groups are so tiny they can't be enveloped by a bitmap, so 1639 * some block groups are so tiny they can't be enveloped by a bitmap, so
1651 * don't even bother to create a bitmap for this 1640 * don't even bother to create a bitmap for this
1652 */ 1641 */
1653 if (BITS_PER_BITMAP * block_group->sectorsize > 1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
1654 block_group->key.offset)
1655 return false; 1643 return false;
1656 1644
1657 return true; 1645 return true;
@@ -2298,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2298 unsigned long total_found = 0; 2286 unsigned long total_found = 0;
2299 int ret; 2287 int ret;
2300 2288
2301 i = offset_to_bit(entry->offset, block_group->sectorsize, 2289 i = offset_to_bit(entry->offset, ctl->unit,
2302 max_t(u64, offset, entry->offset)); 2290 max_t(u64, offset, entry->offset));
2303 want_bits = bytes_to_bits(bytes, block_group->sectorsize); 2291 want_bits = bytes_to_bits(bytes, ctl->unit);
2304 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2292 min_bits = bytes_to_bits(min_bytes, ctl->unit);
2305 2293
2306again: 2294again:
2307 found_bits = 0; 2295 found_bits = 0;
@@ -2325,23 +2313,22 @@ again:
2325 2313
2326 total_found += found_bits; 2314 total_found += found_bits;
2327 2315
2328 if (cluster->max_size < found_bits * block_group->sectorsize) 2316 if (cluster->max_size < found_bits * ctl->unit)
2329 cluster->max_size = found_bits * block_group->sectorsize; 2317 cluster->max_size = found_bits * ctl->unit;
2330 2318
2331 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2319 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2332 i = next_zero + 1; 2320 i = next_zero + 1;
2333 goto again; 2321 goto again;
2334 } 2322 }
2335 2323
2336 cluster->window_start = start * block_group->sectorsize + 2324 cluster->window_start = start * ctl->unit + entry->offset;
2337 entry->offset;
2338 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2325 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2339 ret = tree_insert_offset(&cluster->root, entry->offset, 2326 ret = tree_insert_offset(&cluster->root, entry->offset,
2340 &entry->offset_index, 1); 2327 &entry->offset_index, 1);
2341 BUG_ON(ret); /* -EEXIST; Logic error */ 2328 BUG_ON(ret); /* -EEXIST; Logic error */
2342 2329
2343 trace_btrfs_setup_cluster(block_group, cluster, 2330 trace_btrfs_setup_cluster(block_group, cluster,
2344 total_found * block_group->sectorsize, 1); 2331 total_found * ctl->unit, 1);
2345 return 0; 2332 return 0;
2346} 2333}
2347 2334
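
The free-space-cache hunks replace block_group->sectorsize with ctl->unit in every piece of bitmap arithmetic, so the code works for any free-space ctl rather than assuming a block group's geometry. The conversions being parameterised look roughly like this (illustrative helpers; the kernel keeps its own inline versions in free-space-cache.c):

    /* one bit represents "unit" bytes, counted from the bitmap's base */
    static unsigned long offset_to_bit(unsigned long long base,
                                       unsigned long unit,
                                       unsigned long long offset)
    {
        return (unsigned long)((offset - base) / unit);
    }

    static unsigned long bytes_to_bits(unsigned long long bytes,
                                       unsigned long unit)
    {
        return (unsigned long)(bytes / unit);
    }

    /* and back again, as in the cluster window math above */
    static unsigned long long bits_to_bytes(unsigned long bits,
                                            unsigned long unit)
    {
        return (unsigned long long)bits * unit;
    }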
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b1a1c929ba80..d26f67a59e36 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	 * 3 items for pre-allocation
 	 */
 	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
-	ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
-					  trans->bytes_reserved);
+	ret = btrfs_block_rsv_add(root, trans->block_rsv,
+				  trans->bytes_reserved,
+				  BTRFS_RESERVE_NO_FLUSH);
 	if (ret)
 		goto out;
 	trace_btrfs_space_reservation(root->fs_info, "ino_cache",
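The _noflush variant of the reservation API is gone; callers now pass an explicit flush mode to btrfs_block_rsv_add(). Judging from the call sites in this patch, the enum introduced by the series looks roughly like this (a sketch from context; see ctree.h in the same tree):

	enum btrfs_reserve_flush_enum {
		/* caller may hold locks: don't flush delalloc or commit */
		BTRFS_RESERVE_NO_FLUSH,
		/* flush, but skip operations that could deadlock the caller */
		BTRFS_RESERVE_FLUSH_LIMIT,
		/* do whatever is needed to satisfy the reservation */
		BTRFS_RESERVE_FLUSH_ALL,
	};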
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95542a1b3dfc..67ed24ae86bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
 static struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
+static struct kmem_cache *btrfs_delalloc_work_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_transaction_cachep;
 struct kmem_cache *btrfs_path_cachep;
@@ -94,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode,
 				   struct page *locked_page,
 				   u64 start, u64 end, int *page_started,
 				   unsigned long *nr_written, int unlock);
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+					   u64 len, u64 orig_start,
+					   u64 block_start, u64 block_len,
+					   u64 orig_block_len, int type);
 
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 				     struct inode *inode, struct inode *dir,
@@ -698,14 +703,19 @@ retry:
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		em->compress_type = async_extent->compress_type;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+		em->generation = -1;
 
 		while (1) {
 			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
+			if (!ret)
+				list_move(&em->list,
+					  &em_tree->modified_extents);
 			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
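This hunk introduces a pattern repeated throughout the file: a newly pinned extent map gets generation -1 and, once inserted, is moved onto em_tree->modified_extents, the list the tree-log code walks at fsync time to log only what changed. Condensed, the idiom is:

	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em);
	if (!ret)
		/* let fsync find this extent without a full scan */
		list_move(&em->list, &em_tree->modified_extents);
	write_unlock(&em_tree->lock);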
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
  * required to start IO on it. It may be clean and already done with
  * IO when we return.
  */
-static noinline int cow_file_range(struct inode *inode,
-				   struct page *locked_page,
-				   u64 start, u64 end, int *page_started,
-				   unsigned long *nr_written,
-				   int unlock)
+static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
+				     struct inode *inode,
+				     struct btrfs_root *root,
+				     struct page *locked_page,
+				     u64 start, u64 end, int *page_started,
+				     unsigned long *nr_written,
+				     int unlock)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
 	unsigned long ram_size;
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
 	int ret = 0;
 
 	BUG_ON(btrfs_is_free_space_inode(inode));
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans)) {
-		extent_clear_unlock_delalloc(inode,
-			     &BTRFS_I(inode)->io_tree,
-			     start, end, locked_page,
-			     EXTENT_CLEAR_UNLOCK_PAGE |
-			     EXTENT_CLEAR_UNLOCK |
-			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_CLEAR_DIRTY |
-			     EXTENT_SET_WRITEBACK |
-			     EXTENT_END_WRITEBACK);
-		return PTR_ERR(trans);
-	}
-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize, num_bytes);
 	disk_num_bytes = num_bytes;
-	ret = 0;
 
 	/* if this is a small write inside eof, kick off defrag */
 	if (num_bytes < 64 * 1024 &&
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+		em->generation = -1;
 
 		while (1) {
 			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
+			if (!ret)
+				list_move(&em->list,
+					  &em_tree->modified_extents);
 			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
-	ret = 0;
 out:
-	btrfs_end_transaction(trans, root);
-
 	return ret;
+
 out_unlock:
 	extent_clear_unlock_delalloc(inode,
 				     &BTRFS_I(inode)->io_tree,
@@ -971,6 +969,39 @@ out_unlock:
 	goto out;
 }
 
+static noinline int cow_file_range(struct inode *inode,
+				   struct page *locked_page,
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written,
+				   int unlock)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		extent_clear_unlock_delalloc(inode,
+					     &BTRFS_I(inode)->io_tree,
+					     start, end, locked_page,
+					     EXTENT_CLEAR_UNLOCK_PAGE |
+					     EXTENT_CLEAR_UNLOCK |
+					     EXTENT_CLEAR_DELALLOC |
+					     EXTENT_CLEAR_DIRTY |
+					     EXTENT_SET_WRITEBACK |
+					     EXTENT_END_WRITEBACK);
+		return PTR_ERR(trans);
+	}
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	ret = __cow_file_range(trans, inode, root, locked_page, start, end,
+			       page_started, nr_written, unlock);
+
+	btrfs_end_transaction(trans, root);
+
+	return ret;
+}
+
 /*
  * work queue call back to started compression on a file and pages
  */
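The refactor above follows a common kernel shape: the old entry point keeps its signature but shrinks to transaction setup and teardown around a double-underscore worker that takes a caller-supplied handle, so run_delalloc_nocow() below can reuse the worker inside its own transaction. In outline (do_range/__do_range are invented names for illustration):

	static int __do_range(struct btrfs_trans_handle *trans,
			      struct inode *inode, u64 start, u64 end);

	static int do_range(struct inode *inode, u64 start, u64 end)
	{
		struct btrfs_root *root = BTRFS_I(inode)->root;
		struct btrfs_trans_handle *trans;
		int ret;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);	/* caller sees join failure */
		ret = __do_range(trans, inode, start, end);
		btrfs_end_transaction(trans, root);
		return ret;
	}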
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	u64 extent_offset;
 	u64 disk_bytenr;
 	u64 num_bytes;
+	u64 disk_num_bytes;
 	int extent_type;
 	int ret, err;
 	int type;
@@ -1228,6 +1260,8 @@ next_slot:
 		extent_offset = btrfs_file_extent_offset(leaf, fi);
 		extent_end = found_key.offset +
 			btrfs_file_extent_num_bytes(leaf, fi);
+		disk_num_bytes =
+			btrfs_file_extent_disk_num_bytes(leaf, fi);
 		if (extent_end <= start) {
 			path->slots[0]++;
 			goto next_slot;
@@ -1281,9 +1315,9 @@ out_check:
 
 		btrfs_release_path(path);
 		if (cow_start != (u64)-1) {
-			ret = cow_file_range(inode, locked_page, cow_start,
-					     found_key.offset - 1, page_started,
-					     nr_written, 1);
+			ret = __cow_file_range(trans, inode, root, locked_page,
+					       cow_start, found_key.offset - 1,
+					       page_started, nr_written, 1);
 			if (ret) {
 				btrfs_abort_transaction(trans, root, ret);
 				goto error;
@@ -1298,16 +1332,21 @@ out_check:
 			em = alloc_extent_map();
 			BUG_ON(!em); /* -ENOMEM */
 			em->start = cur_offset;
-			em->orig_start = em->start;
+			em->orig_start = found_key.offset - extent_offset;
 			em->len = num_bytes;
 			em->block_len = num_bytes;
 			em->block_start = disk_bytenr;
+			em->orig_block_len = disk_num_bytes;
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
-			set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+			set_bit(EXTENT_FLAG_FILLING, &em->flags);
+			em->generation = -1;
 			while (1) {
 				write_lock(&em_tree->lock);
 				ret = add_extent_mapping(em_tree, em);
+				if (!ret)
+					list_move(&em->list,
+						  &em_tree->modified_extents);
 				write_unlock(&em_tree->lock);
 				if (ret != -EEXIST) {
 					free_extent_map(em);
@@ -1352,8 +1391,9 @@ out_check:
 	}
 
 	if (cow_start != (u64)-1) {
-		ret = cow_file_range(inode, locked_page, cow_start, end,
-				     page_started, nr_written, 1);
+		ret = __cow_file_range(trans, inode, root, locked_page,
+				       cow_start, end,
+				       page_started, nr_written, 1);
 		if (ret) {
 			btrfs_abort_transaction(trans, root, ret);
 			goto error;
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct btrfs_mapping_tree *map_tree;
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 		return 0;
 
 	length = bio->bi_size;
-	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, READ, logical,
+	ret = btrfs_map_block(root->fs_info, READ, logical,
 			      &map_length, NULL, 0);
-	/* Will always return 0 or 1 with map_multi == NULL */
+	/* Will always return 0 with map_multi == NULL */
 	BUG_ON(ret < 0);
 	if (map_length < length + size)
 		return 1;
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+	int ret;
+
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+	if (ret)
+		bio_endio(bio, ret);
+	return ret;
 }
 
 /*
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	int ret = 0;
 	int skip_sum;
 	int metadata = 0;
+	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	if (!(rw & REQ_WRITE)) {
 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
 		if (ret)
-			return ret;
+			goto out;
 
 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
-			return btrfs_submit_compressed_read(inode, bio,
-						    mirror_num, bio_flags);
+			ret = btrfs_submit_compressed_read(inode, bio,
+							   mirror_num,
+							   bio_flags);
+			goto out;
 		} else if (!skip_sum) {
 			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
 			if (ret)
-				return ret;
+				goto out;
 		}
 		goto mapit;
-	} else if (!skip_sum) {
+	} else if (async && !skip_sum) {
 		/* csum items have already been cloned */
 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
 			goto mapit;
 		/* we're doing a write, do the async checksumming */
-		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
 				   bio_flags, bio_offset,
 				   __btrfs_submit_bio_start,
 				   __btrfs_submit_bio_done);
+		goto out;
+	} else if (!skip_sum) {
+		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+		if (ret)
+			goto out;
 	}
 
 mapit:
-	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+
+out:
+	if (ret < 0)
+		bio_endio(bio, ret);
+	return ret;
 }
 
 /*
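The rewritten submit path makes two behavioural changes: checksumming is punted to the worker threads only when nothing on the inode is inside a synchronous write (the new sync_writers counter; sync writers checksum inline to avoid the worker-queue latency), and every failure now completes the bio with bio_endio() instead of returning an error the caller might never propagate. Reduced to its core, the write-side decision looks like this (an illustrative reduction, not the kernel function):

	/* sketch: pick a checksumming strategy for one data write bio */
	static int submit_data_write(struct btrfs_root *root,
				     struct inode *inode, int rw,
				     struct bio *bio)
	{
		int ret;

		if (!atomic_read(&BTRFS_I(inode)->sync_writers)) {
			/* nobody is waiting: checksum in a worker thread */
			return btrfs_wq_submit_bio(root->fs_info, inode, rw,
						   bio, 0, 0, 0,
						   __btrfs_submit_bio_start,
						   __btrfs_submit_bio_done);
		}
		/* a sync writer is waiting: checksum inline, then map */
		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
		if (!ret)
			ret = btrfs_map_bio(root, rw, bio, 0, 0);
		if (ret < 0)
			bio_endio(bio, ret);	/* complete the bio on error */
		return ret;
	}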
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state)
 {
-	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
-		WARN_ON(1);
+	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   cached_state, GFP_NOFS);
 }
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
-		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-		if (!ret) {
-			if (nolock)
-				trans = btrfs_join_transaction_nolock(root);
-			else
-				trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
-				trans = NULL;
-				goto out;
-			}
-			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-			ret = btrfs_update_inode_fallback(trans, root, inode);
-			if (ret) /* -ENOMEM or corruption */
-				btrfs_abort_transaction(trans, root, ret);
+		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+		if (nolock)
+			trans = btrfs_join_transaction_nolock(root);
+		else
+			trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			goto out;
 		}
+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+		ret = btrfs_update_inode_fallback(trans, root, inode);
+		if (ret) /* -ENOMEM or corruption */
+			btrfs_abort_transaction(trans, root, ret);
 		goto out;
 	}
 
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
-	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-		ret = btrfs_update_inode_fallback(trans, root, inode);
-		if (ret) { /* -ENOMEM or corruption */
-			btrfs_abort_transaction(trans, root, ret);
-			goto out_unlock;
-		}
-	} else {
-		btrfs_set_inode_last_trans(trans, inode);
+	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+	ret = btrfs_update_inode_fallback(trans, root, inode);
+	if (ret) { /* -ENOMEM or corruption */
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_unlock;
 	}
 	ret = 0;
 out_unlock:
@@ -3074,7 +3123,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	struct inode *inode = dentry->d_inode;
 	int ret;
-	unsigned long nr = 0;
 
 	trans = __unlink_start_trans(dir, dentry);
 	if (IS_ERR(trans))
@@ -3094,9 +3142,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	}
 
 out:
-	nr = trans->blocks_used;
 	__unlink_end_trans(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return ret;
 }
 
@@ -3186,7 +3233,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err = 0;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_trans_handle *trans;
-	unsigned long nr = 0;
 
 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
@@ -3215,9 +3261,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!err)
 		btrfs_i_size_write(inode, 0);
 out:
-	nr = trans->blocks_used;
 	__unlink_end_trans(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 
 	return err;
 }
@@ -3497,11 +3542,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 	if (ret)
 		goto out;
 
-	ret = -ENOMEM;
 again:
 	page = find_or_create_page(mapping, index, mask);
 	if (!page) {
 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+		ret = -ENOMEM;
 		goto out;
 	}
 
@@ -3550,7 +3595,6 @@ again:
 		goto out_unlock;
 	}
 
-	ret = 0;
 	if (offset != PAGE_CACHE_SIZE) {
 		if (!len)
 			len = PAGE_CACHE_SIZE - offset;
@@ -3668,6 +3712,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
 		hole_em->block_start = EXTENT_MAP_HOLE;
 		hole_em->block_len = 0;
+		hole_em->orig_block_len = 0;
 		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
 		hole_em->compress_type = BTRFS_COMPRESS_NONE;
 		hole_em->generation = trans->transid;
@@ -3783,7 +3828,6 @@ void btrfs_evict_inode(struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv, *global_rsv;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
-	unsigned long nr;
 	int ret;
 
 	trace_btrfs_inode_evict(inode);
@@ -3829,7 +3873,8 @@ void btrfs_evict_inode(struct inode *inode)
 	 * inode item when doing the truncate.
 	 */
 	while (1) {
-		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
+		ret = btrfs_block_rsv_refill(root, rsv, min_size,
+					     BTRFS_RESERVE_FLUSH_LIMIT);
 
 		/*
 		 * Try and steal from the global reserve since we will
@@ -3847,7 +3892,7 @@ void btrfs_evict_inode(struct inode *inode)
 			goto no_delete;
 		}
 
-		trans = btrfs_start_transaction_noflush(root, 1);
+		trans = btrfs_start_transaction_lflush(root, 1);
 		if (IS_ERR(trans)) {
 			btrfs_orphan_del(NULL, inode);
 			btrfs_free_block_rsv(root, rsv);
@@ -3864,10 +3909,9 @@ void btrfs_evict_inode(struct inode *inode)
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
 
-		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
 	}
 
 	btrfs_free_block_rsv(root, rsv);
@@ -3883,9 +3927,8 @@ void btrfs_evict_inode(struct inode *inode)
 	     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
 		btrfs_return_ino(root, btrfs_ino(inode));
 
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 no_delete:
 	clear_inode(inode);
 	return;
@@ -4775,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (S_ISREG(mode)) {
 		if (btrfs_test_opt(root, NODATASUM))
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-		if (btrfs_test_opt(root, NODATACOW) ||
-		    (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
+		if (btrfs_test_opt(root, NODATACOW))
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
 	}
 
@@ -4842,7 +4884,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 		ret = btrfs_insert_dir_item(trans, root, name, name_len,
 					    parent_inode, &key,
 					    btrfs_inode_type(inode), index);
-		if (ret == -EEXIST)
+		if (ret == -EEXIST || ret == -EOVERFLOW)
 			goto fail_dir_item;
 		else if (ret) {
 			btrfs_abort_transaction(trans, root, ret);
@@ -4897,7 +4939,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
-	unsigned long nr = 0;
 	u64 index = 0;
 
 	if (!new_valid_dev(rdev))
@@ -4930,6 +4971,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
+	err = btrfs_update_inode(trans, root, inode);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+
 	/*
 	 * If the active LSM wants to access the inode during
 	 * d_instantiate it needs these. Smack checks to see
@@ -4947,9 +4994,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		d_instantiate(dentry, inode);
 	}
 out_unlock:
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4963,9 +5009,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = NULL;
-	int drop_inode = 0;
+	int drop_inode_on_err = 0;
 	int err;
-	unsigned long nr = 0;
 	u64 objectid;
 	u64 index = 0;
 
@@ -4989,12 +5034,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		err = PTR_ERR(inode);
 		goto out_unlock;
 	}
+	drop_inode_on_err = 1;
 
 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-	if (err) {
-		drop_inode = 1;
+	if (err)
+		goto out_unlock;
+
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
 		goto out_unlock;
-	}
 
 	/*
 	 * If the active LSM wants to access the inode during
@@ -5007,21 +5055,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
-		drop_inode = 1;
-	else {
-		inode->i_mapping->a_ops = &btrfs_aops;
-		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-		d_instantiate(dentry, inode);
-	}
+		goto out_unlock;
+
+	inode->i_mapping->a_ops = &btrfs_aops;
+	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+	d_instantiate(dentry, inode);
+
 out_unlock:
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	if (drop_inode) {
+	if (err && drop_inode_on_err) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -5032,7 +5079,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = old_dentry->d_inode;
 	u64 index;
-	unsigned long nr = 0;
 	int err;
 	int drop_inode = 0;
 
@@ -5062,6 +5108,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
 	ihold(inode);
+	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
 
@@ -5076,14 +5123,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		btrfs_log_new_name(trans, inode, NULL, parent);
 	}
 
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -5096,7 +5142,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	int drop_on_err = 0;
 	u64 objectid = 0;
 	u64 index = 0;
-	unsigned long nr = 1;
 
 	/*
 	 * 2 items for inode and ref
@@ -5142,11 +5187,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	drop_on_err = 0;
 
 out_fail:
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	if (drop_on_err)
 		iput(inode);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -5340,6 +5384,7 @@ again:
 		if (start + len <= found_key.offset)
 			goto not_found;
 		em->start = start;
+		em->orig_start = start;
 		em->len = found_key.offset - start;
 		goto not_found_em;
 	}
@@ -5350,6 +5395,8 @@ again:
 		em->len = extent_end - extent_start;
 		em->orig_start = extent_start -
 			btrfs_file_extent_offset(leaf, item);
+		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
+								      item);
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 		if (bytenr == 0) {
 			em->block_start = EXTENT_MAP_HOLE;
@@ -5359,8 +5406,7 @@ again:
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 			em->compress_type = compress_type;
 			em->block_start = bytenr;
-			em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
-									 item);
+			em->block_len = em->orig_block_len;
 		} else {
 			bytenr += btrfs_file_extent_offset(leaf, item);
 			em->block_start = bytenr;
@@ -5390,7 +5436,8 @@ again:
 		em->start = extent_start + extent_offset;
 		em->len = (copy_size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
-		em->orig_start = EXTENT_MAP_INLINE;
+		em->orig_block_len = em->len;
+		em->orig_start = em->start;
 		if (compress_type) {
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 			em->compress_type = compress_type;
@@ -5439,11 +5486,11 @@ again:
 			   extent_map_end(em) - 1, NULL, GFP_NOFS);
 		goto insert;
 	} else {
-		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
-		WARN_ON(1);
+		WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
 	}
 not_found:
 	em->start = start;
+	em->orig_start = start;
 	em->len = len;
 not_found_em:
 	em->block_start = EXTENT_MAP_HOLE;
@@ -5645,38 +5692,19 @@ out:
 }
 
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
-						  struct extent_map *em,
 						  u64 start, u64 len)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
 	struct btrfs_key ins;
 	u64 alloc_hint;
 	int ret;
-	bool insert = false;
-
-	/*
-	 * Ok if the extent map we looked up is a hole and is for the exact
-	 * range we want, there is no reason to allocate a new one, however if
-	 * it is not right then we need to free this one and drop the cache for
-	 * our range.
-	 */
-	if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
-	    em->len != len) {
-		free_extent_map(em);
-		em = NULL;
-		insert = true;
-		btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
-	}
 
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return ERR_CAST(trans);
 
-	if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
-		btrfs_add_inode_defrag(trans, inode);
-
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5687,37 +5715,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 		goto out;
 	}
 
-	if (!em) {
-		em = alloc_extent_map();
-		if (!em) {
-			em = ERR_PTR(-ENOMEM);
-			goto out;
-		}
-	}
-
-	em->start = start;
-	em->orig_start = em->start;
-	em->len = ins.offset;
-
-	em->block_start = ins.objectid;
-	em->block_len = ins.offset;
-	em->bdev = root->fs_info->fs_devices->latest_bdev;
-
-	/*
-	 * We need to do this because if we're using the original em we searched
-	 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
-	 */
-	em->flags = 0;
-	set_bit(EXTENT_FLAG_PINNED, &em->flags);
-
-	while (insert) {
-		write_lock(&em_tree->lock);
-		ret = add_extent_mapping(em_tree, em);
-		write_unlock(&em_tree->lock);
-		if (ret != -EEXIST)
-			break;
-		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
-	}
+	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+			      ins.offset, ins.offset, 0);
+	if (IS_ERR(em))
+		goto out;
 
 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
 					   ins.offset, ins.offset, 0);
@@ -5894,7 +5895,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 					   u64 len, u64 orig_start,
 					   u64 block_start, u64 block_len,
-					   int type)
+					   u64 orig_block_len, int type)
 {
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
@@ -5912,15 +5913,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 	em->block_len = block_len;
 	em->block_start = block_start;
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	em->orig_block_len = orig_block_len;
+	em->generation = -1;
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
 	if (type == BTRFS_ORDERED_PREALLOC)
-		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		set_bit(EXTENT_FLAG_FILLING, &em->flags);
 
 	do {
 		btrfs_drop_extent_cache(inode, em->start,
 				em->start + em->len - 1, 0);
 		write_lock(&em_tree->lock);
 		ret = add_extent_mapping(em_tree, em);
+		if (!ret)
+			list_move(&em->list,
+				  &em_tree->modified_extents);
 		write_unlock(&em_tree->lock);
 	} while (ret == -EEXIST);
 
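create_pinned_em() now records the full on-disk extent size (orig_block_len) separately from the length of this particular mapping, and marks the map generation -1 until it is logged. The direct-IO allocator above feeds it values straight from the allocator; its new call shape, taken from the hunk in btrfs_new_extent_direct():

	em = create_pinned_em(inode, start, ins.offset,	/* file range */
			      start,			/* orig_start */
			      ins.objectid,		/* block_start */
			      ins.offset,		/* block_len */
			      ins.offset,		/* orig_block_len */
			      0);			/* plain write, not prealloc */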
@@ -6047,13 +6053,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 			goto must_cow;
 
 		if (can_nocow_odirect(trans, inode, start, len) == 1) {
-			u64 orig_start = em->start;
+			u64 orig_start = em->orig_start;
+			u64 orig_block_len = em->orig_block_len;
 
 			if (type == BTRFS_ORDERED_PREALLOC) {
 				free_extent_map(em);
 				em = create_pinned_em(inode, start, len,
 						      orig_start,
-						      block_start, len, type);
+						      block_start, len,
+						      orig_block_len, type);
 				if (IS_ERR(em)) {
 					btrfs_end_transaction(trans, root);
 					goto unlock_err;
@@ -6077,7 +6085,8 @@ must_cow:
 	 * it above
 	 */
 	len = bh_result->b_size;
-	em = btrfs_new_extent_direct(inode, em, start, len);
+	free_extent_map(em);
+	em = btrfs_new_extent_direct(inode, start, len);
 	if (IS_ERR(em)) {
 		ret = PTR_ERR(em);
 		goto unlock_err;
@@ -6318,6 +6327,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
+	if (async_submit)
+		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
 	bio_get(bio);
 
 	if (!write) {
@@ -6362,7 +6374,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 {
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
 	struct bio *bio;
 	struct bio *orig_bio = dip->orig_bio;
 	struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6375,7 +6386,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 	int async_submit = 0;
 
 	map_length = orig_bio->bi_size;
-	ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+	ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
 			      &map_length, NULL, 0);
 	if (ret) {
 		bio_put(orig_bio);
@@ -6429,7 +6440,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 			bio->bi_end_io = btrfs_end_dio_bio;
 
 			map_length = orig_bio->bi_size;
-			ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+			ret = btrfs_map_block(root->fs_info, READ,
+					      start_sector << 9,
 					      &map_length, NULL, 0);
 			if (ret) {
 				bio_put(bio);
@@ -6582,9 +6594,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 				   btrfs_submit_direct, 0);
 }
 
+#define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
+
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
+	int ret;
+
+	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
+	if (ret)
+		return ret;
+
 	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
 }
 
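fiemap_check_flags() rejects any flag outside the filesystem's supported set with -EBADR (see fs/ioctl.c), so a caller asking btrfs for, say, FIEMAP_FLAG_XATTR now gets an explicit error instead of the flag being silently ignored. A small userspace probe of the behaviour (a sketch, not part of the patch; the expected errno is an assumption based on the hunk):

	#include <fcntl.h>
	#include <linux/fiemap.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		struct fiemap fm;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;
		memset(&fm, 0, sizeof(fm));
		fm.fm_flags = FIEMAP_FLAG_XATTR;	/* not in BTRFS_FIEMAP_FLAGS */
		fm.fm_length = ~0ULL;			/* whole file */
		if (ioctl(fd, FS_IOC_FIEMAP, &fm) < 0)
			perror("fiemap");		/* expect EBADR after this patch */
		close(fd);
		return 0;
	}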
@@ -6855,7 +6875,6 @@ static int btrfs_truncate(struct inode *inode)
 	int ret;
 	int err = 0;
 	struct btrfs_trans_handle *trans;
-	unsigned long nr;
 	u64 mask = root->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
@@ -6978,9 +6997,8 @@ static int btrfs_truncate(struct inode *inode)
 			break;
 		}
 
-		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
 
 		trans = btrfs_start_transaction(root, 2);
 		if (IS_ERR(trans)) {
@@ -7014,9 +7032,8 @@ static int btrfs_truncate(struct inode *inode)
 		if (ret && !err)
 			err = ret;
 
-		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, root);
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
 	}
 
 out:
@@ -7093,6 +7110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
 	ei->io_tree.track_uptodate = 1;
 	ei->io_failure_tree.track_uptodate = 1;
+	atomic_set(&ei->sync_writers, 0);
 	mutex_init(&ei->log_mutex);
 	mutex_init(&ei->delalloc_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7203,6 +7221,8 @@ void btrfs_destroy_cachep(void)
 		kmem_cache_destroy(btrfs_path_cachep);
 	if (btrfs_free_space_cachep)
 		kmem_cache_destroy(btrfs_free_space_cachep);
+	if (btrfs_delalloc_work_cachep)
+		kmem_cache_destroy(btrfs_delalloc_work_cachep);
 }
 
 int btrfs_init_cachep(void)
@@ -7237,6 +7257,13 @@ int btrfs_init_cachep(void)
 	if (!btrfs_free_space_cachep)
 		goto fail;
 
+	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
+			sizeof(struct btrfs_delalloc_work), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+			NULL);
+	if (!btrfs_delalloc_work_cachep)
+		goto fail;
+
 	return 0;
 fail:
 	btrfs_destroy_cachep();
@@ -7308,6 +7335,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
+
+
+	/* check for collisions, even if the name isn't there */
+	ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
+					     new_dentry->d_name.name,
+					     new_dentry->d_name.len);
+
+	if (ret) {
+		if (ret == -EEXIST) {
+			/* we shouldn't get
+			 * eexist without a new_inode */
+			if (!new_inode) {
+				WARN_ON(1);
+				return ret;
+			}
+		} else {
+			/* maybe -EOVERFLOW */
+			return ret;
+		}
+	}
+	ret = 0;
+
 	/*
 	 * we're using rename to replace one file with another.
 	 * and the replacement file is large. Start IO on it now so
@@ -7447,6 +7496,49 @@ out_notrans:
 	return ret;
 }
 
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+	struct btrfs_delalloc_work *delalloc_work;
+
+	delalloc_work = container_of(work, struct btrfs_delalloc_work,
+				     work);
+	if (delalloc_work->wait)
+		btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
+	else
+		filemap_flush(delalloc_work->inode->i_mapping);
+
+	if (delalloc_work->delay_iput)
+		btrfs_add_delayed_iput(delalloc_work->inode);
+	else
+		iput(delalloc_work->inode);
+	complete(&delalloc_work->completion);
+}
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+						      int wait, int delay_iput)
+{
+	struct btrfs_delalloc_work *work;
+
+	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+	if (!work)
+		return NULL;
+
+	init_completion(&work->completion);
+	INIT_LIST_HEAD(&work->list);
+	work->inode = inode;
+	work->wait = wait;
+	work->delay_iput = delay_iput;
+	work->work.func = btrfs_run_delalloc_work;
+
+	return work;
+}
+
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
+{
+	wait_for_completion(&work->completion);
+	kmem_cache_free(btrfs_delalloc_work_cachep, work);
+}
+
 /*
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
@@ -7456,10 +7548,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	struct list_head *head = &root->fs_info->delalloc_inodes;
 	struct btrfs_inode *binode;
 	struct inode *inode;
+	struct btrfs_delalloc_work *work, *next;
+	struct list_head works;
+	int ret = 0;
 
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
+	INIT_LIST_HEAD(&works);
+
 	spin_lock(&root->fs_info->delalloc_lock);
 	while (!list_empty(head)) {
 		binode = list_entry(head->next, struct btrfs_inode,
@@ -7469,11 +7566,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 		list_del_init(&binode->delalloc_inodes);
 		spin_unlock(&root->fs_info->delalloc_lock);
 		if (inode) {
-			filemap_flush(inode->i_mapping);
-			if (delay_iput)
-				btrfs_add_delayed_iput(inode);
-			else
-				iput(inode);
+			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+			if (!work) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			list_add_tail(&work->list, &works);
+			btrfs_queue_worker(&root->fs_info->flush_workers,
+					   &work->work);
 		}
 		cond_resched();
 		spin_lock(&root->fs_info->delalloc_lock);
@@ -7492,7 +7592,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
-	return 0;
+out:
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+	return ret;
 }
 
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
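btrfs_start_delalloc_inodes() thus stops flushing each inode's mapping serially: it fans one btrfs_delalloc_work item per inode out to the flush_workers pool and then reaps the completions, so writeback on many inodes is kicked off in parallel. The skeleton of the pattern, using the helpers added above (locking and the igrab/iput handling elided):

	struct btrfs_delalloc_work *work, *next;
	LIST_HEAD(works);

	list_for_each_entry(binode, head, delalloc_inodes) {
		work = btrfs_alloc_delalloc_work(&binode->vfs_inode,
						 0 /* !wait */, delay_iput);
		if (!work)
			break;	/* the real code unwinds with -ENOMEM */
		list_add_tail(&work->list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers, &work->work);
	}
	/* each queued item signals work->completion when it finishes */
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}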
@@ -7512,7 +7617,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	unsigned long ptr;
 	struct btrfs_file_extent_item *ei;
 	struct extent_buffer *leaf;
-	unsigned long nr = 0;
 
 	name_len = strlen(symname) + 1;
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7610,13 +7714,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	if (!err)
 		d_instantiate(dentry, inode);
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -7679,6 +7782,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		em->len = ins.offset;
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 		em->generation = trans->transid;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8fcf9a59c28d..4b4516770f05 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
 #include "backref.h"
 #include "rcu-string.h"
 #include "send.h"
+#include "dev-replace.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 			BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
 	}
 
-	if (flags & BTRFS_INODE_NODATACOW)
+	if (flags & BTRFS_INODE_NODATACOW) {
 		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+		if (S_ISREG(inode->i_mode))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+	}
 
 	btrfs_update_iflags(inode);
 }
@@ -571,8 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 		ret = btrfs_commit_transaction(trans,
 					       root->fs_info->extent_root);
 	}
-	if (ret)
+	if (ret) {
+		/* cleanup_transaction has freed this for us */
+		if (trans->aborted)
+			pending_snapshot = NULL;
 		goto fail;
+	}
 
 	ret = pending_snapshot->error;
 	if (ret)
@@ -705,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
 	if (error)
 		goto out_dput;
 
+	/*
+	 * even if this name doesn't exist, we may get hash collisions.
+	 * check for them now when we can safely fail
+	 */
+	error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+					       dir->i_ino, name,
+					       namelen);
+	if (error)
+		goto out_dput;
+
 	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 
 	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
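The pre-check exists because btrfs keys directory items by a hash of the name, so two different names can collide on the same key; without the probe, the insert fails with -EOVERFLOW at a point where the transaction is much harder to unwind (the same reason btrfs_rename() gains an identical check earlier in this patch). Roughly how the colliding key is built, sketched with the crc32c-based btrfs_name_hash() from hash.h:

	struct btrfs_key key;

	key.objectid = dir_ino;				/* the directory */
	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
	key.offset = btrfs_name_hash(name, name_len);	/* crc32c of the name */
	/* a different name with the same hash -> -EOVERFLOW on insert */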
@@ -1225,7 +1243,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		}
 
 		defrag_count += ret;
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
 		mutex_unlock(&inode->i_mutex);
 
 		if (newer_than) {
@@ -1293,12 +1311,13 @@ out_ra:
 	return ret;
 }
 
-static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+static noinline int btrfs_ioctl_resize(struct file *file,
 					void __user *arg)
 {
 	u64 new_size;
 	u64 old_size;
 	u64 devid = 1;
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device = NULL;
@@ -1313,13 +1332,17 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1313 if (!capable(CAP_SYS_ADMIN)) 1332 if (!capable(CAP_SYS_ADMIN))
1314 return -EPERM; 1333 return -EPERM;
1315 1334
1316 mutex_lock(&root->fs_info->volume_mutex); 1335 ret = mnt_want_write_file(file);
1317 if (root->fs_info->balance_ctl) { 1336 if (ret)
1318 printk(KERN_INFO "btrfs: balance in progress\n"); 1337 return ret;
1319 ret = -EINVAL; 1338
1320 goto out; 1339 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1340 1)) {
1341 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
1342 return -EINPROGRESS;
1321 } 1343 }
1322 1344
1345 mutex_lock(&root->fs_info->volume_mutex);
1323 vol_args = memdup_user(arg, sizeof(*vol_args)); 1346 vol_args = memdup_user(arg, sizeof(*vol_args));
1324 if (IS_ERR(vol_args)) { 1347 if (IS_ERR(vol_args)) {
1325 ret = PTR_ERR(vol_args); 1348 ret = PTR_ERR(vol_args);
@@ -1339,7 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1339 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1362 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1340 (unsigned long long)devid); 1363 (unsigned long long)devid);
1341 } 1364 }
1342 device = btrfs_find_device(root, devid, NULL, NULL); 1365 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1343 if (!device) { 1366 if (!device) {
1344 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1367 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1345 (unsigned long long)devid); 1368 (unsigned long long)devid);
@@ -1371,6 +1394,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1371 } 1394 }
1372 } 1395 }
1373 1396
1397 if (device->is_tgtdev_for_dev_replace) {
1398 ret = -EINVAL;
1399 goto out_free;
1400 }
1401
1374 old_size = device->total_bytes; 1402 old_size = device->total_bytes;
1375 1403
1376 if (mod < 0) { 1404 if (mod < 0) {
@@ -1409,12 +1437,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1409 btrfs_commit_transaction(trans, root); 1437 btrfs_commit_transaction(trans, root);
1410 } else if (new_size < old_size) { 1438 } else if (new_size < old_size) {
1411 ret = btrfs_shrink_device(device, new_size); 1439 ret = btrfs_shrink_device(device, new_size);
1412 } 1440 } /* equal, nothing to do */
1413 1441
1414out_free: 1442out_free:
1415 kfree(vol_args); 1443 kfree(vol_args);
1416out: 1444out:
1417 mutex_unlock(&root->fs_info->volume_mutex); 1445 mutex_unlock(&root->fs_info->volume_mutex);
1446 mnt_drop_write_file(file);
1447 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
1418 return ret; 1448 return ret;
1419} 1449}
1420 1450
@@ -2156,9 +2186,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2156 if (btrfs_root_readonly(root)) 2186 if (btrfs_root_readonly(root))
2157 return -EROFS; 2187 return -EROFS;
2158 2188
2189 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2190 1)) {
2191 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2192 return -EINPROGRESS;
2193 }
2159 ret = mnt_want_write_file(file); 2194 ret = mnt_want_write_file(file);
2160 if (ret) 2195 if (ret) {
2196 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
2197 0);
2161 return ret; 2198 return ret;
2199 }
2162 2200
2163 switch (inode->i_mode & S_IFMT) { 2201 switch (inode->i_mode & S_IFMT) {
2164 case S_IFDIR: 2202 case S_IFDIR:
@@ -2210,6 +2248,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2210 } 2248 }
2211out: 2249out:
2212 mnt_drop_write_file(file); 2250 mnt_drop_write_file(file);
2251 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2213 return ret; 2252 return ret;
2214} 2253}
2215 2254
@@ -2221,13 +2260,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2221 if (!capable(CAP_SYS_ADMIN)) 2260 if (!capable(CAP_SYS_ADMIN))
2222 return -EPERM; 2261 return -EPERM;
2223 2262
2224 mutex_lock(&root->fs_info->volume_mutex); 2263 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2225 if (root->fs_info->balance_ctl) { 2264 1)) {
2226 printk(KERN_INFO "btrfs: balance in progress\n"); 2265 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2227 ret = -EINVAL; 2266 return -EINPROGRESS;
2228 goto out;
2229 } 2267 }
2230 2268
2269 mutex_lock(&root->fs_info->volume_mutex);
2231 vol_args = memdup_user(arg, sizeof(*vol_args)); 2270 vol_args = memdup_user(arg, sizeof(*vol_args));
2232 if (IS_ERR(vol_args)) { 2271 if (IS_ERR(vol_args)) {
2233 ret = PTR_ERR(vol_args); 2272 ret = PTR_ERR(vol_args);
@@ -2240,27 +2279,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2240 kfree(vol_args); 2279 kfree(vol_args);
2241out: 2280out:
2242 mutex_unlock(&root->fs_info->volume_mutex); 2281 mutex_unlock(&root->fs_info->volume_mutex);
2282 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2243 return ret; 2283 return ret;
2244} 2284}
2245 2285
2246static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) 2286static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2247{ 2287{
2288 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
2248 struct btrfs_ioctl_vol_args *vol_args; 2289 struct btrfs_ioctl_vol_args *vol_args;
2249 int ret; 2290 int ret;
2250 2291
2251 if (!capable(CAP_SYS_ADMIN)) 2292 if (!capable(CAP_SYS_ADMIN))
2252 return -EPERM; 2293 return -EPERM;
2253 2294
2254 if (root->fs_info->sb->s_flags & MS_RDONLY) 2295 ret = mnt_want_write_file(file);
2255 return -EROFS; 2296 if (ret)
2297 return ret;
2256 2298
2257 mutex_lock(&root->fs_info->volume_mutex); 2299 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2258 if (root->fs_info->balance_ctl) { 2300 1)) {
2259 printk(KERN_INFO "btrfs: balance in progress\n"); 2301 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2260 ret = -EINVAL; 2302 mnt_drop_write_file(file);
2261 goto out; 2303 return -EINPROGRESS;
2262 } 2304 }
2263 2305
2306 mutex_lock(&root->fs_info->volume_mutex);
2264 vol_args = memdup_user(arg, sizeof(*vol_args)); 2307 vol_args = memdup_user(arg, sizeof(*vol_args));
2265 if (IS_ERR(vol_args)) { 2308 if (IS_ERR(vol_args)) {
2266 ret = PTR_ERR(vol_args); 2309 ret = PTR_ERR(vol_args);
@@ -2273,6 +2316,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2273 kfree(vol_args); 2316 kfree(vol_args);
2274out: 2317out:
2275 mutex_unlock(&root->fs_info->volume_mutex); 2318 mutex_unlock(&root->fs_info->volume_mutex);
2319 mnt_drop_write_file(file);
2320 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2276 return ret; 2321 return ret;
2277} 2322}
2278 2323
@@ -2328,7 +2373,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2328 s_uuid = di_args->uuid; 2373 s_uuid = di_args->uuid;
2329 2374
2330 mutex_lock(&fs_devices->device_list_mutex); 2375 mutex_lock(&fs_devices->device_list_mutex);
2331 dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); 2376 dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
2332 mutex_unlock(&fs_devices->device_list_mutex); 2377 mutex_unlock(&fs_devices->device_list_mutex);
2333 2378
2334 if (!dev) { 2379 if (!dev) {
@@ -2821,12 +2866,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2821 struct btrfs_disk_key disk_key; 2866 struct btrfs_disk_key disk_key;
2822 u64 objectid = 0; 2867 u64 objectid = 0;
2823 u64 dir_id; 2868 u64 dir_id;
2869 int ret;
2824 2870
2825 if (!capable(CAP_SYS_ADMIN)) 2871 if (!capable(CAP_SYS_ADMIN))
2826 return -EPERM; 2872 return -EPERM;
2827 2873
2828 if (copy_from_user(&objectid, argp, sizeof(objectid))) 2874 ret = mnt_want_write_file(file);
2829 return -EFAULT; 2875 if (ret)
2876 return ret;
2877
2878 if (copy_from_user(&objectid, argp, sizeof(objectid))) {
2879 ret = -EFAULT;
2880 goto out;
2881 }
2830 2882
2831 if (!objectid) 2883 if (!objectid)
2832 objectid = root->root_key.objectid; 2884 objectid = root->root_key.objectid;
@@ -2836,21 +2888,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2836 location.offset = (u64)-1; 2888 location.offset = (u64)-1;
2837 2889
2838 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 2890 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
2839 if (IS_ERR(new_root)) 2891 if (IS_ERR(new_root)) {
2840 return PTR_ERR(new_root); 2892 ret = PTR_ERR(new_root);
2893 goto out;
2894 }
2841 2895
2842 if (btrfs_root_refs(&new_root->root_item) == 0) 2896 if (btrfs_root_refs(&new_root->root_item) == 0) {
2843 return -ENOENT; 2897 ret = -ENOENT;
2898 goto out;
2899 }
2844 2900
2845 path = btrfs_alloc_path(); 2901 path = btrfs_alloc_path();
2846 if (!path) 2902 if (!path) {
2847 return -ENOMEM; 2903 ret = -ENOMEM;
2904 goto out;
2905 }
2848 path->leave_spinning = 1; 2906 path->leave_spinning = 1;
2849 2907
2850 trans = btrfs_start_transaction(root, 1); 2908 trans = btrfs_start_transaction(root, 1);
2851 if (IS_ERR(trans)) { 2909 if (IS_ERR(trans)) {
2852 btrfs_free_path(path); 2910 btrfs_free_path(path);
2853 return PTR_ERR(trans); 2911 ret = PTR_ERR(trans);
2912 goto out;
2854 } 2913 }
2855 2914
2856 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 2915 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2861,7 +2920,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2861 btrfs_end_transaction(trans, root); 2920 btrfs_end_transaction(trans, root);
2862 printk(KERN_ERR "Umm, you don't have the default dir item, " 2921 printk(KERN_ERR "Umm, you don't have the default dir item, "
2863 "this isn't going to work\n"); 2922 "this isn't going to work\n");
2864 return -ENOENT; 2923 ret = -ENOENT;
2924 goto out;
2865 } 2925 }
2866 2926
2867 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 2927 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2871,8 +2931,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2871 2931
2872 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 2932 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2873 btrfs_end_transaction(trans, root); 2933 btrfs_end_transaction(trans, root);
2874 2934out:
2875 return 0; 2935 mnt_drop_write_file(file);
2936 return ret;
2876} 2937}
2877 2938
2878void btrfs_get_block_group_info(struct list_head *groups_list, 2939void btrfs_get_block_group_info(struct list_head *groups_list,
@@ -3036,32 +3097,38 @@ long btrfs_ioctl_trans_end(struct file *file)
3036 return 0; 3097 return 0;
3037} 3098}
3038 3099
3039static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 3100static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3101 void __user *argp)
3040{ 3102{
3041 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3042 struct btrfs_trans_handle *trans; 3103 struct btrfs_trans_handle *trans;
3043 u64 transid; 3104 u64 transid;
3044 int ret; 3105 int ret;
3045 3106
3046 trans = btrfs_start_transaction(root, 0); 3107 trans = btrfs_attach_transaction(root);
3047 if (IS_ERR(trans)) 3108 if (IS_ERR(trans)) {
3048 return PTR_ERR(trans); 3109 if (PTR_ERR(trans) != -ENOENT)
3110 return PTR_ERR(trans);
3111
3112 /* No running transaction, don't bother */
3113 transid = root->fs_info->last_trans_committed;
3114 goto out;
3115 }
3049 transid = trans->transid; 3116 transid = trans->transid;
3050 ret = btrfs_commit_transaction_async(trans, root, 0); 3117 ret = btrfs_commit_transaction_async(trans, root, 0);
3051 if (ret) { 3118 if (ret) {
3052 btrfs_end_transaction(trans, root); 3119 btrfs_end_transaction(trans, root);
3053 return ret; 3120 return ret;
3054 } 3121 }
3055 3122out:
3056 if (argp) 3123 if (argp)
3057 if (copy_to_user(argp, &transid, sizeof(transid))) 3124 if (copy_to_user(argp, &transid, sizeof(transid)))
3058 return -EFAULT; 3125 return -EFAULT;
3059 return 0; 3126 return 0;
3060} 3127}
3061 3128
3062static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 3129static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
3130 void __user *argp)
3063{ 3131{
3064 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3065 u64 transid; 3132 u64 transid;
3066 3133
3067 if (argp) { 3134 if (argp) {
@@ -3073,10 +3140,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
3073 return btrfs_wait_for_commit(root, transid); 3140 return btrfs_wait_for_commit(root, transid);
3074} 3141}
3075 3142
3076static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) 3143static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3077{ 3144{
3078 int ret; 3145 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3079 struct btrfs_ioctl_scrub_args *sa; 3146 struct btrfs_ioctl_scrub_args *sa;
3147 int ret;
3080 3148
3081 if (!capable(CAP_SYS_ADMIN)) 3149 if (!capable(CAP_SYS_ADMIN))
3082 return -EPERM; 3150 return -EPERM;
@@ -3085,12 +3153,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
3085 if (IS_ERR(sa)) 3153 if (IS_ERR(sa))
3086 return PTR_ERR(sa); 3154 return PTR_ERR(sa);
3087 3155
3088 ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, 3156 if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
3089 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); 3157 ret = mnt_want_write_file(file);
3158 if (ret)
3159 goto out;
3160 }
3161
3162 ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
3163 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
3164 0);
3090 3165
3091 if (copy_to_user(arg, sa, sizeof(*sa))) 3166 if (copy_to_user(arg, sa, sizeof(*sa)))
3092 ret = -EFAULT; 3167 ret = -EFAULT;
3093 3168
3169 if (!(sa->flags & BTRFS_SCRUB_READONLY))
3170 mnt_drop_write_file(file);
3171out:
3094 kfree(sa); 3172 kfree(sa);
3095 return ret; 3173 return ret;
3096} 3174}
@@ -3100,7 +3178,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
3100 if (!capable(CAP_SYS_ADMIN)) 3178 if (!capable(CAP_SYS_ADMIN))
3101 return -EPERM; 3179 return -EPERM;
3102 3180
3103 return btrfs_scrub_cancel(root); 3181 return btrfs_scrub_cancel(root->fs_info);
3104} 3182}
3105 3183
3106static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, 3184static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3149,6 +3227,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3149 return ret; 3227 return ret;
3150} 3228}
3151 3229
3230static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
3231{
3232 struct btrfs_ioctl_dev_replace_args *p;
3233 int ret;
3234
3235 if (!capable(CAP_SYS_ADMIN))
3236 return -EPERM;
3237
3238 p = memdup_user(arg, sizeof(*p));
3239 if (IS_ERR(p))
3240 return PTR_ERR(p);
3241
3242 switch (p->cmd) {
3243 case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
3244 if (atomic_xchg(
3245 &root->fs_info->mutually_exclusive_operation_running,
3246 1)) {
3247 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3248 ret = -EINPROGRESS;
3249 } else {
3250 ret = btrfs_dev_replace_start(root, p);
3251 atomic_set(
3252 &root->fs_info->mutually_exclusive_operation_running,
3253 0);
3254 }
3255 break;
3256 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
3257 btrfs_dev_replace_status(root->fs_info, p);
3258 ret = 0;
3259 break;
3260 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
3261 ret = btrfs_dev_replace_cancel(root->fs_info, p);
3262 break;
3263 default:
3264 ret = -EINVAL;
3265 break;
3266 }
3267
3268 if (copy_to_user(arg, p, sizeof(*p)))
3269 ret = -EFAULT;
3270
3271 kfree(p);
3272 return ret;
3273}
3274
3152static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3275static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3153{ 3276{
3154 int ret = 0; 3277 int ret = 0;
@@ -3315,6 +3438,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3315 struct btrfs_ioctl_balance_args *bargs; 3438 struct btrfs_ioctl_balance_args *bargs;
3316 struct btrfs_balance_control *bctl; 3439 struct btrfs_balance_control *bctl;
3317 int ret; 3440 int ret;
3441 int need_to_clear_lock = 0;
3318 3442
3319 if (!capable(CAP_SYS_ADMIN)) 3443 if (!capable(CAP_SYS_ADMIN))
3320 return -EPERM; 3444 return -EPERM;
@@ -3350,10 +3474,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3350 bargs = NULL; 3474 bargs = NULL;
3351 } 3475 }
3352 3476
3353 if (fs_info->balance_ctl) { 3477 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
3478 1)) {
3479 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3354 ret = -EINPROGRESS; 3480 ret = -EINPROGRESS;
3355 goto out_bargs; 3481 goto out_bargs;
3356 } 3482 }
3483 need_to_clear_lock = 1;
3357 3484
3358 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3485 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3359 if (!bctl) { 3486 if (!bctl) {
@@ -3387,6 +3514,9 @@ do_balance:
3387out_bargs: 3514out_bargs:
3388 kfree(bargs); 3515 kfree(bargs);
3389out: 3516out:
3517 if (need_to_clear_lock)
3518 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
3519 0);
3390 mutex_unlock(&fs_info->balance_mutex); 3520 mutex_unlock(&fs_info->balance_mutex);
3391 mutex_unlock(&fs_info->volume_mutex); 3521 mutex_unlock(&fs_info->volume_mutex);
3392 mnt_drop_write_file(file); 3522 mnt_drop_write_file(file);
@@ -3441,8 +3571,9 @@ out:
3441 return ret; 3571 return ret;
3442} 3572}
3443 3573
3444static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) 3574static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3445{ 3575{
3576 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3446 struct btrfs_ioctl_quota_ctl_args *sa; 3577 struct btrfs_ioctl_quota_ctl_args *sa;
3447 struct btrfs_trans_handle *trans = NULL; 3578 struct btrfs_trans_handle *trans = NULL;
3448 int ret; 3579 int ret;
@@ -3451,12 +3582,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3451 if (!capable(CAP_SYS_ADMIN)) 3582 if (!capable(CAP_SYS_ADMIN))
3452 return -EPERM; 3583 return -EPERM;
3453 3584
3454 if (root->fs_info->sb->s_flags & MS_RDONLY) 3585 ret = mnt_want_write_file(file);
3455 return -EROFS; 3586 if (ret)
3587 return ret;
3456 3588
3457 sa = memdup_user(arg, sizeof(*sa)); 3589 sa = memdup_user(arg, sizeof(*sa));
3458 if (IS_ERR(sa)) 3590 if (IS_ERR(sa)) {
3459 return PTR_ERR(sa); 3591 ret = PTR_ERR(sa);
3592 goto drop_write;
3593 }
3460 3594
3461 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { 3595 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3462 trans = btrfs_start_transaction(root, 2); 3596 trans = btrfs_start_transaction(root, 2);
@@ -3489,14 +3623,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3489 if (err && !ret) 3623 if (err && !ret)
3490 ret = err; 3624 ret = err;
3491 } 3625 }
3492
3493out: 3626out:
3494 kfree(sa); 3627 kfree(sa);
3628drop_write:
3629 mnt_drop_write_file(file);
3495 return ret; 3630 return ret;
3496} 3631}
3497 3632
3498static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) 3633static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
3499{ 3634{
3635 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3500 struct btrfs_ioctl_qgroup_assign_args *sa; 3636 struct btrfs_ioctl_qgroup_assign_args *sa;
3501 struct btrfs_trans_handle *trans; 3637 struct btrfs_trans_handle *trans;
3502 int ret; 3638 int ret;
@@ -3505,12 +3641,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3505 if (!capable(CAP_SYS_ADMIN)) 3641 if (!capable(CAP_SYS_ADMIN))
3506 return -EPERM; 3642 return -EPERM;
3507 3643
3508 if (root->fs_info->sb->s_flags & MS_RDONLY) 3644 ret = mnt_want_write_file(file);
3509 return -EROFS; 3645 if (ret)
3646 return ret;
3510 3647
3511 sa = memdup_user(arg, sizeof(*sa)); 3648 sa = memdup_user(arg, sizeof(*sa));
3512 if (IS_ERR(sa)) 3649 if (IS_ERR(sa)) {
3513 return PTR_ERR(sa); 3650 ret = PTR_ERR(sa);
3651 goto drop_write;
3652 }
3514 3653
3515 trans = btrfs_join_transaction(root); 3654 trans = btrfs_join_transaction(root);
3516 if (IS_ERR(trans)) { 3655 if (IS_ERR(trans)) {
@@ -3533,11 +3672,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3533 3672
3534out: 3673out:
3535 kfree(sa); 3674 kfree(sa);
3675drop_write:
3676 mnt_drop_write_file(file);
3536 return ret; 3677 return ret;
3537} 3678}
3538 3679
3539static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) 3680static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3540{ 3681{
3682 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3541 struct btrfs_ioctl_qgroup_create_args *sa; 3683 struct btrfs_ioctl_qgroup_create_args *sa;
3542 struct btrfs_trans_handle *trans; 3684 struct btrfs_trans_handle *trans;
3543 int ret; 3685 int ret;
@@ -3546,12 +3688,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3546 if (!capable(CAP_SYS_ADMIN)) 3688 if (!capable(CAP_SYS_ADMIN))
3547 return -EPERM; 3689 return -EPERM;
3548 3690
3549 if (root->fs_info->sb->s_flags & MS_RDONLY) 3691 ret = mnt_want_write_file(file);
3550 return -EROFS; 3692 if (ret)
3693 return ret;
3551 3694
3552 sa = memdup_user(arg, sizeof(*sa)); 3695 sa = memdup_user(arg, sizeof(*sa));
3553 if (IS_ERR(sa)) 3696 if (IS_ERR(sa)) {
3554 return PTR_ERR(sa); 3697 ret = PTR_ERR(sa);
3698 goto drop_write;
3699 }
3555 3700
3556 trans = btrfs_join_transaction(root); 3701 trans = btrfs_join_transaction(root);
3557 if (IS_ERR(trans)) { 3702 if (IS_ERR(trans)) {
@@ -3573,11 +3718,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3573 3718
3574out: 3719out:
3575 kfree(sa); 3720 kfree(sa);
3721drop_write:
3722 mnt_drop_write_file(file);
3576 return ret; 3723 return ret;
3577} 3724}
3578 3725
3579static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) 3726static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
3580{ 3727{
3728 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3581 struct btrfs_ioctl_qgroup_limit_args *sa; 3729 struct btrfs_ioctl_qgroup_limit_args *sa;
3582 struct btrfs_trans_handle *trans; 3730 struct btrfs_trans_handle *trans;
3583 int ret; 3731 int ret;
@@ -3587,12 +3735,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3587 if (!capable(CAP_SYS_ADMIN)) 3735 if (!capable(CAP_SYS_ADMIN))
3588 return -EPERM; 3736 return -EPERM;
3589 3737
3590 if (root->fs_info->sb->s_flags & MS_RDONLY) 3738 ret = mnt_want_write_file(file);
3591 return -EROFS; 3739 if (ret)
3740 return ret;
3592 3741
3593 sa = memdup_user(arg, sizeof(*sa)); 3742 sa = memdup_user(arg, sizeof(*sa));
3594 if (IS_ERR(sa)) 3743 if (IS_ERR(sa)) {
3595 return PTR_ERR(sa); 3744 ret = PTR_ERR(sa);
3745 goto drop_write;
3746 }
3596 3747
3597 trans = btrfs_join_transaction(root); 3748 trans = btrfs_join_transaction(root);
3598 if (IS_ERR(trans)) { 3749 if (IS_ERR(trans)) {
@@ -3615,6 +3766,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3615 3766
3616out: 3767out:
3617 kfree(sa); 3768 kfree(sa);
3769drop_write:
3770 mnt_drop_write_file(file);
3618 return ret; 3771 return ret;
3619} 3772}
3620 3773
@@ -3735,11 +3888,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3735 case BTRFS_IOC_DEFRAG_RANGE: 3888 case BTRFS_IOC_DEFRAG_RANGE:
3736 return btrfs_ioctl_defrag(file, argp); 3889 return btrfs_ioctl_defrag(file, argp);
3737 case BTRFS_IOC_RESIZE: 3890 case BTRFS_IOC_RESIZE:
3738 return btrfs_ioctl_resize(root, argp); 3891 return btrfs_ioctl_resize(file, argp);
3739 case BTRFS_IOC_ADD_DEV: 3892 case BTRFS_IOC_ADD_DEV:
3740 return btrfs_ioctl_add_dev(root, argp); 3893 return btrfs_ioctl_add_dev(root, argp);
3741 case BTRFS_IOC_RM_DEV: 3894 case BTRFS_IOC_RM_DEV:
3742 return btrfs_ioctl_rm_dev(root, argp); 3895 return btrfs_ioctl_rm_dev(file, argp);
3743 case BTRFS_IOC_FS_INFO: 3896 case BTRFS_IOC_FS_INFO:
3744 return btrfs_ioctl_fs_info(root, argp); 3897 return btrfs_ioctl_fs_info(root, argp);
3745 case BTRFS_IOC_DEV_INFO: 3898 case BTRFS_IOC_DEV_INFO:
@@ -3768,11 +3921,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3768 btrfs_sync_fs(file->f_dentry->d_sb, 1); 3921 btrfs_sync_fs(file->f_dentry->d_sb, 1);
3769 return 0; 3922 return 0;
3770 case BTRFS_IOC_START_SYNC: 3923 case BTRFS_IOC_START_SYNC:
3771 return btrfs_ioctl_start_sync(file, argp); 3924 return btrfs_ioctl_start_sync(root, argp);
3772 case BTRFS_IOC_WAIT_SYNC: 3925 case BTRFS_IOC_WAIT_SYNC:
3773 return btrfs_ioctl_wait_sync(file, argp); 3926 return btrfs_ioctl_wait_sync(root, argp);
3774 case BTRFS_IOC_SCRUB: 3927 case BTRFS_IOC_SCRUB:
3775 return btrfs_ioctl_scrub(root, argp); 3928 return btrfs_ioctl_scrub(file, argp);
3776 case BTRFS_IOC_SCRUB_CANCEL: 3929 case BTRFS_IOC_SCRUB_CANCEL:
3777 return btrfs_ioctl_scrub_cancel(root, argp); 3930 return btrfs_ioctl_scrub_cancel(root, argp);
3778 case BTRFS_IOC_SCRUB_PROGRESS: 3931 case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3790,13 +3943,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3790 case BTRFS_IOC_GET_DEV_STATS: 3943 case BTRFS_IOC_GET_DEV_STATS:
3791 return btrfs_ioctl_get_dev_stats(root, argp); 3944 return btrfs_ioctl_get_dev_stats(root, argp);
3792 case BTRFS_IOC_QUOTA_CTL: 3945 case BTRFS_IOC_QUOTA_CTL:
3793 return btrfs_ioctl_quota_ctl(root, argp); 3946 return btrfs_ioctl_quota_ctl(file, argp);
3794 case BTRFS_IOC_QGROUP_ASSIGN: 3947 case BTRFS_IOC_QGROUP_ASSIGN:
3795 return btrfs_ioctl_qgroup_assign(root, argp); 3948 return btrfs_ioctl_qgroup_assign(file, argp);
3796 case BTRFS_IOC_QGROUP_CREATE: 3949 case BTRFS_IOC_QGROUP_CREATE:
3797 return btrfs_ioctl_qgroup_create(root, argp); 3950 return btrfs_ioctl_qgroup_create(file, argp);
3798 case BTRFS_IOC_QGROUP_LIMIT: 3951 case BTRFS_IOC_QGROUP_LIMIT:
3799 return btrfs_ioctl_qgroup_limit(root, argp); 3952 return btrfs_ioctl_qgroup_limit(file, argp);
3953 case BTRFS_IOC_DEV_REPLACE:
3954 return btrfs_ioctl_dev_replace(root, argp);
3800 } 3955 }
3801 3956
3802 return -ENOTTY; 3957 return -ENOTTY;
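
The recurring change across these ioctl handlers replaces the old "is a balance running?" check with a single atomic flag, fs_info->mutually_exclusive_operation_running, claimed via atomic_xchg(): the exchange returns the previous value, so a non-zero result means another dev add/delete/balance/replace/resize already holds the slot and the caller backs off with -EINPROGRESS instead of blocking. Below is a minimal userspace sketch of the same try-lock idiom, using C11 atomics in place of the kernel's atomic_t; the function names are illustrative, not kernel API.

/*
 * Sketch of the atomic_xchg() try-lock pattern used by the patched
 * btrfs ioctls. atomic_exchange() returns the prior value, so seeing
 * 1 means another exclusive operation is running and we must not
 * proceed; 0 means we just claimed the flag.
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int mutually_exclusive_operation_running;

static int try_begin_exclusive_op(const char *name)
{
	if (atomic_exchange(&mutually_exclusive_operation_running, 1)) {
		fprintf(stderr, "%s: operation already in progress\n", name);
		return -EINPROGRESS;
	}
	return 0;
}

static void end_exclusive_op(void)
{
	/* mirrors the atomic_set(..., 0) in the out: paths above */
	atomic_store(&mutually_exclusive_operation_running, 0);
}

int main(void)
{
	if (try_begin_exclusive_op("resize") == 0) {
		/* ... the exclusive work (resize/balance/replace) runs here ... */
		end_exclusive_op();
	}
	/* a second caller inside the window above would get -EINPROGRESS */
	return 0;
}

Note the ordering subtlety visible in the hunks: btrfs_ioctl_rm_dev() takes mnt_want_write_file() before claiming the flag and must drop it again on the -EINPROGRESS path, while btrfs_ioctl_defrag() claims the flag first and must clear it if mnt_want_write_file() fails.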
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 731e2875ab93..dabca9cc8c2e 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) 37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
123 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; 125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
124}; 126};
125 127
126#define BTRFS_DEVICE_PATH_NAME_MAX 1024 128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
127struct btrfs_ioctl_dev_info_args { 170struct btrfs_ioctl_dev_info_args {
128 __u64 devid; /* in/out */ 171 __u64 devid; /* in/out */
129 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ 172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
453 struct btrfs_ioctl_qgroup_limit_args) 496 struct btrfs_ioctl_qgroup_limit_args)
454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
455 struct btrfs_ioctl_get_dev_stats) 498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
456#endif 502#endif
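
For illustration, a hedged userspace sketch of driving the new BTRFS_IOC_DEV_REPLACE ioctl to query replace progress. The structures and command values are copied from the hunk above; BTRFS_IOCTL_MAGIC (0x94) comes from the surrounding header, which this diff does not show, so treat that as an assumption if building against a different tree.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94		/* assumed from the full ioctl header */
#define BTRFS_DEVICE_PATH_NAME_MAX 1024
#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1

/* Layouts as added by this patch. */
struct btrfs_ioctl_dev_replace_start_params {
	__u64 srcdevid;
	__u64 cont_reading_from_srcdev_mode;
	__u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];
	__u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];
};

struct btrfs_ioctl_dev_replace_status_params {
	__u64 replace_state;
	__u64 progress_1000;
	__u64 time_started;
	__u64 time_stopped;
	__u64 num_write_errors;
	__u64 num_uncorrectable_read_errors;
};

struct btrfs_ioctl_dev_replace_args {
	__u64 cmd;	/* in */
	__u64 result;	/* out */
	union {
		struct btrfs_ioctl_dev_replace_start_params start;
		struct btrfs_ioctl_dev_replace_status_params status;
	};	/* in/out */
	__u64 spare[64];
};

#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
				    struct btrfs_ioctl_dev_replace_args)

int main(int argc, char **argv)
{
	struct btrfs_ioctl_dev_replace_args args;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <btrfs-mount-point>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS;
	if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) < 0) {
		perror("BTRFS_IOC_DEV_REPLACE");
		close(fd);
		return 1;
	}
	printf("replace_state=%llu progress=%llu/1000\n",
	       (unsigned long long)args.status.replace_state,
	       (unsigned long long)args.status.progress_1000);
	close(fd);
	return 0;
}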
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 000000000000..b7816cefbd13
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
1
2/*
3 * Copyright (C) 2012 Fujitsu. All rights reserved.
4 * Written by Miao Xie <miaox@cn.fujitsu.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
19 */
20
21#ifndef __BTRFS_MATH_H
22#define __BTRFS_MATH_H
23
24#include <asm/div64.h>
25
26static inline u64 div_factor(u64 num, int factor)
27{
28 if (factor == 10)
29 return num;
30 num *= factor;
31 do_div(num, 10);
32 return num;
33}
34
35static inline u64 div_factor_fine(u64 num, int factor)
36{
37 if (factor == 100)
38 return num;
39 num *= factor;
40 do_div(num, 100);
41 return num;
42}
43
44#endif
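
The two new helpers compute num * factor / 10 (coarse, factor in tenths) and num * factor / 100 (fine, factor in percent), using do_div(), the kernel's in-place 64-bit division helper for 32-bit targets. A direct userspace translation, with plain 64-bit division standing in for do_div():

#include <stdint.h>
#include <stdio.h>

/* num scaled by factor tenths, e.g. factor 9 -> 90% */
static uint64_t div_factor(uint64_t num, int factor)
{
	if (factor == 10)
		return num;
	return num * factor / 10;
}

/* num scaled by factor percent, e.g. factor 95 -> 95% */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
	if (factor == 100)
		return num;
	return num * factor / 100;
}

int main(void)
{
	/* e.g. thresholds on a 10 GiB chunk, as allocator-style callers use them */
	printf("%llu\n", (unsigned long long)div_factor(10737418240ULL, 9));
	printf("%llu\n", (unsigned long long)div_factor_fine(10737418240ULL, 95));
	return 0;
}

The factor == 10 / factor == 100 early returns skip the multiply-divide entirely in the common "100%" case and also avoid overflowing num * factor when num is already near the top of the u64 range.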
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7772f02ba28e..f10731297040 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
211 init_waitqueue_head(&entry->wait); 211 init_waitqueue_head(&entry->wait);
212 INIT_LIST_HEAD(&entry->list); 212 INIT_LIST_HEAD(&entry->list);
213 INIT_LIST_HEAD(&entry->root_extent_list); 213 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion);
214 216
215 trace_btrfs_ordered_extent_add(inode, entry); 217 trace_btrfs_ordered_extent_add(inode, entry);
216 218
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
464 wake_up(&entry->wait); 466 wake_up(&entry->wait);
465} 467}
466 468
469static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
470{
471 struct btrfs_ordered_extent *ordered;
472
473 ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
474 btrfs_start_ordered_extent(ordered->inode, ordered, 1);
475 complete(&ordered->completion);
476}
477
467/* 478/*
468 * wait for all the ordered extents in a root. This is done when balancing 479 * wait for all the ordered extents in a root. This is done when balancing
469 * space between drives. 480 * space between drives.
470 */ 481 */
471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 482void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
472{ 483{
473 struct list_head splice; 484 struct list_head splice, works;
474 struct list_head *cur; 485 struct list_head *cur;
475 struct btrfs_ordered_extent *ordered; 486 struct btrfs_ordered_extent *ordered, *next;
476 struct inode *inode; 487 struct inode *inode;
477 488
478 INIT_LIST_HEAD(&splice); 489 INIT_LIST_HEAD(&splice);
490 INIT_LIST_HEAD(&works);
479 491
480 spin_lock(&root->fs_info->ordered_extent_lock); 492 spin_lock(&root->fs_info->ordered_extent_lock);
481 list_splice_init(&root->fs_info->ordered_extents, &splice); 493 list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
494 spin_unlock(&root->fs_info->ordered_extent_lock); 506 spin_unlock(&root->fs_info->ordered_extent_lock);
495 507
496 if (inode) { 508 if (inode) {
497 btrfs_start_ordered_extent(inode, ordered, 1); 509 ordered->flush_work.func = btrfs_run_ordered_extent_work;
498 btrfs_put_ordered_extent(ordered); 510 list_add_tail(&ordered->work_list, &works);
499 if (delay_iput) 511 btrfs_queue_worker(&root->fs_info->flush_workers,
500 btrfs_add_delayed_iput(inode); 512 &ordered->flush_work);
501 else
502 iput(inode);
503 } else { 513 } else {
504 btrfs_put_ordered_extent(ordered); 514 btrfs_put_ordered_extent(ordered);
505 } 515 }
506 516
517 cond_resched();
507 spin_lock(&root->fs_info->ordered_extent_lock); 518 spin_lock(&root->fs_info->ordered_extent_lock);
508 } 519 }
509 spin_unlock(&root->fs_info->ordered_extent_lock); 520 spin_unlock(&root->fs_info->ordered_extent_lock);
521
522 list_for_each_entry_safe(ordered, next, &works, work_list) {
523 list_del_init(&ordered->work_list);
524 wait_for_completion(&ordered->completion);
525
526 inode = ordered->inode;
527 btrfs_put_ordered_extent(ordered);
528 if (delay_iput)
529 btrfs_add_delayed_iput(inode);
530 else
531 iput(inode);
532
533 cond_resched();
534 }
510} 535}
511 536
512/* 537/*
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
519 * extra check to make sure the ordered operation list really is empty 544 * extra check to make sure the ordered operation list really is empty
520 * before we return 545 * before we return
521 */ 546 */
522void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
523{ 548{
524 struct btrfs_inode *btrfs_inode; 549 struct btrfs_inode *btrfs_inode;
525 struct inode *inode; 550 struct inode *inode;
526 struct list_head splice; 551 struct list_head splice;
552 struct list_head works;
553 struct btrfs_delalloc_work *work, *next;
554 int ret = 0;
527 555
528 INIT_LIST_HEAD(&splice); 556 INIT_LIST_HEAD(&splice);
557 INIT_LIST_HEAD(&works);
529 558
530 mutex_lock(&root->fs_info->ordered_operations_mutex); 559 mutex_lock(&root->fs_info->ordered_operations_mutex);
531 spin_lock(&root->fs_info->ordered_extent_lock); 560 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -533,6 +562,7 @@ again:
533 list_splice_init(&root->fs_info->ordered_operations, &splice); 562 list_splice_init(&root->fs_info->ordered_operations, &splice);
534 563
535 while (!list_empty(&splice)) { 564 while (!list_empty(&splice)) {
565
536 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 566 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
537 ordered_operations); 567 ordered_operations);
538 568
@@ -549,15 +579,26 @@ again:
549 list_add_tail(&BTRFS_I(inode)->ordered_operations, 579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
550 &root->fs_info->ordered_operations); 580 &root->fs_info->ordered_operations);
551 } 581 }
582
583 if (!inode)
584 continue;
552 spin_unlock(&root->fs_info->ordered_extent_lock); 585 spin_unlock(&root->fs_info->ordered_extent_lock);
553 586
554 if (inode) { 587 work = btrfs_alloc_delalloc_work(inode, wait, 1);
555 if (wait) 588 if (!work) {
556 btrfs_wait_ordered_range(inode, 0, (u64)-1); 589 if (list_empty(&BTRFS_I(inode)->ordered_operations))
557 else 590 list_add_tail(&btrfs_inode->ordered_operations,
558 filemap_flush(inode->i_mapping); 591 &splice);
559 btrfs_add_delayed_iput(inode); 592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM;
597 goto out;
560 } 598 }
599 list_add_tail(&work->list, &works);
600 btrfs_queue_worker(&root->fs_info->flush_workers,
601 &work->work);
561 602
562 cond_resched(); 603 cond_resched();
563 spin_lock(&root->fs_info->ordered_extent_lock); 604 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -566,7 +607,13 @@ again:
566 goto again; 607 goto again;
567 608
568 spin_unlock(&root->fs_info->ordered_extent_lock); 609 spin_unlock(&root->fs_info->ordered_extent_lock);
610out:
611 list_for_each_entry_safe(work, next, &works, list) {
612 list_del_init(&work->list);
613 btrfs_wait_and_free_delalloc_work(work);
614 }
569 mutex_unlock(&root->fs_info->ordered_operations_mutex); 615 mutex_unlock(&root->fs_info->ordered_operations_mutex);
616 return ret;
570} 617}
571 618
572/* 619/*
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
606 u64 end; 653 u64 end;
607 u64 orig_end; 654 u64 orig_end;
608 struct btrfs_ordered_extent *ordered; 655 struct btrfs_ordered_extent *ordered;
609 int found;
610 656
611 if (start + len < start) { 657 if (start + len < start) {
612 orig_end = INT_LIMIT(loff_t); 658 orig_end = INT_LIMIT(loff_t);
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
642 filemap_fdatawait_range(inode->i_mapping, start, orig_end); 688 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
643 689
644 end = orig_end; 690 end = orig_end;
645 found = 0;
646 while (1) { 691 while (1) {
647 ordered = btrfs_lookup_first_ordered_extent(inode, end); 692 ordered = btrfs_lookup_first_ordered_extent(inode, end);
648 if (!ordered) 693 if (!ordered)
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
655 btrfs_put_ordered_extent(ordered); 700 btrfs_put_ordered_extent(ordered);
656 break; 701 break;
657 } 702 }
658 found++;
659 btrfs_start_ordered_extent(inode, ordered, 1); 703 btrfs_start_ordered_extent(inode, ordered, 1);
660 end = ordered->file_offset; 704 end = ordered->file_offset;
661 btrfs_put_ordered_extent(ordered); 705 btrfs_put_ordered_extent(ordered);
@@ -934,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
934 if (last_mod < root->fs_info->last_trans_committed) 978 if (last_mod < root->fs_info->last_trans_committed)
935 return; 979 return;
936 980
937 /*
938 * the transaction is already committing. Just start the IO and
939 * don't bother with all of this list nonsense
940 */
941 if (trans && root->fs_info->running_transaction->blocked) {
942 btrfs_wait_ordered_range(inode, 0, (u64)-1);
943 return;
944 }
945
946 spin_lock(&root->fs_info->ordered_extent_lock); 981 spin_lock(&root->fs_info->ordered_extent_lock);
947 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 982 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
948 list_add_tail(&BTRFS_I(inode)->ordered_operations, 983 list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -959,6 +994,7 @@ int __init ordered_data_init(void)
959 NULL); 994 NULL);
960 if (!btrfs_ordered_extent_cache) 995 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM; 996 return -ENOMEM;
997
962 return 0; 998 return 0;
963} 999}
964 1000
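
btrfs_wait_ordered_extents() previously flushed each ordered extent synchronously inside the loop; the rewrite above fans the flushes out to the flush_workers pool, collects the queued entries on a local works list, and only then waits on each one's completion, so the per-extent waits overlap instead of serializing. A userspace sketch of that fan-out/join shape, with pthreads standing in for btrfs_queue_worker() and a thread join for wait_for_completion() (names illustrative):

#include <pthread.h>
#include <stdio.h>

#define NR_EXTENTS 4

struct ordered_extent {
	int id;
	pthread_t worker;	/* stands in for flush_work + completion */
};

static void *flush_one(void *arg)
{
	struct ordered_extent *oe = arg;
	/* the btrfs_start_ordered_extent() equivalent would run here */
	printf("flushed ordered extent %d\n", oe->id);
	return NULL;
}

int main(void)
{
	struct ordered_extent extents[NR_EXTENTS];
	int i;

	/* first pass: start every flush without waiting (fan-out) */
	for (i = 0; i < NR_EXTENTS; i++) {
		extents[i].id = i;
		pthread_create(&extents[i].worker, NULL, flush_one,
			       &extents[i]);
	}
	/* second pass: wait for each, then release its resources (join) */
	for (i = 0; i < NR_EXTENTS; i++)
		pthread_join(extents[i].worker, NULL);
	return 0;
}

The same split explains why the iput()/btrfs_add_delayed_iput() calls move out of the scan loop: the inode reference must stay alive until the queued flush has completed, so it is dropped in the second pass, after wait_for_completion().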
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index dd27a0b46a37..f29d4bf5fbe7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -76,7 +76,7 @@ struct btrfs_ordered_sum {
76 76
77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ 77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
78 78
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent 79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating 80 * has done its due diligence in updating
81 * the isize. */ 81 * the isize. */
82 82
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
128 struct list_head root_extent_list; 128 struct list_head root_extent_list;
129 129
130 struct btrfs_work work; 130 struct btrfs_work work;
131};
132 131
132 struct completion completion;
133 struct btrfs_work flush_work;
134 struct list_head work_list;
135};
133 136
134/* 137/*
135 * calculates the total size you need to allocate for an ordered sum 138 * calculates the total size you need to allocate for an ordered sum
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
186int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
187 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
188int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
189void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
191 struct btrfs_root *root, 194 struct btrfs_root *root,
192 struct inode *inode); 195 struct inode *inode);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5e23684887eb..50d95fd190a5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
297 case BTRFS_DEV_STATS_KEY: 297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 298 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 299 break;
300 case BTRFS_DEV_REPLACE_KEY:
301 printk(KERN_INFO "\t\tdev replace\n");
302 break;
300 }; 303 };
301 } 304 }
302} 305}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a955669519a2..96b93daa0bbb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
27#include "volumes.h" 27#include "volumes.h"
28#include "disk-io.h" 28#include "disk-io.h"
29#include "transaction.h" 29#include "transaction.h"
30#include "dev-replace.h"
30 31
31#undef DEBUG 32#undef DEBUG
32 33
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
323 struct reada_extent *re = NULL; 324 struct reada_extent *re = NULL;
324 struct reada_extent *re_exist = NULL; 325 struct reada_extent *re_exist = NULL;
325 struct btrfs_fs_info *fs_info = root->fs_info; 326 struct btrfs_fs_info *fs_info = root->fs_info;
326 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
327 struct btrfs_bio *bbio = NULL; 327 struct btrfs_bio *bbio = NULL;
328 struct btrfs_device *dev; 328 struct btrfs_device *dev;
329 struct btrfs_device *prev_dev; 329 struct btrfs_device *prev_dev;
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
332 int nzones = 0; 332 int nzones = 0;
333 int i; 333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 334 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing;
335 336
336 spin_lock(&fs_info->reada_lock); 337 spin_lock(&fs_info->reada_lock);
337 re = radix_tree_lookup(&fs_info->reada_tree, index); 338 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
358 * map block 359 * map block
359 */ 360 */
360 length = blocksize; 361 length = blocksize;
361 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); 362 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
363 &bbio, 0);
362 if (ret || !bbio || length < blocksize) 364 if (ret || !bbio || length < blocksize)
363 goto error; 365 goto error;
364 366
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
393 } 395 }
394 396
395 /* insert extent in reada_tree + all per-device trees, all or nothing */ 397 /* insert extent in reada_tree + all per-device trees, all or nothing */
398 btrfs_dev_replace_lock(&fs_info->dev_replace);
396 spin_lock(&fs_info->reada_lock); 399 spin_lock(&fs_info->reada_lock);
397 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
398 if (ret == -EEXIST) { 401 if (ret == -EEXIST) {
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
400 BUG_ON(!re_exist); 403 BUG_ON(!re_exist);
401 re_exist->refcnt++; 404 re_exist->refcnt++;
402 spin_unlock(&fs_info->reada_lock); 405 spin_unlock(&fs_info->reada_lock);
406 btrfs_dev_replace_unlock(&fs_info->dev_replace);
403 goto error; 407 goto error;
404 } 408 }
405 if (ret) { 409 if (ret) {
406 spin_unlock(&fs_info->reada_lock); 410 spin_unlock(&fs_info->reada_lock);
411 btrfs_dev_replace_unlock(&fs_info->dev_replace);
407 goto error; 412 goto error;
408 } 413 }
409 prev_dev = NULL; 414 prev_dev = NULL;
415 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
416 &fs_info->dev_replace);
410 for (i = 0; i < nzones; ++i) { 417 for (i = 0; i < nzones; ++i) {
411 dev = bbio->stripes[i].dev; 418 dev = bbio->stripes[i].dev;
412 if (dev == prev_dev) { 419 if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
419 */ 426 */
420 continue; 427 continue;
421 } 428 }
429 if (!dev->bdev) {
430 /* cannot read ahead on missing device */
431 continue;
432 }
433 if (dev_replace_is_ongoing &&
434 dev == fs_info->dev_replace.tgtdev) {
435 /*
436 * as this device is selected for reading only as
437 * a last resort, skip it for read ahead.
438 */
439 continue;
440 }
422 prev_dev = dev; 441 prev_dev = dev;
423 ret = radix_tree_insert(&dev->reada_extents, index, re); 442 ret = radix_tree_insert(&dev->reada_extents, index, re);
424 if (ret) { 443 if (ret) {
425 while (--i >= 0) { 444 while (--i >= 0) {
426 dev = bbio->stripes[i].dev; 445 dev = bbio->stripes[i].dev;
427 BUG_ON(dev == NULL); 446 BUG_ON(dev == NULL);
447 /* ignore whether the entry was inserted */
428 radix_tree_delete(&dev->reada_extents, index); 448 radix_tree_delete(&dev->reada_extents, index);
429 } 449 }
430 BUG_ON(fs_info == NULL); 450 BUG_ON(fs_info == NULL);
431 radix_tree_delete(&fs_info->reada_tree, index); 451 radix_tree_delete(&fs_info->reada_tree, index);
432 spin_unlock(&fs_info->reada_lock); 452 spin_unlock(&fs_info->reada_lock);
453 btrfs_dev_replace_unlock(&fs_info->dev_replace);
433 goto error; 454 goto error;
434 } 455 }
435 } 456 }
436 spin_unlock(&fs_info->reada_lock); 457 spin_unlock(&fs_info->reada_lock);
458 btrfs_dev_replace_unlock(&fs_info->dev_replace);
437 459
438 kfree(bbio); 460 kfree(bbio);
439 return re; 461 return re;
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
915 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
916 free_extent_buffer(node); 938 free_extent_buffer(node);
917 939
918 reada_add_block(rc, start, &max_key, level, generation); 940 if (reada_add_block(rc, start, &max_key, level, generation)) {
941 kfree(rc);
942 return ERR_PTR(-ENOMEM);
943 }
919 944
920 reada_start_machine(root->fs_info); 945 reada_start_machine(root->fs_info);
921 946
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 776f0aa128fc..300e09ac3659 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2025 struct btrfs_root_item *root_item; 2025 struct btrfs_root_item *root_item;
2026 struct btrfs_path *path; 2026 struct btrfs_path *path;
2027 struct extent_buffer *leaf; 2027 struct extent_buffer *leaf;
2028 unsigned long nr;
2029 int level; 2028 int level;
2030 int max_level; 2029 int max_level;
2031 int replaced = 0; 2030 int replaced = 0;
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2074 BUG_ON(IS_ERR(trans)); 2073 BUG_ON(IS_ERR(trans));
2075 trans->block_rsv = rc->block_rsv; 2074 trans->block_rsv = rc->block_rsv;
2076 2075
2077 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); 2076 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2077 BTRFS_RESERVE_FLUSH_ALL);
2078 if (ret) { 2078 if (ret) {
2079 BUG_ON(ret != -EAGAIN); 2079 BUG_ON(ret != -EAGAIN);
2080 ret = btrfs_commit_transaction(trans, root); 2080 ret = btrfs_commit_transaction(trans, root);
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2125 path->slots[level]); 2125 path->slots[level]);
2126 root_item->drop_level = level; 2126 root_item->drop_level = level;
2127 2127
2128 nr = trans->blocks_used;
2129 btrfs_end_transaction_throttle(trans, root); 2128 btrfs_end_transaction_throttle(trans, root);
2130 2129
2131 btrfs_btree_balance_dirty(root, nr); 2130 btrfs_btree_balance_dirty(root);
2132 2131
2133 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2132 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2134 invalidate_extent_cache(root, &key, &next_key); 2133 invalidate_extent_cache(root, &key, &next_key);
@@ -2155,10 +2154,9 @@ out:
2155 btrfs_update_reloc_root(trans, root); 2154 btrfs_update_reloc_root(trans, root);
2156 } 2155 }
2157 2156
2158 nr = trans->blocks_used;
2159 btrfs_end_transaction_throttle(trans, root); 2157 btrfs_end_transaction_throttle(trans, root);
2160 2158
2161 btrfs_btree_balance_dirty(root, nr); 2159 btrfs_btree_balance_dirty(root);
2162 2160
2163 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2161 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2164 invalidate_extent_cache(root, &key, &next_key); 2162 invalidate_extent_cache(root, &key, &next_key);
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2184again: 2182again:
2185 if (!err) { 2183 if (!err) {
2186 num_bytes = rc->merging_rsv_size; 2184 num_bytes = rc->merging_rsv_size;
2187 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2185 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2186 BTRFS_RESERVE_FLUSH_ALL);
2188 if (ret) 2187 if (ret)
2189 err = ret; 2188 err = ret;
2190 } 2189 }
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2459 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2458 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2460 2459
2461 trans->block_rsv = rc->block_rsv; 2460 trans->block_rsv = rc->block_rsv;
2462 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2461 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2462 BTRFS_RESERVE_FLUSH_ALL);
2463 if (ret) { 2463 if (ret) {
2464 if (ret == -EAGAIN) 2464 if (ret == -EAGAIN)
2465 rc->commit_transaction = 1; 2465 rc->commit_transaction = 1;
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3259 struct btrfs_path *path; 3259 struct btrfs_path *path;
3260 struct btrfs_root *root = fs_info->tree_root; 3260 struct btrfs_root *root = fs_info->tree_root;
3261 struct btrfs_trans_handle *trans; 3261 struct btrfs_trans_handle *trans;
3262 unsigned long nr;
3263 int ret = 0; 3262 int ret = 0;
3264 3263
3265 if (inode) 3264 if (inode)
@@ -3293,9 +3292,8 @@ truncate:
3293 ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3292 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3294 3293
3295 btrfs_free_path(path); 3294 btrfs_free_path(path);
3296 nr = trans->blocks_used;
3297 btrfs_end_transaction(trans, root); 3295 btrfs_end_transaction(trans, root);
3298 btrfs_btree_balance_dirty(root, nr); 3296 btrfs_btree_balance_dirty(root);
3299out: 3297out:
3300 iput(inode); 3298 iput(inode);
3301 return ret; 3299 return ret;
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3685 * is no reservation in transaction handle. 3683 * is no reservation in transaction handle.
3686 */ 3684 */
3687 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, 3685 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3688 rc->extent_root->nodesize * 256); 3686 rc->extent_root->nodesize * 256,
3687 BTRFS_RESERVE_FLUSH_ALL);
3689 if (ret) 3688 if (ret)
3690 return ret; 3689 return ret;
3691 3690
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3711 struct btrfs_trans_handle *trans = NULL; 3710 struct btrfs_trans_handle *trans = NULL;
3712 struct btrfs_path *path; 3711 struct btrfs_path *path;
3713 struct btrfs_extent_item *ei; 3712 struct btrfs_extent_item *ei;
3714 unsigned long nr;
3715 u64 flags; 3713 u64 flags;
3716 u32 item_size; 3714 u32 item_size;
3717 int ret; 3715 int ret;
@@ -3828,9 +3826,8 @@ restart:
3828 ret = btrfs_commit_transaction(trans, rc->extent_root); 3826 ret = btrfs_commit_transaction(trans, rc->extent_root);
3829 BUG_ON(ret); 3827 BUG_ON(ret);
3830 } else { 3828 } else {
3831 nr = trans->blocks_used;
3832 btrfs_end_transaction_throttle(trans, rc->extent_root); 3829 btrfs_end_transaction_throttle(trans, rc->extent_root);
3833 btrfs_btree_balance_dirty(rc->extent_root, nr); 3830 btrfs_btree_balance_dirty(rc->extent_root);
3834 } 3831 }
3835 trans = NULL; 3832 trans = NULL;
3836 3833
@@ -3860,9 +3857,8 @@ restart:
3860 GFP_NOFS); 3857 GFP_NOFS);
3861 3858
3862 if (trans) { 3859 if (trans) {
3863 nr = trans->blocks_used;
3864 btrfs_end_transaction_throttle(trans, rc->extent_root); 3860 btrfs_end_transaction_throttle(trans, rc->extent_root);
3865 btrfs_btree_balance_dirty(rc->extent_root, nr); 3861 btrfs_btree_balance_dirty(rc->extent_root);
3866 } 3862 }
3867 3863
3868 if (!err) { 3864 if (!err) {
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3941 struct btrfs_trans_handle *trans; 3937 struct btrfs_trans_handle *trans;
3942 struct btrfs_root *root; 3938 struct btrfs_root *root;
3943 struct btrfs_key key; 3939 struct btrfs_key key;
3944 unsigned long nr;
3945 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 3940 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3946 int err = 0; 3941 int err = 0;
3947 3942
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3969 3964
3970 err = btrfs_orphan_add(trans, inode); 3965 err = btrfs_orphan_add(trans, inode);
3971out: 3966out:
3972 nr = trans->blocks_used;
3973 btrfs_end_transaction(trans, root); 3967 btrfs_end_transaction(trans, root);
3974 btrfs_btree_balance_dirty(root, nr); 3968 btrfs_btree_balance_dirty(root);
3975 if (err) { 3969 if (err) {
3976 if (inode) 3970 if (inode)
3977 iput(inode); 3971 iput(inode);
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->key.objectid, 4051 (unsigned long long)rc->block_group->key.objectid,
4058 (unsigned long long)rc->block_group->flags); 4052 (unsigned long long)rc->block_group->flags);
4059 4053
4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4054 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4055 if (ret < 0) {
4056 err = ret;
4057 goto out;
4058 }
4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4059 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4062 4060
4063 while (1) { 4061 while (1) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index eb923d087da7..668af537a3ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
548 struct btrfs_root_item *item = &root->root_item; 548 struct btrfs_root_item *item = &root->root_item;
549 struct timespec ct = CURRENT_TIME; 549 struct timespec ct = CURRENT_TIME;
550 550
551 spin_lock(&root->root_times_lock); 551 spin_lock(&root->root_item_lock);
552 item->ctransid = cpu_to_le64(trans->transid); 552 item->ctransid = cpu_to_le64(trans->transid);
553 item->ctime.sec = cpu_to_le64(ct.tv_sec); 553 item->ctime.sec = cpu_to_le64(ct.tv_sec);
554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec); 554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
555 spin_unlock(&root->root_times_lock); 555 spin_unlock(&root->root_item_lock);
556} 556}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 27892f67e69b..bdbb94f245c9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 STRATO. All rights reserved.
+ * Copyright (C) 2011, 2012 STRATO. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "dev-replace.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
 
@@ -42,10 +43,23 @@
  */
 
 struct scrub_block;
-struct scrub_dev;
+struct scrub_ctx;
 
-#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
-#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
+/*
+ * the following three values only influence the performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
+
+/*
+ * the following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ * Values larger than BTRFS_STRIPE_LEN are not supported.
+ */
 #define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
 
 struct scrub_page {
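
With 4 KiB pages the new constants work out to 32 x 4 KiB = 128 KiB per bio and 64 x 128 KiB = 8 MiB of outstanding I/O per device, matching the comments in the hunk. A hypothetical compile-time restatement of that arithmetic (not in the patch):

	/* Assumes PAGE_SIZE == 4096; documents the sizing arithmetic only. */
	#define SCRUB_BYTES_PER_RD_BIO	(SCRUB_PAGES_PER_RD_BIO * 4096)			/* 128 KiB */
	#define SCRUB_BYTES_IN_FLIGHT	(SCRUB_BIOS_PER_SCTX * SCRUB_BYTES_PER_RD_BIO)	/* 8 MiB */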
@@ -56,6 +70,8 @@ struct scrub_page {
 	u64			generation;
 	u64			logical;
 	u64			physical;
+	u64			physical_for_dev_replace;
+	atomic_t		ref_count;
 	struct {
 		unsigned int	mirror_num:8;
 		unsigned int	have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
 
 struct scrub_bio {
 	int			index;
-	struct scrub_dev	*sdev;
+	struct scrub_ctx	*sctx;
+	struct btrfs_device	*dev;
 	struct bio		*bio;
 	int			err;
 	u64			logical;
 	u64			physical;
-	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO];
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
 	int			page_count;
 	int			next_free;
 	struct btrfs_work	work;
 };
 
 struct scrub_block {
-	struct scrub_page	pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 	int			page_count;
 	atomic_t		outstanding_pages;
 	atomic_t		ref_count; /* free mem on transition to zero */
-	struct scrub_dev	*sdev;
+	struct scrub_ctx	*sctx;
 	struct {
 		unsigned int	header_error:1;
 		unsigned int	checksum_error:1;
@@ -91,23 +112,35 @@ struct scrub_block {
 	};
 };
 
-struct scrub_dev {
-	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
-	struct btrfs_device	*dev;
+struct scrub_wr_ctx {
+	struct scrub_bio	*wr_curr_bio;
+	struct btrfs_device	*tgtdev;
+	int			pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+	atomic_t		flush_all_writes;
+	struct mutex		wr_lock;
+};
+
+struct scrub_ctx {
+	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
+	struct btrfs_root	*dev_root;
 	int			first_free;
 	int			curr;
-	atomic_t		in_flight;
-	atomic_t		fixup_cnt;
+	atomic_t		bios_in_flight;
+	atomic_t		workers_pending;
 	spinlock_t		list_lock;
 	wait_queue_head_t	list_wait;
 	u16			csum_size;
 	struct list_head	csum_list;
 	atomic_t		cancel_req;
 	int			readonly;
-	int			pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+	int			pages_per_rd_bio;
 	u32			sectorsize;
 	u32			nodesize;
 	u32			leafsize;
+
+	int			is_dev_replace;
+	struct scrub_wr_ctx	wr_ctx;
+
 	/*
 	 * statistics
 	 */
@@ -116,13 +149,23 @@ struct scrub_dev {
 };
 
 struct scrub_fixup_nodatasum {
-	struct scrub_dev	*sdev;
+	struct scrub_ctx	*sctx;
+	struct btrfs_device	*dev;
 	u64			logical;
 	struct btrfs_root	*root;
 	struct btrfs_work	work;
 	int			mirror_num;
 };
 
+struct scrub_copy_nocow_ctx {
+	struct scrub_ctx	*sctx;
+	u64			logical;
+	u64			len;
+	int			mirror_num;
+	u64			physical_for_dev_replace;
+	struct btrfs_work	work;
+};
+
 struct scrub_warning {
 	struct btrfs_path	*path;
 	u64			extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
 };
 
 
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
-				     struct btrfs_mapping_tree *map_tree,
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
-				     struct scrub_block *sblock);
-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+				     struct scrub_block *sblocks_for_recheck);
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 			       struct scrub_block *sblock, int is_metadata,
 			       int have_csum, u8 *csum, u64 generation,
 			       u16 csum_size);
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 					 struct scrub_block *sblock,
 					 int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 					    struct scrub_block *sblock_good,
 					    int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num);
 static int scrub_checksum_data(struct scrub_block *sblock);
 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 static int scrub_checksum_super(struct scrub_block *sblock);
 static void scrub_block_get(struct scrub_block *sblock);
 static void scrub_block_put(struct scrub_block *sblock);
-static int scrub_add_page_to_bio(struct scrub_dev *sdev,
-				 struct scrub_page *spage);
-static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
-		       u64 physical, u64 flags, u64 gen, int mirror_num,
-		       u8 *csum, int force);
+static void scrub_page_get(struct scrub_page *spage);
+static void scrub_page_put(struct scrub_page *spage);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+		       u64 physical, struct btrfs_device *dev, u64 flags,
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace);
 static void scrub_bio_end_io(struct bio *bio, int err);
 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      void *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
+
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
+{
+	atomic_inc(&sctx->bios_in_flight);
+}
+
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
+{
+	atomic_dec(&sctx->bios_in_flight);
+	wake_up(&sctx->list_wait);
+}
+
+/*
+ * used for workers that require transaction commits (i.e., for the
+ * NOCOW case)
+ */
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
+{
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	/*
+	 * increment scrubs_running to prevent cancel requests from
+	 * completing as long as a worker is running. we must also
+	 * increment scrubs_paused to prevent deadlocking on pause
+	 * requests used for transactions commits (as the worker uses a
+	 * transaction context). it is safe to regard the worker
+	 * as paused for all matters practical. effectively, we only
+	 * avoid cancellation requests from completing.
+	 */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_inc(&fs_info->scrubs_running);
+	atomic_inc(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_inc(&sctx->workers_pending);
+}
 
+/* used for workers that require transaction commits */
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
+{
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 
-static void scrub_free_csums(struct scrub_dev *sdev)
+	/*
+	 * see scrub_pending_trans_workers_inc() why we're pretending
+	 * to be paused in the scrub counters
+	 */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_dec(&fs_info->scrubs_running);
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_dec(&sctx->workers_pending);
+	wake_up(&fs_info->scrub_pause_wait);
+	wake_up(&sctx->list_wait);
+}
+
+static void scrub_free_csums(struct scrub_ctx *sctx)
 {
-	while (!list_empty(&sdev->csum_list)) {
+	while (!list_empty(&sctx->csum_list)) {
 		struct btrfs_ordered_sum *sum;
-		sum = list_first_entry(&sdev->csum_list,
+		sum = list_first_entry(&sctx->csum_list,
 				       struct btrfs_ordered_sum, list);
 		list_del(&sum->list);
 		kfree(sum);
 	}
 }
 
-static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 {
 	int i;
 
-	if (!sdev)
+	if (!sctx)
 		return;
 
+	scrub_free_wr_ctx(&sctx->wr_ctx);
+
 	/* this can happen when scrub is cancelled */
-	if (sdev->curr != -1) {
-		struct scrub_bio *sbio = sdev->bios[sdev->curr];
+	if (sctx->curr != -1) {
+		struct scrub_bio *sbio = sctx->bios[sctx->curr];
 
 		for (i = 0; i < sbio->page_count; i++) {
-			BUG_ON(!sbio->pagev[i]);
-			BUG_ON(!sbio->pagev[i]->page);
+			WARN_ON(!sbio->pagev[i]->page);
 			scrub_block_put(sbio->pagev[i]->sblock);
 		}
 		bio_put(sbio->bio);
 	}
 
-	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
-		struct scrub_bio *sbio = sdev->bios[i];
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+		struct scrub_bio *sbio = sctx->bios[i];
 
 		if (!sbio)
 			break;
 		kfree(sbio);
 	}
 
-	scrub_free_csums(sdev);
-	kfree(sdev);
+	scrub_free_csums(sctx);
+	kfree(sctx);
 }
 
 static noinline_for_stack
-struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
+struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 {
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
 	int i;
 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
-	int pages_per_bio;
+	int pages_per_rd_bio;
+	int ret;
 
-	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
-			      bio_get_nr_vecs(dev->bdev));
-	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
-	if (!sdev)
+	/*
+	 * the setting of pages_per_rd_bio is correct for scrub but might
+	 * be wrong for the dev_replace code where we might read from
+	 * different devices in the initial huge bios. However, that
+	 * code is able to correctly handle the case when adding a page
+	 * to a bio fails.
+	 */
+	if (dev->bdev)
+		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	else
+		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
+	if (!sctx)
 		goto nomem;
-	sdev->dev = dev;
-	sdev->pages_per_bio = pages_per_bio;
-	sdev->curr = -1;
-	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+	sctx->is_dev_replace = is_dev_replace;
+	sctx->pages_per_rd_bio = pages_per_rd_bio;
+	sctx->curr = -1;
+	sctx->dev_root = dev->dev_root;
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio;
 
 		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 		if (!sbio)
 			goto nomem;
-		sdev->bios[i] = sbio;
+		sctx->bios[i] = sbio;
 
 		sbio->index = i;
-		sbio->sdev = sdev;
+		sbio->sctx = sctx;
 		sbio->page_count = 0;
 		sbio->work.func = scrub_bio_end_io_worker;
 
-		if (i != SCRUB_BIOS_PER_DEV-1)
-			sdev->bios[i]->next_free = i + 1;
+		if (i != SCRUB_BIOS_PER_SCTX - 1)
+			sctx->bios[i]->next_free = i + 1;
 		else
-			sdev->bios[i]->next_free = -1;
+			sctx->bios[i]->next_free = -1;
 	}
-	sdev->first_free = 0;
-	sdev->nodesize = dev->dev_root->nodesize;
-	sdev->leafsize = dev->dev_root->leafsize;
-	sdev->sectorsize = dev->dev_root->sectorsize;
-	atomic_set(&sdev->in_flight, 0);
-	atomic_set(&sdev->fixup_cnt, 0);
-	atomic_set(&sdev->cancel_req, 0);
-	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
-	INIT_LIST_HEAD(&sdev->csum_list);
+	sctx->first_free = 0;
+	sctx->nodesize = dev->dev_root->nodesize;
+	sctx->leafsize = dev->dev_root->leafsize;
+	sctx->sectorsize = dev->dev_root->sectorsize;
+	atomic_set(&sctx->bios_in_flight, 0);
+	atomic_set(&sctx->workers_pending, 0);
+	atomic_set(&sctx->cancel_req, 0);
+	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+	INIT_LIST_HEAD(&sctx->csum_list);
 
-	spin_lock_init(&sdev->list_lock);
-	spin_lock_init(&sdev->stat_lock);
-	init_waitqueue_head(&sdev->list_wait);
-	return sdev;
+	spin_lock_init(&sctx->list_lock);
+	spin_lock_init(&sctx->stat_lock);
+	init_waitqueue_head(&sctx->list_wait);
+
+	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+				 fs_info->dev_replace.tgtdev, is_dev_replace);
+	if (ret) {
+		scrub_free_ctx(sctx);
+		return ERR_PTR(ret);
+	}
+	return sctx;
 
 nomem:
-	scrub_free_dev(sdev);
+	scrub_free_ctx(sctx);
 	return ERR_PTR(-ENOMEM);
 }
 
-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+				     void *warn_ctx)
 {
 	u64 isize;
 	u32 nlink;
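
The bios_in_flight and workers_pending counters introduced above pair every increment with a decrement-plus-wake_up on list_wait, so a waiter only needs wait_event(). A minimal quiescence helper in that style (sketch; the real scrub code open-codes these waits at the end of a scrub pass):

	static void scrub_wait_idle(struct scrub_ctx *sctx)
	{
		/* wait until no read/write bios are in flight ... */
		wait_event(sctx->list_wait,
			   atomic_read(&sctx->bios_in_flight) == 0);
		/* ... and no transaction-committing workers remain */
		wait_event(sctx->list_wait,
			   atomic_read(&sctx->workers_pending) == 0);
	}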
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	int i;
 	struct extent_buffer *eb;
 	struct btrfs_inode_item *inode_item;
-	struct scrub_warning *swarn = ctx;
+	struct scrub_warning *swarn = warn_ctx;
 	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 	struct inode_fs_paths *ipath = NULL;
 	struct btrfs_root *local_root;
@@ -345,8 +496,8 @@ err:
 
 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 {
-	struct btrfs_device *dev = sblock->sdev->dev;
-	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+	struct btrfs_device *dev;
+	struct btrfs_fs_info *fs_info;
 	struct btrfs_path *path;
 	struct btrfs_key found_key;
 	struct extent_buffer *eb;
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	const int bufsize = 4096;
 	int ret;
 
+	WARN_ON(sblock->page_count < 1);
+	dev = sblock->pagev[0]->dev;
+	fs_info = sblock->sctx->dev_root->fs_info;
+
 	path = btrfs_alloc_path();
 
 	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
 	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
-	BUG_ON(sblock->page_count < 1);
-	swarn.sector = (sblock->pagev[0].physical) >> 9;
-	swarn.logical = sblock->pagev[0].logical;
+	swarn.sector = (sblock->pagev[0]->physical) >> 9;
+	swarn.logical = sblock->pagev[0]->logical;
 	swarn.errstr = errstr;
-	swarn.dev = dev;
+	swarn.dev = NULL;
 	swarn.msg_bufsize = bufsize;
 	swarn.scratch_bufsize = bufsize;
 
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 		} while (ret != 1);
 	} else {
 		swarn.path = path;
+		swarn.dev = dev;
 		iterate_extent_inodes(fs_info, found_key.objectid,
 					extent_item_pos, 1,
 					scrub_print_warning_inode, &swarn);
@@ -416,11 +571,11 @@ out:
 	kfree(swarn.msg_buf);
 }
 
-static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 {
 	struct page *page = NULL;
 	unsigned long index;
-	struct scrub_fixup_nodatasum *fixup = ctx;
+	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 	int ret;
 	int corrected = 0;
 	struct btrfs_key key;
@@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 	}
 
 	if (PageUptodate(page)) {
-		struct btrfs_mapping_tree *map_tree;
+		struct btrfs_fs_info *fs_info;
 		if (PageDirty(page)) {
 			/*
 			 * we need to write the data to the defect sector. the
@@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 			ret = -EIO;
 			goto out;
 		}
-		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
-		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+		fs_info = BTRFS_I(inode)->root->fs_info;
+		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
 					fixup->logical, page,
 					fixup->mirror_num);
 		unlock_page(page);
@@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 {
 	int ret;
 	struct scrub_fixup_nodatasum *fixup;
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_path *path;
 	int uncorrectable = 0;
 
 	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
-	sdev = fixup->sdev;
+	sctx = fixup->sctx;
 	fs_info = fixup->root->fs_info;
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.malloc_errors;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.malloc_errors;
+		spin_unlock(&sctx->stat_lock);
 		uncorrectable = 1;
 		goto out;
 	}
@@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 	}
 	WARN_ON(ret != 1);
 
-	spin_lock(&sdev->stat_lock);
-	++sdev->stat.corrected_errors;
-	spin_unlock(&sdev->stat_lock);
+	spin_lock(&sctx->stat_lock);
+	++sctx->stat.corrected_errors;
+	spin_unlock(&sctx->stat_lock);
 
 out:
 	if (trans && !IS_ERR(trans))
 		btrfs_end_transaction(trans, fixup->root);
 	if (uncorrectable) {
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.uncorrectable_errors;
-		spin_unlock(&sdev->stat_lock);
-
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.uncorrectable_errors;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_replace_stats_inc(
+			&sctx->dev_root->fs_info->dev_replace.
+			num_uncorrectable_read_errors);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 			(unsigned long long)fixup->logical,
-			rcu_str_deref(sdev->dev->name));
+			rcu_str_deref(fixup->dev->name));
 	}
 
 	btrfs_free_path(path);
 	kfree(fixup);
 
-	/* see caller why we're pretending to be paused in the scrub counters */
-	mutex_lock(&fs_info->scrub_lock);
-	atomic_dec(&fs_info->scrubs_running);
-	atomic_dec(&fs_info->scrubs_paused);
-	mutex_unlock(&fs_info->scrub_lock);
-	atomic_dec(&sdev->fixup_cnt);
-	wake_up(&fs_info->scrub_pause_wait);
-	wake_up(&sdev->list_wait);
+	scrub_pending_trans_workers_dec(sctx);
 }
 
 /*
@@ -614,7 +764,8 @@ out:
  */
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 {
-	struct scrub_dev *sdev = sblock_to_check->sdev;
+	struct scrub_ctx *sctx = sblock_to_check->sctx;
+	struct btrfs_device *dev;
 	struct btrfs_fs_info *fs_info;
 	u64 length;
 	u64 logical;
@@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 					  DEFAULT_RATELIMIT_BURST);
 
 	BUG_ON(sblock_to_check->page_count < 1);
-	fs_info = sdev->dev->dev_root->fs_info;
+	fs_info = sctx->dev_root->fs_info;
+	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+		/*
+		 * if we find an error in a super block, we just report it.
+		 * They will get written with the next transaction commit
+		 * anyway
+		 */
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.super_errors;
+		spin_unlock(&sctx->stat_lock);
+		return 0;
+	}
 	length = sblock_to_check->page_count * PAGE_SIZE;
-	logical = sblock_to_check->pagev[0].logical;
-	generation = sblock_to_check->pagev[0].generation;
-	BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
-	failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
-	is_metadata = !(sblock_to_check->pagev[0].flags &
+	logical = sblock_to_check->pagev[0]->logical;
+	generation = sblock_to_check->pagev[0]->generation;
+	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
+	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
+	is_metadata = !(sblock_to_check->pagev[0]->flags &
 			BTRFS_EXTENT_FLAG_DATA);
-	have_csum = sblock_to_check->pagev[0].have_csum;
-	csum = sblock_to_check->pagev[0].csum;
+	have_csum = sblock_to_check->pagev[0]->have_csum;
+	csum = sblock_to_check->pagev[0]->csum;
+	dev = sblock_to_check->pagev[0]->dev;
+
+	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+		sblocks_for_recheck = NULL;
+		goto nodatasum_case;
+	}
 
 	/*
 	 * read all mirrors one after the other. This includes to
@@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 				     sizeof(*sblocks_for_recheck),
 				     GFP_NOFS);
 	if (!sblocks_for_recheck) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.malloc_errors++;
-		sdev->stat.read_errors++;
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		sctx->stat.read_errors++;
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
 
 	/* setup the context, map the logical blocks and alloc the pages */
-	ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
+	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
 					logical, sblocks_for_recheck);
 	if (ret) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.read_errors++;
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
 	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 	sblock_bad = sblocks_for_recheck + failed_mirror_index;
 
 	/* build and submit the bios for the failed mirror, check checksums */
-	ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
-				  csum, generation, sdev->csum_size);
-	if (ret) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.read_errors++;
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
-		goto out;
-	}
+	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+			    csum, generation, sctx->csum_size);
 
 	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 	    sblock_bad->no_io_error_seen) {
@@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		 * different bio (usually one of the two latter cases is
 		 * the cause)
 		 */
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.unverified_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.unverified_errors++;
+		spin_unlock(&sctx->stat_lock);
 
+		if (sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock_bad);
 		goto out;
 	}
 
 	if (!sblock_bad->no_io_error_seen) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.read_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("i/o error", sblock_to_check);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 	} else if (sblock_bad->checksum_error) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.csum_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.csum_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum error", sblock_to_check);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
+		btrfs_dev_stat_inc_and_print(dev,
 					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	} else if (sblock_bad->header_error) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.verify_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.verify_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum/header error",
 					    sblock_to_check);
 		if (sblock_bad->generation_error)
-			btrfs_dev_stat_inc_and_print(sdev->dev,
+			btrfs_dev_stat_inc_and_print(dev,
 					BTRFS_DEV_STAT_GENERATION_ERRS);
 		else
-			btrfs_dev_stat_inc_and_print(sdev->dev,
+			btrfs_dev_stat_inc_and_print(dev,
 					BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 
-	if (sdev->readonly)
+	if (sctx->readonly && !sctx->is_dev_replace)
 		goto did_not_correct_error;
 
 	if (!is_metadata && !have_csum) {
 		struct scrub_fixup_nodatasum *fixup_nodatasum;
 
+nodatasum_case:
+		WARN_ON(sctx->is_dev_replace);
+
 		/*
 		 * !is_metadata and !have_csum, this means that the data
 		 * might not be COW'ed, that it might be modified
@@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
 		if (!fixup_nodatasum)
 			goto did_not_correct_error;
-		fixup_nodatasum->sdev = sdev;
+		fixup_nodatasum->sctx = sctx;
+		fixup_nodatasum->dev = dev;
 		fixup_nodatasum->logical = logical;
 		fixup_nodatasum->root = fs_info->extent_root;
 		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
-		/*
-		 * increment scrubs_running to prevent cancel requests from
-		 * completing as long as a fixup worker is running. we must also
-		 * increment scrubs_paused to prevent deadlocking on pause
-		 * requests used for transactions commits (as the worker uses a
-		 * transaction context). it is safe to regard the fixup worker
-		 * as paused for all matters practical. effectively, we only
-		 * avoid cancellation requests from completing.
-		 */
-		mutex_lock(&fs_info->scrub_lock);
-		atomic_inc(&fs_info->scrubs_running);
-		atomic_inc(&fs_info->scrubs_paused);
-		mutex_unlock(&fs_info->scrub_lock);
-		atomic_inc(&sdev->fixup_cnt);
+		scrub_pending_trans_workers_inc(sctx);
 		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
 		btrfs_queue_worker(&fs_info->scrub_workers,
 				   &fixup_nodatasum->work);
@@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 	/*
 	 * now build and submit the bios for the other mirrors, check
-	 * checksums
-	 */
-	for (mirror_index = 0;
-	     mirror_index < BTRFS_MAX_MIRRORS &&
-	     sblocks_for_recheck[mirror_index].page_count > 0;
-	     mirror_index++) {
-		if (mirror_index == failed_mirror_index)
-			continue;
-
-		/* build and submit the bios, check checksums */
-		ret = scrub_recheck_block(fs_info,
-					  sblocks_for_recheck + mirror_index,
-					  is_metadata, have_csum, csum,
-					  generation, sdev->csum_size);
-		if (ret)
-			goto did_not_correct_error;
-	}
-
-	/*
-	 * first try to pick the mirror which is completely without I/O
+	 * checksums.
+	 * First try to pick the mirror which is completely without I/O
 	 * errors and also does not have a checksum error.
 	 * If one is found, and if a checksum is present, the full block
 	 * that is known to contain an error is rewritten. Afterwards
@@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	     mirror_index < BTRFS_MAX_MIRRORS &&
 	     sblocks_for_recheck[mirror_index].page_count > 0;
 	     mirror_index++) {
-		struct scrub_block *sblock_other = sblocks_for_recheck +
-						   mirror_index;
+		struct scrub_block *sblock_other;
+
+		if (mirror_index == failed_mirror_index)
+			continue;
+		sblock_other = sblocks_for_recheck + mirror_index;
+
+		/* build and submit the bios, check checksums */
+		scrub_recheck_block(fs_info, sblock_other, is_metadata,
+				    have_csum, csum, generation,
+				    sctx->csum_size);
 
 		if (!sblock_other->header_error &&
 		    !sblock_other->checksum_error &&
 		    sblock_other->no_io_error_seen) {
-			int force_write = is_metadata || have_csum;
-
-			ret = scrub_repair_block_from_good_copy(sblock_bad,
-								sblock_other,
-								force_write);
+			if (sctx->is_dev_replace) {
+				scrub_write_block_to_dev_replace(sblock_other);
+			} else {
+				int force_write = is_metadata || have_csum;
+
+				ret = scrub_repair_block_from_good_copy(
+						sblock_bad, sblock_other,
+						force_write);
+			}
 			if (0 == ret)
 				goto corrected_error;
 		}
 	}
 
 	/*
-	 * in case of I/O errors in the area that is supposed to be
+	 * for dev_replace, pick good pages and write to the target device.
+	 */
+	if (sctx->is_dev_replace) {
+		success = 1;
+		for (page_num = 0; page_num < sblock_bad->page_count;
+		     page_num++) {
+			int sub_success;
+
+			sub_success = 0;
+			for (mirror_index = 0;
+			     mirror_index < BTRFS_MAX_MIRRORS &&
+			     sblocks_for_recheck[mirror_index].page_count > 0;
+			     mirror_index++) {
+				struct scrub_block *sblock_other =
+					sblocks_for_recheck + mirror_index;
+				struct scrub_page *page_other =
+					sblock_other->pagev[page_num];
+
+				if (!page_other->io_error) {
+					ret = scrub_write_page_to_dev_replace(
+							sblock_other, page_num);
+					if (ret == 0) {
+						/* succeeded for this page */
+						sub_success = 1;
+						break;
+					} else {
+						btrfs_dev_replace_stats_inc(
+							&sctx->dev_root->
+							fs_info->dev_replace.
+							num_write_errors);
+					}
+				}
+			}
+
+			if (!sub_success) {
+				/*
+				 * did not find a mirror to fetch the page
+				 * from. scrub_write_page_to_dev_replace()
+				 * handles this case (page->io_error), by
+				 * filling the block with zeros before
+				 * submitting the write request
+				 */
+				success = 0;
+				ret = scrub_write_page_to_dev_replace(
+						sblock_bad, page_num);
+				if (ret)
+					btrfs_dev_replace_stats_inc(
+						&sctx->dev_root->fs_info->
+						dev_replace.num_write_errors);
+			}
+		}
+
+		goto out;
+	}
+
+	/*
+	 * for regular scrub, repair those pages that are errored.
+	 * In case of I/O errors in the area that is supposed to be
 	 * repaired, continue by picking good copies of those pages.
 	 * Select the good pages from mirrors to rewrite bad pages from
 	 * the area to fix. Afterwards verify the checksum of the block
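
The dev-replace branch above tries each mirror in turn for every page and falls back to writing the (zero-filled) bad page when no mirror has a clean copy. Condensed C restatement (sketch only; "nr_mirrors" stands in for the page_count > 0 termination test used in the patch):

	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		int copied = 0;
		int m;

		for (m = 0; !copied && m < nr_mirrors; m++) {
			struct scrub_block *other = sblocks_for_recheck + m;

			if (!other->pagev[page_num]->io_error &&
			    scrub_write_page_to_dev_replace(other, page_num) == 0)
				copied = 1;	/* this page is done */
		}
		if (!copied)	/* no clean mirror: zeros get written */
			scrub_write_page_to_dev_replace(sblock_bad, page_num);
	}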
@@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 	success = 1;
 	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
-		struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
 
 		if (!page_bad->io_error)
 			continue;
@@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		     mirror_index++) {
 			struct scrub_block *sblock_other = sblocks_for_recheck +
 							   mirror_index;
-			struct scrub_page *page_other = sblock_other->pagev +
-							page_num;
+			struct scrub_page *page_other = sblock_other->pagev[
+							page_num];
 
 			if (!page_other->io_error) {
 				ret = scrub_repair_page_from_good_copy(
@@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 			 * is verified, but most likely the data comes out
 			 * of the page cache.
 			 */
-			ret = scrub_recheck_block(fs_info, sblock_bad,
+			scrub_recheck_block(fs_info, sblock_bad,
 					    is_metadata, have_csum, csum,
-					    generation, sdev->csum_size);
-			if (!ret && !sblock_bad->header_error &&
+					    generation, sctx->csum_size);
+			if (!sblock_bad->header_error &&
 			    !sblock_bad->checksum_error &&
 			    sblock_bad->no_io_error_seen)
 				goto corrected_error;
@@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 				goto did_not_correct_error;
 		} else {
 corrected_error:
 			spin_lock(&sctx->stat_lock);
 			sctx->stat.corrected_errors++;
 			spin_unlock(&sctx->stat_lock);
-			spin_lock(&sdev->stat_lock);
-			sdev->stat.corrected_errors++;
-			spin_unlock(&sdev->stat_lock);
+			spin_lock(&sctx->stat_lock);
+			sctx->stat.corrected_errors++;
+			spin_unlock(&sctx->stat_lock);
 			printk_ratelimited_in_rcu(KERN_ERR
 				"btrfs: fixed up error at logical %llu on dev %s\n",
 				(unsigned long long)logical,
-				rcu_str_deref(sdev->dev->name));
+				rcu_str_deref(dev->name));
 		}
 	} else {
 did_not_correct_error:
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
 			(unsigned long long)logical,
-			rcu_str_deref(sdev->dev->name));
+			rcu_str_deref(dev->name));
 	}
 
 out:
@@ -966,11 +1166,11 @@ out:
 						  mirror_index;
 			int page_index;
 
-			for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
-			     page_index++)
-				if (sblock->pagev[page_index].page)
-					__free_page(
-						sblock->pagev[page_index].page);
+			for (page_index = 0; page_index < sblock->page_count;
+			     page_index++) {
+				sblock->pagev[page_index]->sblock = NULL;
+				scrub_page_put(sblock->pagev[page_index]);
+			}
 		}
 		kfree(sblocks_for_recheck);
 	}
@@ -978,8 +1178,9 @@ out:
 	return 0;
 }
 
-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
-				     struct btrfs_mapping_tree *map_tree,
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
 				     struct scrub_block *sblocks_for_recheck)
 {
@@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 	int ret;
 
 	/*
-	 * note: the three members sdev, ref_count and outstanding_pages
+	 * note: the two members ref_count and outstanding_pages
 	 * are not used (and not set) in the blocks that are used for
 	 * the recheck procedure
 	 */
@@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 		 * with a length of PAGE_SIZE, each returned stripe
 		 * represents one mirror
 		 */
-		ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
-				      &bbio, 0);
+		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
+				      &mapped_length, &bbio, 0);
 		if (ret || !bbio || mapped_length < sublen) {
 			kfree(bbio);
 			return -EIO;
 		}
 
-		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
 		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
 		     mirror_index++) {
 			struct scrub_block *sblock;
@@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 				continue;
 
 			sblock = sblocks_for_recheck + mirror_index;
-			page = sblock->pagev + page_index;
+			sblock->sctx = sctx;
+			page = kzalloc(sizeof(*page), GFP_NOFS);
+			if (!page) {
+leave_nomem:
+				spin_lock(&sctx->stat_lock);
+				sctx->stat.malloc_errors++;
+				spin_unlock(&sctx->stat_lock);
+				kfree(bbio);
+				return -ENOMEM;
+			}
+			scrub_page_get(page);
+			sblock->pagev[page_index] = page;
 			page->logical = logical;
 			page->physical = bbio->stripes[mirror_index].physical;
+			BUG_ON(page_index >= original_sblock->page_count);
+			page->physical_for_dev_replace =
+				original_sblock->pagev[page_index]->
+				physical_for_dev_replace;
 			/* for missing devices, dev->bdev is NULL */
 			page->dev = bbio->stripes[mirror_index].dev;
 			page->mirror_num = mirror_index + 1;
-			page->page = alloc_page(GFP_NOFS);
-			if (!page->page) {
-				spin_lock(&sdev->stat_lock);
-				sdev->stat.malloc_errors++;
-				spin_unlock(&sdev->stat_lock);
-				kfree(bbio);
-				return -ENOMEM;
-			}
 			sblock->page_count++;
+			page->page = alloc_page(GFP_NOFS);
+			if (!page->page)
+				goto leave_nomem;
 		}
 		kfree(bbio);
 		length -= sublen;
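
scrub_page objects are now allocated individually and reference counted (scrub_page_get() above, scrub_page_put() on teardown). A get/put pair consistent with the usage in this hunk would look like the following (sketch; the patch defines these helpers elsewhere in the file):

	static void scrub_page_get(struct scrub_page *spage)
	{
		atomic_inc(&spage->ref_count);
	}

	static void scrub_page_put(struct scrub_page *spage)
	{
		if (atomic_dec_and_test(&spage->ref_count)) {
			if (spage->page)
				__free_page(spage->page);
			kfree(spage);
		}
	}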
@@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
  * to take those pages that are not errored from all the mirrors so that
  * the pages that are errored in the just handled mirror can be repaired.
  */
-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 			       struct scrub_block *sblock, int is_metadata,
 			       int have_csum, u8 *csum, u64 generation,
 			       u16 csum_size)
 {
 	int page_num;
 
@@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
-		int ret;
-		struct scrub_page *page = sblock->pagev + page_num;
+		struct scrub_page *page = sblock->pagev[page_num];
 		DECLARE_COMPLETION_ONSTACK(complete);
 
 		if (page->dev->bdev == NULL) {
@@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 			continue;
 		}
 
-		BUG_ON(!page->page);
+		WARN_ON(!page->page);
 		bio = bio_alloc(GFP_NOFS, 1);
-		if (!bio)
-			return -EIO;
+		if (!bio) {
+			page->io_error = 1;
+			sblock->no_io_error_seen = 0;
+			continue;
+		}
 		bio->bi_bdev = page->dev->bdev;
 		bio->bi_sector = page->physical >> 9;
 		bio->bi_end_io = scrub_complete_bio_end_io;
 		bio->bi_private = &complete;
 
-		ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
-		if (PAGE_SIZE != ret) {
-			bio_put(bio);
-			return -EIO;
-		}
+		bio_add_page(bio, page->page, PAGE_SIZE, 0);
 		btrfsic_submit_bio(READ, bio);
 
 		/* this will also unplug the queue */
@@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 					     have_csum, csum, generation,
 					     csum_size);
 
-	return 0;
+	return;
 }
 
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 	struct btrfs_root *root = fs_info->extent_root;
 	void *mapped_buffer;
 
-	BUG_ON(!sblock->pagev[0].page);
+	WARN_ON(!sblock->pagev[0]->page);
 	if (is_metadata) {
 		struct btrfs_header *h;
 
-		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
 		h = (struct btrfs_header *)mapped_buffer;
 
-		if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
+		if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
 		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
 		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 			   BTRFS_UUID_SIZE)) {
@@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 		if (!have_csum)
 			return;
 
-		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
 	}
 
 	for (page_num = 0;;) {
@@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 		page_num++;
 		if (page_num >= sblock->page_count)
 			break;
-		BUG_ON(!sblock->pagev[page_num].page);
+		WARN_ON(!sblock->pagev[page_num]->page);
 
-		mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
+		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
 	}
 
 	btrfs_csum_final(crc, calculated_csum);
@@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 					    struct scrub_block *sblock_good,
 					    int page_num, int force_write)
 {
-	struct scrub_page *page_bad = sblock_bad->pagev + page_num;
-	struct scrub_page *page_good = sblock_good->pagev + page_num;
+	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+	struct scrub_page *page_good = sblock_good->pagev[page_num];
 
-	BUG_ON(sblock_bad->pagev[page_num].page == NULL);
-	BUG_ON(sblock_good->pagev[page_num].page == NULL);
+	BUG_ON(page_bad->page == NULL);
+	BUG_ON(page_good->page == NULL);
 	if (force_write || sblock_bad->header_error ||
 	    sblock_bad->checksum_error || page_bad->io_error) {
 		struct bio *bio;
 		int ret;
 		DECLARE_COMPLETION_ONSTACK(complete);
 
+		if (!page_bad->dev->bdev) {
+			printk_ratelimited(KERN_WARNING
+				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
+			return -EIO;
+		}
+
 		bio = bio_alloc(GFP_NOFS, 1);
 		if (!bio)
 			return -EIO;
@@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		if (!bio_flagged(bio, BIO_UPTODATE)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
+			btrfs_dev_replace_stats_inc(
+				&sblock_bad->sctx->dev_root->fs_info->
+				dev_replace.num_write_errors);
 			bio_put(bio);
 			return -EIO;
 		}
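
The repair path above issues its rewrite synchronously: the bio's end_io callback completes an on-stack completion which the caller waits on. The idiom in isolation (sketch; scrub_complete_bio_end_io is the file's one-line callback that calls complete() on bio->bi_private):

	/* Sketch of the synchronous-write idiom used above. */
	static int submit_and_wait(struct bio *bio)
	{
		DECLARE_COMPLETION_ONSTACK(complete);

		bio->bi_private = &complete;
		bio->bi_end_io = scrub_complete_bio_end_io;
		btrfsic_submit_bio(WRITE, bio);

		/* this will also unplug the queue */
		wait_for_completion(&complete);
		return !bio_flagged(bio, BIO_UPTODATE) ? -EIO : 0;
	}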
@@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1237 return 0; 1455 return 0;
1238} 1456}
1239 1457
1240static void scrub_checksum(struct scrub_block *sblock) 1458static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1459{
1460 int page_num;
1461
1462 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1463 int ret;
1464
1465 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1466 if (ret)
1467 btrfs_dev_replace_stats_inc(
1468 &sblock->sctx->dev_root->fs_info->dev_replace.
1469 num_write_errors);
1470 }
1471}
1472
1473static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1474 int page_num)
1475{
1476 struct scrub_page *spage = sblock->pagev[page_num];
1477
1478 BUG_ON(spage->page == NULL);
1479 if (spage->io_error) {
1480 void *mapped_buffer = kmap_atomic(spage->page);
1481
1482 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1483 flush_dcache_page(spage->page);
1484 kunmap_atomic(mapped_buffer);
1485 }
1486 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1487}
1488
1489static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1490 struct scrub_page *spage)
1491{
1492 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1493 struct scrub_bio *sbio;
1494 int ret;
1495
1496 mutex_lock(&wr_ctx->wr_lock);
1497again:
1498 if (!wr_ctx->wr_curr_bio) {
1499 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1500 GFP_NOFS);
1501 if (!wr_ctx->wr_curr_bio) {
1502 mutex_unlock(&wr_ctx->wr_lock);
1503 return -ENOMEM;
1504 }
1505 wr_ctx->wr_curr_bio->sctx = sctx;
1506 wr_ctx->wr_curr_bio->page_count = 0;
1507 }
1508 sbio = wr_ctx->wr_curr_bio;
1509 if (sbio->page_count == 0) {
1510 struct bio *bio;
1511
1512 sbio->physical = spage->physical_for_dev_replace;
1513 sbio->logical = spage->logical;
1514 sbio->dev = wr_ctx->tgtdev;
1515 bio = sbio->bio;
1516 if (!bio) {
1517 bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1518 if (!bio) {
1519 mutex_unlock(&wr_ctx->wr_lock);
1520 return -ENOMEM;
1521 }
1522 sbio->bio = bio;
1523 }
1524
1525 bio->bi_private = sbio;
1526 bio->bi_end_io = scrub_wr_bio_end_io;
1527 bio->bi_bdev = sbio->dev->bdev;
1528 bio->bi_sector = sbio->physical >> 9;
1529 sbio->err = 0;
1530 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1531 spage->physical_for_dev_replace ||
1532 sbio->logical + sbio->page_count * PAGE_SIZE !=
1533 spage->logical) {
1534 scrub_wr_submit(sctx);
1535 goto again;
1536 }
1537
1538 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1539 if (ret != PAGE_SIZE) {
1540 if (sbio->page_count < 1) {
1541 bio_put(sbio->bio);
1542 sbio->bio = NULL;
1543 mutex_unlock(&wr_ctx->wr_lock);
1544 return -EIO;
1545 }
1546 scrub_wr_submit(sctx);
1547 goto again;
1548 }
1549
1550 sbio->pagev[sbio->page_count] = spage;
1551 scrub_page_get(spage);
1552 sbio->page_count++;
1553 if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1554 scrub_wr_submit(sctx);
1555 mutex_unlock(&wr_ctx->wr_lock);
1556
1557 return 0;
1558}
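
scrub_add_page_to_wr_bio() above coalesces pages into the current write bio only while they extend both the physical and the logical run; any discontinuity forces a submit and a retry via the "again" label. A self-contained sketch of that contiguity rule, assuming simplified types (sk_bio and sk_page_fits are illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>

#define SK_PAGE_SIZE 4096u

struct sk_bio {
	uint64_t physical;	/* start of the run on the target device */
	uint64_t logical;	/* start of the run in the logical space */
	unsigned page_count;	/* pages already in the bio */
};

/* a page may join only if it continues both runs exactly */
static int sk_page_fits(const struct sk_bio *bio,
			uint64_t physical, uint64_t logical)
{
	return bio->physical + bio->page_count * SK_PAGE_SIZE == physical &&
	       bio->logical  + bio->page_count * SK_PAGE_SIZE == logical;
}

int main(void)
{
	struct sk_bio bio = { .physical = 1 << 20, .logical = 1 << 30,
			      .page_count = 2 };

	/* next page in both runs: coalesce */
	printf("%d\n", sk_page_fits(&bio, (1 << 20) + 2 * SK_PAGE_SIZE,
				    ((uint64_t)1 << 30) + 2 * SK_PAGE_SIZE));
	/* physical hole: submit the current bio, then retry ("goto again") */
	printf("%d\n", sk_page_fits(&bio, (1 << 20) + 3 * SK_PAGE_SIZE,
				    ((uint64_t)1 << 30) + 2 * SK_PAGE_SIZE));
	return 0;
}

Checking both runs matters because a logically contiguous extent can be physically fragmented (and vice versa); only pages contiguous in both spaces can share one bio.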
1559
1560static void scrub_wr_submit(struct scrub_ctx *sctx)
1561{
1562 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1563 struct scrub_bio *sbio;
1564
1565 if (!wr_ctx->wr_curr_bio)
1566 return;
1567
1568 sbio = wr_ctx->wr_curr_bio;
1569 wr_ctx->wr_curr_bio = NULL;
1570 WARN_ON(!sbio->bio->bi_bdev);
1571 scrub_pending_bio_inc(sctx);
1572 /* Process all writes in a single worker thread, so that the
1573 * block layer can order the requests before sending them to the
1574 * driver; this doubled the write performance on spinning disks
1575 * when measured with Linux 3.5. */
1576 btrfsic_submit_bio(WRITE, sbio->bio);
1577}
1578
1579static void scrub_wr_bio_end_io(struct bio *bio, int err)
1580{
1581 struct scrub_bio *sbio = bio->bi_private;
1582 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1583
1584 sbio->err = err;
1585 sbio->bio = bio;
1586
1587 sbio->work.func = scrub_wr_bio_end_io_worker;
1588 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1589}
1590
1591static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1592{
1593 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1594 struct scrub_ctx *sctx = sbio->sctx;
1595 int i;
1596
1597 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1598 if (sbio->err) {
1599 struct btrfs_dev_replace *dev_replace =
1600 &sbio->sctx->dev_root->fs_info->dev_replace;
1601
1602 for (i = 0; i < sbio->page_count; i++) {
1603 struct scrub_page *spage = sbio->pagev[i];
1604
1605 spage->io_error = 1;
1606 btrfs_dev_replace_stats_inc(&dev_replace->
1607 num_write_errors);
1608 }
1609 }
1610
1611 for (i = 0; i < sbio->page_count; i++)
1612 scrub_page_put(sbio->pagev[i]);
1613
1614 bio_put(sbio->bio);
1615 kfree(sbio);
1616 scrub_pending_bio_dec(sctx);
1617}
1618
1619static int scrub_checksum(struct scrub_block *sblock)
1241{ 1620{
1242 u64 flags; 1621 u64 flags;
1243 int ret; 1622 int ret;
1244 1623
1245 BUG_ON(sblock->page_count < 1); 1624 WARN_ON(sblock->page_count < 1);
1246 flags = sblock->pagev[0].flags; 1625 flags = sblock->pagev[0]->flags;
1247 ret = 0; 1626 ret = 0;
1248 if (flags & BTRFS_EXTENT_FLAG_DATA) 1627 if (flags & BTRFS_EXTENT_FLAG_DATA)
1249 ret = scrub_checksum_data(sblock); 1628 ret = scrub_checksum_data(sblock);
@@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)
1255 WARN_ON(1); 1634 WARN_ON(1);
1256 if (ret) 1635 if (ret)
1257 scrub_handle_errored_block(sblock); 1636 scrub_handle_errored_block(sblock);
1637
1638 return ret;
1258} 1639}
1259 1640
1260static int scrub_checksum_data(struct scrub_block *sblock) 1641static int scrub_checksum_data(struct scrub_block *sblock)
1261{ 1642{
1262 struct scrub_dev *sdev = sblock->sdev; 1643 struct scrub_ctx *sctx = sblock->sctx;
1263 u8 csum[BTRFS_CSUM_SIZE]; 1644 u8 csum[BTRFS_CSUM_SIZE];
1264 u8 *on_disk_csum; 1645 u8 *on_disk_csum;
1265 struct page *page; 1646 struct page *page;
1266 void *buffer; 1647 void *buffer;
1267 u32 crc = ~(u32)0; 1648 u32 crc = ~(u32)0;
1268 int fail = 0; 1649 int fail = 0;
1269 struct btrfs_root *root = sdev->dev->dev_root; 1650 struct btrfs_root *root = sctx->dev_root;
1270 u64 len; 1651 u64 len;
1271 int index; 1652 int index;
1272 1653
1273 BUG_ON(sblock->page_count < 1); 1654 BUG_ON(sblock->page_count < 1);
1274 if (!sblock->pagev[0].have_csum) 1655 if (!sblock->pagev[0]->have_csum)
1275 return 0; 1656 return 0;
1276 1657
1277 on_disk_csum = sblock->pagev[0].csum; 1658 on_disk_csum = sblock->pagev[0]->csum;
1278 page = sblock->pagev[0].page; 1659 page = sblock->pagev[0]->page;
1279 buffer = kmap_atomic(page); 1660 buffer = kmap_atomic(page);
1280 1661
1281 len = sdev->sectorsize; 1662 len = sctx->sectorsize;
1282 index = 0; 1663 index = 0;
1283 for (;;) { 1664 for (;;) {
1284 u64 l = min_t(u64, len, PAGE_SIZE); 1665 u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1290 break; 1671 break;
1291 index++; 1672 index++;
1292 BUG_ON(index >= sblock->page_count); 1673 BUG_ON(index >= sblock->page_count);
1293 BUG_ON(!sblock->pagev[index].page); 1674 BUG_ON(!sblock->pagev[index]->page);
1294 page = sblock->pagev[index].page; 1675 page = sblock->pagev[index]->page;
1295 buffer = kmap_atomic(page); 1676 buffer = kmap_atomic(page);
1296 } 1677 }
1297 1678
1298 btrfs_csum_final(crc, csum); 1679 btrfs_csum_final(crc, csum);
1299 if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1680 if (memcmp(csum, on_disk_csum, sctx->csum_size))
1300 fail = 1; 1681 fail = 1;
1301 1682
1302 return fail; 1683 return fail;
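
The loop above accumulates the data checksum one mapped page at a time, because a block may span several pages that cannot all be kmapped at once. A userspace sketch of the same chunked pattern, with a toy checksum standing in for the real btrfs_csum_data()/btrfs_csum_final() helpers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SK_PAGE 4096u

/* toy stand-in for the kernel crc32c helpers */
static uint32_t sk_csum(uint32_t crc, const unsigned char *buf, size_t len)
{
	while (len--)
		crc = (crc << 1) ^ *buf++;
	return crc;
}

int main(void)
{
	static unsigned char pages[2][SK_PAGE];	/* a two-page "block" */
	uint32_t crc = ~(uint32_t)0;
	uint64_t len = sizeof(pages);
	int index = 0;

	memset(pages, 0xa5, sizeof(pages));
	while (len) {
		size_t l = len < SK_PAGE ? (size_t)len : SK_PAGE;

		/* the kernel code maps exactly one page per iteration */
		crc = sk_csum(crc, pages[index], l);
		len -= l;
		index++;
	}
	printf("crc 0x%08x\n", (unsigned)crc);
	return 0;
}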
@@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1304 1685
1305static int scrub_checksum_tree_block(struct scrub_block *sblock) 1686static int scrub_checksum_tree_block(struct scrub_block *sblock)
1306{ 1687{
1307 struct scrub_dev *sdev = sblock->sdev; 1688 struct scrub_ctx *sctx = sblock->sctx;
1308 struct btrfs_header *h; 1689 struct btrfs_header *h;
1309 struct btrfs_root *root = sdev->dev->dev_root; 1690 struct btrfs_root *root = sctx->dev_root;
1310 struct btrfs_fs_info *fs_info = root->fs_info; 1691 struct btrfs_fs_info *fs_info = root->fs_info;
1311 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1692 u8 calculated_csum[BTRFS_CSUM_SIZE];
1312 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1693 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1321 int index; 1702 int index;
1322 1703
1323 BUG_ON(sblock->page_count < 1); 1704 BUG_ON(sblock->page_count < 1);
1324 page = sblock->pagev[0].page; 1705 page = sblock->pagev[0]->page;
1325 mapped_buffer = kmap_atomic(page); 1706 mapped_buffer = kmap_atomic(page);
1326 h = (struct btrfs_header *)mapped_buffer; 1707 h = (struct btrfs_header *)mapped_buffer;
1327 memcpy(on_disk_csum, h->csum, sdev->csum_size); 1708 memcpy(on_disk_csum, h->csum, sctx->csum_size);
1328 1709
1329 /* 1710 /*
1330 * we don't use the getter functions here, as we 1711 * we don't use the getter functions here, as we
@@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1332 * b) the page is already kmapped 1713 * b) the page is already kmapped
1333 */ 1714 */
1334 1715
1335 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1716 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1336 ++fail; 1717 ++fail;
1337 1718
1338 if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1719 if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1339 ++fail; 1720 ++fail;
1340 1721
1341 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1722 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1345 BTRFS_UUID_SIZE)) 1726 BTRFS_UUID_SIZE))
1346 ++fail; 1727 ++fail;
1347 1728
1348 BUG_ON(sdev->nodesize != sdev->leafsize); 1729 WARN_ON(sctx->nodesize != sctx->leafsize);
1349 len = sdev->nodesize - BTRFS_CSUM_SIZE; 1730 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1350 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1731 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1351 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1732 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1352 index = 0; 1733 index = 0;
@@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1360 break; 1741 break;
1361 index++; 1742 index++;
1362 BUG_ON(index >= sblock->page_count); 1743 BUG_ON(index >= sblock->page_count);
1363 BUG_ON(!sblock->pagev[index].page); 1744 BUG_ON(!sblock->pagev[index]->page);
1364 page = sblock->pagev[index].page; 1745 page = sblock->pagev[index]->page;
1365 mapped_buffer = kmap_atomic(page); 1746 mapped_buffer = kmap_atomic(page);
1366 mapped_size = PAGE_SIZE; 1747 mapped_size = PAGE_SIZE;
1367 p = mapped_buffer; 1748 p = mapped_buffer;
1368 } 1749 }
1369 1750
1370 btrfs_csum_final(crc, calculated_csum); 1751 btrfs_csum_final(crc, calculated_csum);
1371 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1752 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1372 ++crc_fail; 1753 ++crc_fail;
1373 1754
1374 return fail || crc_fail; 1755 return fail || crc_fail;
@@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1377static int scrub_checksum_super(struct scrub_block *sblock) 1758static int scrub_checksum_super(struct scrub_block *sblock)
1378{ 1759{
1379 struct btrfs_super_block *s; 1760 struct btrfs_super_block *s;
1380 struct scrub_dev *sdev = sblock->sdev; 1761 struct scrub_ctx *sctx = sblock->sctx;
1381 struct btrfs_root *root = sdev->dev->dev_root; 1762 struct btrfs_root *root = sctx->dev_root;
1382 struct btrfs_fs_info *fs_info = root->fs_info; 1763 struct btrfs_fs_info *fs_info = root->fs_info;
1383 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1764 u8 calculated_csum[BTRFS_CSUM_SIZE];
1384 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1765 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1393 int index; 1774 int index;
1394 1775
1395 BUG_ON(sblock->page_count < 1); 1776 BUG_ON(sblock->page_count < 1);
1396 page = sblock->pagev[0].page; 1777 page = sblock->pagev[0]->page;
1397 mapped_buffer = kmap_atomic(page); 1778 mapped_buffer = kmap_atomic(page);
1398 s = (struct btrfs_super_block *)mapped_buffer; 1779 s = (struct btrfs_super_block *)mapped_buffer;
1399 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1780 memcpy(on_disk_csum, s->csum, sctx->csum_size);
1400 1781
1401 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1782 if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1402 ++fail_cor; 1783 ++fail_cor;
1403 1784
1404 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1785 if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1405 ++fail_gen; 1786 ++fail_gen;
1406 1787
1407 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1788 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1421 break; 1802 break;
1422 index++; 1803 index++;
1423 BUG_ON(index >= sblock->page_count); 1804 BUG_ON(index >= sblock->page_count);
1424 BUG_ON(!sblock->pagev[index].page); 1805 BUG_ON(!sblock->pagev[index]->page);
1425 page = sblock->pagev[index].page; 1806 page = sblock->pagev[index]->page;
1426 mapped_buffer = kmap_atomic(page); 1807 mapped_buffer = kmap_atomic(page);
1427 mapped_size = PAGE_SIZE; 1808 mapped_size = PAGE_SIZE;
1428 p = mapped_buffer; 1809 p = mapped_buffer;
1429 } 1810 }
1430 1811
1431 btrfs_csum_final(crc, calculated_csum); 1812 btrfs_csum_final(crc, calculated_csum);
1432 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1813 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1433 ++fail_cor; 1814 ++fail_cor;
1434 1815
1435 if (fail_cor + fail_gen) { 1816 if (fail_cor + fail_gen) {
@@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1438 * They will get written with the next transaction commit 1819 * They will get written with the next transaction commit
1439 * anyway 1820 * anyway
1440 */ 1821 */
1441 spin_lock(&sdev->stat_lock); 1822 spin_lock(&sctx->stat_lock);
1442 ++sdev->stat.super_errors; 1823 ++sctx->stat.super_errors;
1443 spin_unlock(&sdev->stat_lock); 1824 spin_unlock(&sctx->stat_lock);
1444 if (fail_cor) 1825 if (fail_cor)
1445 btrfs_dev_stat_inc_and_print(sdev->dev, 1826 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1446 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1827 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1447 else 1828 else
1448 btrfs_dev_stat_inc_and_print(sdev->dev, 1829 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1449 BTRFS_DEV_STAT_GENERATION_ERRS); 1830 BTRFS_DEV_STAT_GENERATION_ERRS);
1450 } 1831 }
1451 1832
@@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)
1463 int i; 1844 int i;
1464 1845
1465 for (i = 0; i < sblock->page_count; i++) 1846 for (i = 0; i < sblock->page_count; i++)
1466 if (sblock->pagev[i].page) 1847 scrub_page_put(sblock->pagev[i]);
1467 __free_page(sblock->pagev[i].page);
1468 kfree(sblock); 1848 kfree(sblock);
1469 } 1849 }
1470} 1850}
1471 1851
1472static void scrub_submit(struct scrub_dev *sdev) 1852static void scrub_page_get(struct scrub_page *spage)
1853{
1854 atomic_inc(&spage->ref_count);
1855}
1856
1857static void scrub_page_put(struct scrub_page *spage)
1858{
1859 if (atomic_dec_and_test(&spage->ref_count)) {
1860 if (spage->page)
1861 __free_page(spage->page);
1862 kfree(spage);
1863 }
1864}
1865
1866static void scrub_submit(struct scrub_ctx *sctx)
1473{ 1867{
1474 struct scrub_bio *sbio; 1868 struct scrub_bio *sbio;
1475 1869
1476 if (sdev->curr == -1) 1870 if (sctx->curr == -1)
1477 return; 1871 return;
1478 1872
1479 sbio = sdev->bios[sdev->curr]; 1873 sbio = sctx->bios[sctx->curr];
1480 sdev->curr = -1; 1874 sctx->curr = -1;
1481 atomic_inc(&sdev->in_flight); 1875 scrub_pending_bio_inc(sctx);
1482 1876
1483 btrfsic_submit_bio(READ, sbio->bio); 1877 if (!sbio->bio->bi_bdev) {
1878 /*
1879 * This case should not happen. If btrfs_map_block() is
1880 * wrong, it could happen for dev-replace operations on
1881 * missing devices when no mirrors are available, but in
1882 * that case the mount should already have failed.
1883 * The case is handled correctly here (but _very_ slowly).
1884 */
1885 printk_ratelimited(KERN_WARNING
1886 "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1887 bio_endio(sbio->bio, -EIO);
1888 } else {
1889 btrfsic_submit_bio(READ, sbio->bio);
1890 }
1484} 1891}
1485 1892
1486static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1893static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1487 struct scrub_page *spage) 1894 struct scrub_page *spage)
1488{ 1895{
1489 struct scrub_block *sblock = spage->sblock; 1896 struct scrub_block *sblock = spage->sblock;
1490 struct scrub_bio *sbio; 1897 struct scrub_bio *sbio;
@@ -1494,28 +1901,29 @@ again:
1494 /* 1901 /*
1495 * grab a fresh bio or wait for one to become available 1902 * grab a fresh bio or wait for one to become available
1496 */ 1903 */
1497 while (sdev->curr == -1) { 1904 while (sctx->curr == -1) {
1498 spin_lock(&sdev->list_lock); 1905 spin_lock(&sctx->list_lock);
1499 sdev->curr = sdev->first_free; 1906 sctx->curr = sctx->first_free;
1500 if (sdev->curr != -1) { 1907 if (sctx->curr != -1) {
1501 sdev->first_free = sdev->bios[sdev->curr]->next_free; 1908 sctx->first_free = sctx->bios[sctx->curr]->next_free;
1502 sdev->bios[sdev->curr]->next_free = -1; 1909 sctx->bios[sctx->curr]->next_free = -1;
1503 sdev->bios[sdev->curr]->page_count = 0; 1910 sctx->bios[sctx->curr]->page_count = 0;
1504 spin_unlock(&sdev->list_lock); 1911 spin_unlock(&sctx->list_lock);
1505 } else { 1912 } else {
1506 spin_unlock(&sdev->list_lock); 1913 spin_unlock(&sctx->list_lock);
1507 wait_event(sdev->list_wait, sdev->first_free != -1); 1914 wait_event(sctx->list_wait, sctx->first_free != -1);
1508 } 1915 }
1509 } 1916 }
1510 sbio = sdev->bios[sdev->curr]; 1917 sbio = sctx->bios[sctx->curr];
1511 if (sbio->page_count == 0) { 1918 if (sbio->page_count == 0) {
1512 struct bio *bio; 1919 struct bio *bio;
1513 1920
1514 sbio->physical = spage->physical; 1921 sbio->physical = spage->physical;
1515 sbio->logical = spage->logical; 1922 sbio->logical = spage->logical;
1923 sbio->dev = spage->dev;
1516 bio = sbio->bio; 1924 bio = sbio->bio;
1517 if (!bio) { 1925 if (!bio) {
1518 bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1926 bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1519 if (!bio) 1927 if (!bio)
1520 return -ENOMEM; 1928 return -ENOMEM;
1521 sbio->bio = bio; 1929 sbio->bio = bio;
@@ -1523,14 +1931,15 @@ again:
1523 1931
1524 bio->bi_private = sbio; 1932 bio->bi_private = sbio;
1525 bio->bi_end_io = scrub_bio_end_io; 1933 bio->bi_end_io = scrub_bio_end_io;
1526 bio->bi_bdev = sdev->dev->bdev; 1934 bio->bi_bdev = sbio->dev->bdev;
1527 bio->bi_sector = spage->physical >> 9; 1935 bio->bi_sector = sbio->physical >> 9;
1528 sbio->err = 0; 1936 sbio->err = 0;
1529 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1937 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1530 spage->physical || 1938 spage->physical ||
1531 sbio->logical + sbio->page_count * PAGE_SIZE != 1939 sbio->logical + sbio->page_count * PAGE_SIZE !=
1532 spage->logical) { 1940 spage->logical ||
1533 scrub_submit(sdev); 1941 sbio->dev != spage->dev) {
1942 scrub_submit(sctx);
1534 goto again; 1943 goto again;
1535 } 1944 }
1536 1945
@@ -1542,81 +1951,87 @@ again:
1542 sbio->bio = NULL; 1951 sbio->bio = NULL;
1543 return -EIO; 1952 return -EIO;
1544 } 1953 }
1545 scrub_submit(sdev); 1954 scrub_submit(sctx);
1546 goto again; 1955 goto again;
1547 } 1956 }
1548 1957
1549 scrub_block_get(sblock); /* one for the added page */ 1958 scrub_block_get(sblock); /* one for the page added to the bio */
1550 atomic_inc(&sblock->outstanding_pages); 1959 atomic_inc(&sblock->outstanding_pages);
1551 sbio->page_count++; 1960 sbio->page_count++;
1552 if (sbio->page_count == sdev->pages_per_bio) 1961 if (sbio->page_count == sctx->pages_per_rd_bio)
1553 scrub_submit(sdev); 1962 scrub_submit(sctx);
1554 1963
1555 return 0; 1964 return 0;
1556} 1965}
1557 1966
1558static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1967static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1559 u64 physical, u64 flags, u64 gen, int mirror_num, 1968 u64 physical, struct btrfs_device *dev, u64 flags,
1560 u8 *csum, int force) 1969 u64 gen, int mirror_num, u8 *csum, int force,
1970 u64 physical_for_dev_replace)
1561{ 1971{
1562 struct scrub_block *sblock; 1972 struct scrub_block *sblock;
1563 int index; 1973 int index;
1564 1974
1565 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1975 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1566 if (!sblock) { 1976 if (!sblock) {
1567 spin_lock(&sdev->stat_lock); 1977 spin_lock(&sctx->stat_lock);
1568 sdev->stat.malloc_errors++; 1978 sctx->stat.malloc_errors++;
1569 spin_unlock(&sdev->stat_lock); 1979 spin_unlock(&sctx->stat_lock);
1570 return -ENOMEM; 1980 return -ENOMEM;
1571 } 1981 }
1572 1982
1573 /* one ref inside this function, plus one for each page later on */ 1983 /* one ref inside this function, plus one for each page added to
1984 * a bio later on */
1574 atomic_set(&sblock->ref_count, 1); 1985 atomic_set(&sblock->ref_count, 1);
1575 sblock->sdev = sdev; 1986 sblock->sctx = sctx;
1576 sblock->no_io_error_seen = 1; 1987 sblock->no_io_error_seen = 1;
1577 1988
1578 for (index = 0; len > 0; index++) { 1989 for (index = 0; len > 0; index++) {
1579 struct scrub_page *spage = sblock->pagev + index; 1990 struct scrub_page *spage;
1580 u64 l = min_t(u64, len, PAGE_SIZE); 1991 u64 l = min_t(u64, len, PAGE_SIZE);
1581 1992
1582 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1993 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1583 spage->page = alloc_page(GFP_NOFS); 1994 if (!spage) {
1584 if (!spage->page) { 1995leave_nomem:
1585 spin_lock(&sdev->stat_lock); 1996 spin_lock(&sctx->stat_lock);
1586 sdev->stat.malloc_errors++; 1997 sctx->stat.malloc_errors++;
1587 spin_unlock(&sdev->stat_lock); 1998 spin_unlock(&sctx->stat_lock);
1588 while (index > 0) { 1999 scrub_block_put(sblock);
1589 index--;
1590 __free_page(sblock->pagev[index].page);
1591 }
1592 kfree(sblock);
1593 return -ENOMEM; 2000 return -ENOMEM;
1594 } 2001 }
2002 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2003 scrub_page_get(spage);
2004 sblock->pagev[index] = spage;
1595 spage->sblock = sblock; 2005 spage->sblock = sblock;
1596 spage->dev = sdev->dev; 2006 spage->dev = dev;
1597 spage->flags = flags; 2007 spage->flags = flags;
1598 spage->generation = gen; 2008 spage->generation = gen;
1599 spage->logical = logical; 2009 spage->logical = logical;
1600 spage->physical = physical; 2010 spage->physical = physical;
2011 spage->physical_for_dev_replace = physical_for_dev_replace;
1601 spage->mirror_num = mirror_num; 2012 spage->mirror_num = mirror_num;
1602 if (csum) { 2013 if (csum) {
1603 spage->have_csum = 1; 2014 spage->have_csum = 1;
1604 memcpy(spage->csum, csum, sdev->csum_size); 2015 memcpy(spage->csum, csum, sctx->csum_size);
1605 } else { 2016 } else {
1606 spage->have_csum = 0; 2017 spage->have_csum = 0;
1607 } 2018 }
1608 sblock->page_count++; 2019 sblock->page_count++;
2020 spage->page = alloc_page(GFP_NOFS);
2021 if (!spage->page)
2022 goto leave_nomem;
1609 len -= l; 2023 len -= l;
1610 logical += l; 2024 logical += l;
1611 physical += l; 2025 physical += l;
2026 physical_for_dev_replace += l;
1612 } 2027 }
1613 2028
1614 BUG_ON(sblock->page_count == 0); 2029 WARN_ON(sblock->page_count == 0);
1615 for (index = 0; index < sblock->page_count; index++) { 2030 for (index = 0; index < sblock->page_count; index++) {
1616 struct scrub_page *spage = sblock->pagev + index; 2031 struct scrub_page *spage = sblock->pagev[index];
1617 int ret; 2032 int ret;
1618 2033
1619 ret = scrub_add_page_to_bio(sdev, spage); 2034 ret = scrub_add_page_to_rd_bio(sctx, spage);
1620 if (ret) { 2035 if (ret) {
1621 scrub_block_put(sblock); 2036 scrub_block_put(sblock);
1622 return ret; 2037 return ret;
@@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1624 } 2039 }
1625 2040
1626 if (force) 2041 if (force)
1627 scrub_submit(sdev); 2042 scrub_submit(sctx);
1628 2043
1629 /* last one frees, either here or in bio completion for last page */ 2044 /* last one frees, either here or in bio completion for last page */
1630 scrub_block_put(sblock); 2045 scrub_block_put(sblock);
@@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1634static void scrub_bio_end_io(struct bio *bio, int err) 2049static void scrub_bio_end_io(struct bio *bio, int err)
1635{ 2050{
1636 struct scrub_bio *sbio = bio->bi_private; 2051 struct scrub_bio *sbio = bio->bi_private;
1637 struct scrub_dev *sdev = sbio->sdev; 2052 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1638 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1639 2053
1640 sbio->err = err; 2054 sbio->err = err;
1641 sbio->bio = bio; 2055 sbio->bio = bio;
@@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
1646static void scrub_bio_end_io_worker(struct btrfs_work *work) 2060static void scrub_bio_end_io_worker(struct btrfs_work *work)
1647{ 2061{
1648 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2062 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1649 struct scrub_dev *sdev = sbio->sdev; 2063 struct scrub_ctx *sctx = sbio->sctx;
1650 int i; 2064 int i;
1651 2065
1652 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 2066 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
1653 if (sbio->err) { 2067 if (sbio->err) {
1654 for (i = 0; i < sbio->page_count; i++) { 2068 for (i = 0; i < sbio->page_count; i++) {
1655 struct scrub_page *spage = sbio->pagev[i]; 2069 struct scrub_page *spage = sbio->pagev[i];
@@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1671 2085
1672 bio_put(sbio->bio); 2086 bio_put(sbio->bio);
1673 sbio->bio = NULL; 2087 sbio->bio = NULL;
1674 spin_lock(&sdev->list_lock); 2088 spin_lock(&sctx->list_lock);
1675 sbio->next_free = sdev->first_free; 2089 sbio->next_free = sctx->first_free;
1676 sdev->first_free = sbio->index; 2090 sctx->first_free = sbio->index;
1677 spin_unlock(&sdev->list_lock); 2091 spin_unlock(&sctx->list_lock);
1678 atomic_dec(&sdev->in_flight); 2092
1679 wake_up(&sdev->list_wait); 2093 if (sctx->is_dev_replace &&
2094 atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2095 mutex_lock(&sctx->wr_ctx.wr_lock);
2096 scrub_wr_submit(sctx);
2097 mutex_unlock(&sctx->wr_ctx.wr_lock);
2098 }
2099
2100 scrub_pending_bio_dec(sctx);
1680} 2101}
1681 2102
1682static void scrub_block_complete(struct scrub_block *sblock) 2103static void scrub_block_complete(struct scrub_block *sblock)
1683{ 2104{
1684 if (!sblock->no_io_error_seen) 2105 if (!sblock->no_io_error_seen) {
1685 scrub_handle_errored_block(sblock); 2106 scrub_handle_errored_block(sblock);
1686 else 2107 } else {
1687 scrub_checksum(sblock); 2108 /*
2109 * If the block has a checksum error, it is written via the
2110 * repair mechanism; otherwise, in the dev-replace case, it is
2111 * written out here.
2112 */
2113 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2114 scrub_write_block_to_dev_replace(sblock);
2115 }
1688} 2116}
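
The branch above encodes the completion policy: an I/O error sends the block down the repair path, a failed checksum reaches the same path from inside scrub_checksum(), and only a block that verified cleanly is copied straight to the target, and then only while a device replace is running. A tiny decision sketch with hypothetical names, not the kernel's actual control flow:

#include <stdio.h>

enum sk_action { SK_NOTHING, SK_REPAIR, SK_COPY_TO_TARGET };

static enum sk_action sk_block_complete(int io_error_seen, int csum_ok,
					int is_dev_replace)
{
	if (io_error_seen || !csum_ok)
		return SK_REPAIR;	/* repair also covers the target */
	return is_dev_replace ? SK_COPY_TO_TARGET : SK_NOTHING;
}

int main(void)
{
	printf("%d\n", sk_block_complete(0, 1, 1));	/* 2: copy to target */
	printf("%d\n", sk_block_complete(0, 0, 1));	/* 1: repair path */
	printf("%d\n", sk_block_complete(0, 1, 0));	/* 0: plain scrub done */
	return 0;
}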
1689 2117
1690static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 2118static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
1691 u8 *csum) 2119 u8 *csum)
1692{ 2120{
1693 struct btrfs_ordered_sum *sum = NULL; 2121 struct btrfs_ordered_sum *sum = NULL;
@@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1695 unsigned long i; 2123 unsigned long i;
1696 unsigned long num_sectors; 2124 unsigned long num_sectors;
1697 2125
1698 while (!list_empty(&sdev->csum_list)) { 2126 while (!list_empty(&sctx->csum_list)) {
1699 sum = list_first_entry(&sdev->csum_list, 2127 sum = list_first_entry(&sctx->csum_list,
1700 struct btrfs_ordered_sum, list); 2128 struct btrfs_ordered_sum, list);
1701 if (sum->bytenr > logical) 2129 if (sum->bytenr > logical)
1702 return 0; 2130 return 0;
1703 if (sum->bytenr + sum->len > logical) 2131 if (sum->bytenr + sum->len > logical)
1704 break; 2132 break;
1705 2133
1706 ++sdev->stat.csum_discards; 2134 ++sctx->stat.csum_discards;
1707 list_del(&sum->list); 2135 list_del(&sum->list);
1708 kfree(sum); 2136 kfree(sum);
1709 sum = NULL; 2137 sum = NULL;
@@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1711 if (!sum) 2139 if (!sum)
1712 return 0; 2140 return 0;
1713 2141
1714 num_sectors = sum->len / sdev->sectorsize; 2142 num_sectors = sum->len / sctx->sectorsize;
1715 for (i = 0; i < num_sectors; ++i) { 2143 for (i = 0; i < num_sectors; ++i) {
1716 if (sum->sums[i].bytenr == logical) { 2144 if (sum->sums[i].bytenr == logical) {
1717 memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 2145 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
1718 ret = 1; 2146 ret = 1;
1719 break; 2147 break;
1720 } 2148 }
@@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1727} 2155}
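
scrub_find_csum() above first discards list entries that end at or before the requested logical address, then scans the matching entry sector by sector. A simplified sketch of that in-entry lookup, assuming stand-in structures (sk_sum is illustrative, and a plain u32 replaces the real checksum buffer):

#include <stdint.h>
#include <stdio.h>

#define SK_SECTORSIZE 4096u

struct sk_sum {
	uint64_t bytenr;	/* logical start covered by this entry */
	uint64_t len;		/* bytes covered */
	uint32_t sums[16];	/* one checksum per sector */
};

/* returns 1 and copies the csum if 'logical' is covered, else 0 */
static int sk_find_csum(const struct sk_sum *sum, uint64_t logical,
			uint32_t *csum)
{
	uint64_t i, num_sectors;

	if (logical < sum->bytenr || logical >= sum->bytenr + sum->len)
		return 0;
	num_sectors = sum->len / SK_SECTORSIZE;
	for (i = 0; i < num_sectors; i++) {
		if (sum->bytenr + i * SK_SECTORSIZE == logical) {
			*csum = sum->sums[i];
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	struct sk_sum sum = { .bytenr = 0x10000, .len = 4 * SK_SECTORSIZE,
			      .sums = { 0x11, 0x22, 0x33, 0x44 } };
	uint32_t csum = 0;

	if (sk_find_csum(&sum, 0x10000 + 2 * SK_SECTORSIZE, &csum))
		printf("csum 0x%x\n", (unsigned)csum);	/* prints 0x33 */
	return 0;
}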
1728 2156
1729/* scrub extent tries to collect up to 64 kB for each bio */ 2157/* scrub extent tries to collect up to 64 kB for each bio */
1730static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 2158static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1731 u64 physical, u64 flags, u64 gen, int mirror_num) 2159 u64 physical, struct btrfs_device *dev, u64 flags,
2160 u64 gen, int mirror_num, u64 physical_for_dev_replace)
1732{ 2161{
1733 int ret; 2162 int ret;
1734 u8 csum[BTRFS_CSUM_SIZE]; 2163 u8 csum[BTRFS_CSUM_SIZE];
1735 u32 blocksize; 2164 u32 blocksize;
1736 2165
1737 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2166 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1738 blocksize = sdev->sectorsize; 2167 blocksize = sctx->sectorsize;
1739 spin_lock(&sdev->stat_lock); 2168 spin_lock(&sctx->stat_lock);
1740 sdev->stat.data_extents_scrubbed++; 2169 sctx->stat.data_extents_scrubbed++;
1741 sdev->stat.data_bytes_scrubbed += len; 2170 sctx->stat.data_bytes_scrubbed += len;
1742 spin_unlock(&sdev->stat_lock); 2171 spin_unlock(&sctx->stat_lock);
1743 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2172 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1744 BUG_ON(sdev->nodesize != sdev->leafsize); 2173 WARN_ON(sctx->nodesize != sctx->leafsize);
1745 blocksize = sdev->nodesize; 2174 blocksize = sctx->nodesize;
1746 spin_lock(&sdev->stat_lock); 2175 spin_lock(&sctx->stat_lock);
1747 sdev->stat.tree_extents_scrubbed++; 2176 sctx->stat.tree_extents_scrubbed++;
1748 sdev->stat.tree_bytes_scrubbed += len; 2177 sctx->stat.tree_bytes_scrubbed += len;
1749 spin_unlock(&sdev->stat_lock); 2178 spin_unlock(&sctx->stat_lock);
1750 } else { 2179 } else {
1751 blocksize = sdev->sectorsize; 2180 blocksize = sctx->sectorsize;
1752 BUG_ON(1); 2181 WARN_ON(1);
1753 } 2182 }
1754 2183
1755 while (len) { 2184 while (len) {
@@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1758 2187
1759 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2188 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1760 /* push csums to sbio */ 2189 /* push csums to sbio */
1761 have_csum = scrub_find_csum(sdev, logical, l, csum); 2190 have_csum = scrub_find_csum(sctx, logical, l, csum);
1762 if (have_csum == 0) 2191 if (have_csum == 0)
1763 ++sdev->stat.no_csum; 2192 ++sctx->stat.no_csum;
2193 if (sctx->is_dev_replace && !have_csum) {
2194 ret = copy_nocow_pages(sctx, logical, l,
2195 mirror_num,
2196 physical_for_dev_replace);
2197 goto behind_scrub_pages;
2198 }
1764 } 2199 }
1765 ret = scrub_pages(sdev, logical, l, physical, flags, gen, 2200 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1766 mirror_num, have_csum ? csum : NULL, 0); 2201 mirror_num, have_csum ? csum : NULL, 0,
2202 physical_for_dev_replace);
2203behind_scrub_pages:
1767 if (ret) 2204 if (ret)
1768 return ret; 2205 return ret;
1769 len -= l; 2206 len -= l;
1770 logical += l; 2207 logical += l;
1771 physical += l; 2208 physical += l;
2209 physical_for_dev_replace += l;
1772 } 2210 }
1773 return 0; 2211 return 0;
1774} 2212}
1775 2213
1776static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 2214static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1777 struct map_lookup *map, int num, u64 base, u64 length) 2215 struct map_lookup *map,
2216 struct btrfs_device *scrub_dev,
2217 int num, u64 base, u64 length,
2218 int is_dev_replace)
1778{ 2219{
1779 struct btrfs_path *path; 2220 struct btrfs_path *path;
1780 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 2221 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1781 struct btrfs_root *root = fs_info->extent_root; 2222 struct btrfs_root *root = fs_info->extent_root;
1782 struct btrfs_root *csum_root = fs_info->csum_root; 2223 struct btrfs_root *csum_root = fs_info->csum_root;
1783 struct btrfs_extent_item *extent; 2224 struct btrfs_extent_item *extent;
@@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1797 struct reada_control *reada2; 2238 struct reada_control *reada2;
1798 struct btrfs_key key_start; 2239 struct btrfs_key key_start;
1799 struct btrfs_key key_end; 2240 struct btrfs_key key_end;
1800
1801 u64 increment = map->stripe_len; 2241 u64 increment = map->stripe_len;
1802 u64 offset; 2242 u64 offset;
2243 u64 extent_logical;
2244 u64 extent_physical;
2245 u64 extent_len;
2246 struct btrfs_device *extent_dev;
2247 int extent_mirror_num;
1803 2248
1804 nstripes = length; 2249 nstripes = length;
1805 offset = 0; 2250 offset = 0;
@@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1843 */ 2288 */
1844 logical = base + offset; 2289 logical = base + offset;
1845 2290
1846 wait_event(sdev->list_wait, 2291 wait_event(sctx->list_wait,
1847 atomic_read(&sdev->in_flight) == 0); 2292 atomic_read(&sctx->bios_in_flight) == 0);
1848 atomic_inc(&fs_info->scrubs_paused); 2293 atomic_inc(&fs_info->scrubs_paused);
1849 wake_up(&fs_info->scrub_pause_wait); 2294 wake_up(&fs_info->scrub_pause_wait);
1850 2295
@@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1898 * canceled? 2343 * canceled?
1899 */ 2344 */
1900 if (atomic_read(&fs_info->scrub_cancel_req) || 2345 if (atomic_read(&fs_info->scrub_cancel_req) ||
1901 atomic_read(&sdev->cancel_req)) { 2346 atomic_read(&sctx->cancel_req)) {
1902 ret = -ECANCELED; 2347 ret = -ECANCELED;
1903 goto out; 2348 goto out;
1904 } 2349 }
@@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1907 */ 2352 */
1908 if (atomic_read(&fs_info->scrub_pause_req)) { 2353 if (atomic_read(&fs_info->scrub_pause_req)) {
1909 /* push queued extents */ 2354 /* push queued extents */
1910 scrub_submit(sdev); 2355 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
1911 wait_event(sdev->list_wait, 2356 scrub_submit(sctx);
1912 atomic_read(&sdev->in_flight) == 0); 2357 mutex_lock(&sctx->wr_ctx.wr_lock);
2358 scrub_wr_submit(sctx);
2359 mutex_unlock(&sctx->wr_ctx.wr_lock);
2360 wait_event(sctx->list_wait,
2361 atomic_read(&sctx->bios_in_flight) == 0);
2362 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
1913 atomic_inc(&fs_info->scrubs_paused); 2363 atomic_inc(&fs_info->scrubs_paused);
1914 wake_up(&fs_info->scrub_pause_wait); 2364 wake_up(&fs_info->scrub_pause_wait);
1915 mutex_lock(&fs_info->scrub_lock); 2365 mutex_lock(&fs_info->scrub_lock);
@@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1926 2376
1927 ret = btrfs_lookup_csums_range(csum_root, logical, 2377 ret = btrfs_lookup_csums_range(csum_root, logical,
1928 logical + map->stripe_len - 1, 2378 logical + map->stripe_len - 1,
1929 &sdev->csum_list, 1); 2379 &sctx->csum_list, 1);
1930 if (ret) 2380 if (ret)
1931 goto out; 2381 goto out;
1932 2382
@@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
2004 key.objectid; 2454 key.objectid;
2005 } 2455 }
2006 2456
2007 ret = scrub_extent(sdev, key.objectid, key.offset, 2457 extent_logical = key.objectid;
2008 key.objectid - logical + physical, 2458 extent_physical = key.objectid - logical + physical;
2009 flags, generation, mirror_num); 2459 extent_len = key.offset;
2460 extent_dev = scrub_dev;
2461 extent_mirror_num = mirror_num;
2462 if (is_dev_replace)
2463 scrub_remap_extent(fs_info, extent_logical,
2464 extent_len, &extent_physical,
2465 &extent_dev,
2466 &extent_mirror_num);
2467 ret = scrub_extent(sctx, extent_logical, extent_len,
2468 extent_physical, extent_dev, flags,
2469 generation, extent_mirror_num,
2470 key.objectid - logical + physical);
2010 if (ret) 2471 if (ret)
2011 goto out; 2472 goto out;
2012 2473
@@ -2016,29 +2477,34 @@ next:
2016 btrfs_release_path(path); 2477 btrfs_release_path(path);
2017 logical += increment; 2478 logical += increment;
2018 physical += map->stripe_len; 2479 physical += map->stripe_len;
2019 spin_lock(&sdev->stat_lock); 2480 spin_lock(&sctx->stat_lock);
2020 sdev->stat.last_physical = physical; 2481 sctx->stat.last_physical = physical;
2021 spin_unlock(&sdev->stat_lock); 2482 spin_unlock(&sctx->stat_lock);
2022 } 2483 }
2484out:
2023 /* push queued extents */ 2485 /* push queued extents */
2024 scrub_submit(sdev); 2486 scrub_submit(sctx);
2487 mutex_lock(&sctx->wr_ctx.wr_lock);
2488 scrub_wr_submit(sctx);
2489 mutex_unlock(&sctx->wr_ctx.wr_lock);
2025 2490
2026out:
2027 blk_finish_plug(&plug); 2491 blk_finish_plug(&plug);
2028 btrfs_free_path(path); 2492 btrfs_free_path(path);
2029 return ret < 0 ? ret : 0; 2493 return ret < 0 ? ret : 0;
2030} 2494}
2031 2495
2032static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, 2496static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2033 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, 2497 struct btrfs_device *scrub_dev,
2034 u64 dev_offset) 2498 u64 chunk_tree, u64 chunk_objectid,
2499 u64 chunk_offset, u64 length,
2500 u64 dev_offset, int is_dev_replace)
2035{ 2501{
2036 struct btrfs_mapping_tree *map_tree = 2502 struct btrfs_mapping_tree *map_tree =
2037 &sdev->dev->dev_root->fs_info->mapping_tree; 2503 &sctx->dev_root->fs_info->mapping_tree;
2038 struct map_lookup *map; 2504 struct map_lookup *map;
2039 struct extent_map *em; 2505 struct extent_map *em;
2040 int i; 2506 int i;
2041 int ret = -EINVAL; 2507 int ret = 0;
2042 2508
2043 read_lock(&map_tree->map_tree.lock); 2509 read_lock(&map_tree->map_tree.lock);
2044 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2510 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
2055 goto out; 2521 goto out;
2056 2522
2057 for (i = 0; i < map->num_stripes; ++i) { 2523 for (i = 0; i < map->num_stripes; ++i) {
2058 if (map->stripes[i].dev == sdev->dev && 2524 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2059 map->stripes[i].physical == dev_offset) { 2525 map->stripes[i].physical == dev_offset) {
2060 ret = scrub_stripe(sdev, map, i, chunk_offset, length); 2526 ret = scrub_stripe(sctx, map, scrub_dev, i,
2527 chunk_offset, length,
2528 is_dev_replace);
2061 if (ret) 2529 if (ret)
2062 goto out; 2530 goto out;
2063 } 2531 }
@@ -2069,11 +2537,13 @@ out:
2069} 2537}
2070 2538
2071static noinline_for_stack 2539static noinline_for_stack
2072int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) 2540int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2541 struct btrfs_device *scrub_dev, u64 start, u64 end,
2542 int is_dev_replace)
2073{ 2543{
2074 struct btrfs_dev_extent *dev_extent = NULL; 2544 struct btrfs_dev_extent *dev_extent = NULL;
2075 struct btrfs_path *path; 2545 struct btrfs_path *path;
2076 struct btrfs_root *root = sdev->dev->dev_root; 2546 struct btrfs_root *root = sctx->dev_root;
2077 struct btrfs_fs_info *fs_info = root->fs_info; 2547 struct btrfs_fs_info *fs_info = root->fs_info;
2078 u64 length; 2548 u64 length;
2079 u64 chunk_tree; 2549 u64 chunk_tree;
@@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2085 struct btrfs_key key; 2555 struct btrfs_key key;
2086 struct btrfs_key found_key; 2556 struct btrfs_key found_key;
2087 struct btrfs_block_group_cache *cache; 2557 struct btrfs_block_group_cache *cache;
2558 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2088 2559
2089 path = btrfs_alloc_path(); 2560 path = btrfs_alloc_path();
2090 if (!path) 2561 if (!path)
@@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2094 path->search_commit_root = 1; 2565 path->search_commit_root = 1;
2095 path->skip_locking = 1; 2566 path->skip_locking = 1;
2096 2567
2097 key.objectid = sdev->dev->devid; 2568 key.objectid = scrub_dev->devid;
2098 key.offset = 0ull; 2569 key.offset = 0ull;
2099 key.type = BTRFS_DEV_EXTENT_KEY; 2570 key.type = BTRFS_DEV_EXTENT_KEY;
2100 2571
2101
2102 while (1) { 2572 while (1) {
2103 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2573 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2104 if (ret < 0) 2574 if (ret < 0)
@@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2117 2587
2118 btrfs_item_key_to_cpu(l, &found_key, slot); 2588 btrfs_item_key_to_cpu(l, &found_key, slot);
2119 2589
2120 if (found_key.objectid != sdev->dev->devid) 2590 if (found_key.objectid != scrub_dev->devid)
2121 break; 2591 break;
2122 2592
2123 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2593 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2151 ret = -ENOENT; 2621 ret = -ENOENT;
2152 break; 2622 break;
2153 } 2623 }
2154 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2624 dev_replace->cursor_right = found_key.offset + length;
2155 chunk_offset, length, found_key.offset); 2625 dev_replace->cursor_left = found_key.offset;
2626 dev_replace->item_needs_writeback = 1;
2627 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2628 chunk_offset, length, found_key.offset,
2629 is_dev_replace);
2630
2631 /*
2632 * Flush and submit all pending read and write bios, and
2633 * afterwards wait for them to complete.
2634 * Note that in the dev-replace case, a read request triggers
2635 * write requests that are submitted from within the read
2636 * completion worker. Therefore all write requests must be
2637 * flushed as well, so that all read and write requests have
2638 * really completed when bios_in_flight
2639 * changes to 0.
2640 */
2641 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2642 scrub_submit(sctx);
2643 mutex_lock(&sctx->wr_ctx.wr_lock);
2644 scrub_wr_submit(sctx);
2645 mutex_unlock(&sctx->wr_ctx.wr_lock);
2646
2647 wait_event(sctx->list_wait,
2648 atomic_read(&sctx->bios_in_flight) == 0);
2649 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2650 atomic_inc(&fs_info->scrubs_paused);
2651 wake_up(&fs_info->scrub_pause_wait);
2652 wait_event(sctx->list_wait,
2653 atomic_read(&sctx->workers_pending) == 0);
2654
2655 mutex_lock(&fs_info->scrub_lock);
2656 while (atomic_read(&fs_info->scrub_pause_req)) {
2657 mutex_unlock(&fs_info->scrub_lock);
2658 wait_event(fs_info->scrub_pause_wait,
2659 atomic_read(&fs_info->scrub_pause_req) == 0);
2660 mutex_lock(&fs_info->scrub_lock);
2661 }
2662 atomic_dec(&fs_info->scrubs_paused);
2663 mutex_unlock(&fs_info->scrub_lock);
2664 wake_up(&fs_info->scrub_pause_wait);
2665
2666 dev_replace->cursor_left = dev_replace->cursor_right;
2667 dev_replace->item_needs_writeback = 1;
2156 btrfs_put_block_group(cache); 2668 btrfs_put_block_group(cache);
2157 if (ret) 2669 if (ret)
2158 break; 2670 break;
2671 if (is_dev_replace &&
2672 atomic64_read(&dev_replace->num_write_errors) > 0) {
2673 ret = -EIO;
2674 break;
2675 }
2676 if (sctx->stat.malloc_errors > 0) {
2677 ret = -ENOMEM;
2678 break;
2679 }
2159 2680
2160 key.offset = found_key.offset + length; 2681 key.offset = found_key.offset + length;
2161 btrfs_release_path(path); 2682 btrfs_release_path(path);
@@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2170 return ret < 0 ? ret : 0; 2691 return ret < 0 ? ret : 0;
2171} 2692}
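
The chunk loop above has to flush writes before it waits, because a read completion can still queue a write; only after that flush does bios_in_flight == 0 mean "everything is done". That ordering can be modeled in userspace with a counter and a condition variable; pending_inc/pending_dec below are stand-ins for scrub_pending_bio_inc()/scrub_pending_bio_dec(), and the worker mimics a read completion that queues a follow-up write before signalling:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int in_flight;

static void pending_inc(void)
{
	pthread_mutex_lock(&lock);
	in_flight++;
	pthread_mutex_unlock(&lock);
}

static void pending_dec(void)
{
	pthread_mutex_lock(&lock);
	if (--in_flight == 0)
		pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

static void *completion_worker(void *arg)
{
	(void)arg;
	pending_inc();		/* queue the follow-up write first... */
	pending_dec();		/* ...then complete the read */
	pending_dec();		/* the write completes later */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pending_inc();		/* submit the read */
	pthread_create(&t, NULL, completion_worker, NULL);
	pthread_mutex_lock(&lock);
	while (in_flight != 0)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	printf("all reads and writes completed\n");
	return 0;
}

Because the worker bumps the counter for the write before dropping it for the read, the count never touches zero while work is still implied, which is exactly what the flush_all_writes discipline guarantees.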
2172 2693
2173static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2694static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2695 struct btrfs_device *scrub_dev)
2174{ 2696{
2175 int i; 2697 int i;
2176 u64 bytenr; 2698 u64 bytenr;
2177 u64 gen; 2699 u64 gen;
2178 int ret; 2700 int ret;
2179 struct btrfs_device *device = sdev->dev; 2701 struct btrfs_root *root = sctx->dev_root;
2180 struct btrfs_root *root = device->dev_root;
2181 2702
2182 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2703 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2183 return -EIO; 2704 return -EIO;
@@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2186 2707
2187 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2708 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2188 bytenr = btrfs_sb_offset(i); 2709 bytenr = btrfs_sb_offset(i);
2189 if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2710 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2190 break; 2711 break;
2191 2712
2192 ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2713 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2193 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2714 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2715 NULL, 1, bytenr);
2194 if (ret) 2716 if (ret)
2195 return ret; 2717 return ret;
2196 } 2718 }
2197 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2719 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2198 2720
2199 return 0; 2721 return 0;
2200} 2722}
@@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2202/* 2724/*
2203 * get a reference count on fs_info->scrub_workers. start worker if necessary 2725 * get a reference count on fs_info->scrub_workers. start worker if necessary
2204 */ 2726 */
2205static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2727static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2728 int is_dev_replace)
2206{ 2729{
2207 struct btrfs_fs_info *fs_info = root->fs_info;
2208 int ret = 0; 2730 int ret = 0;
2209 2731
2210 mutex_lock(&fs_info->scrub_lock); 2732 mutex_lock(&fs_info->scrub_lock);
2211 if (fs_info->scrub_workers_refcnt == 0) { 2733 if (fs_info->scrub_workers_refcnt == 0) {
2212 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2734 if (is_dev_replace)
2213 fs_info->thread_pool_size, &fs_info->generic_worker); 2735 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2736 &fs_info->generic_worker);
2737 else
2738 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2739 fs_info->thread_pool_size,
2740 &fs_info->generic_worker);
2214 fs_info->scrub_workers.idle_thresh = 4; 2741 fs_info->scrub_workers.idle_thresh = 4;
2215 ret = btrfs_start_workers(&fs_info->scrub_workers); 2742 ret = btrfs_start_workers(&fs_info->scrub_workers);
2216 if (ret) 2743 if (ret)
2217 goto out; 2744 goto out;
2745 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2746 "scrubwrc",
2747 fs_info->thread_pool_size,
2748 &fs_info->generic_worker);
2749 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2750 ret = btrfs_start_workers(
2751 &fs_info->scrub_wr_completion_workers);
2752 if (ret)
2753 goto out;
2754 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2755 &fs_info->generic_worker);
2756 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2757 if (ret)
2758 goto out;
2218 } 2759 }
2219 ++fs_info->scrub_workers_refcnt; 2760 ++fs_info->scrub_workers_refcnt;
2220out: 2761out:
@@ -2223,40 +2764,41 @@ out:
2223 return ret; 2764 return ret;
2224} 2765}
2225 2766
2226static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2767static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2227{ 2768{
2228 struct btrfs_fs_info *fs_info = root->fs_info;
2229
2230 mutex_lock(&fs_info->scrub_lock); 2769 mutex_lock(&fs_info->scrub_lock);
2231 if (--fs_info->scrub_workers_refcnt == 0) 2770 if (--fs_info->scrub_workers_refcnt == 0) {
2232 btrfs_stop_workers(&fs_info->scrub_workers); 2771 btrfs_stop_workers(&fs_info->scrub_workers);
2772 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2773 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2774 }
2233 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2775 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2234 mutex_unlock(&fs_info->scrub_lock); 2776 mutex_unlock(&fs_info->scrub_lock);
2235} 2777}
2236 2778
2237 2779int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2238int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2780 u64 end, struct btrfs_scrub_progress *progress,
2239 struct btrfs_scrub_progress *progress, int readonly) 2781 int readonly, int is_dev_replace)
2240{ 2782{
2241 struct scrub_dev *sdev; 2783 struct scrub_ctx *sctx;
2242 struct btrfs_fs_info *fs_info = root->fs_info;
2243 int ret; 2784 int ret;
2244 struct btrfs_device *dev; 2785 struct btrfs_device *dev;
2245 2786
2246 if (btrfs_fs_closing(root->fs_info)) 2787 if (btrfs_fs_closing(fs_info))
2247 return -EINVAL; 2788 return -EINVAL;
2248 2789
2249 /* 2790 /*
2250 * check some assumptions 2791 * check some assumptions
2251 */ 2792 */
2252 if (root->nodesize != root->leafsize) { 2793 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2253 printk(KERN_ERR 2794 printk(KERN_ERR
2254 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2795 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2255 root->nodesize, root->leafsize); 2796 fs_info->chunk_root->nodesize,
2797 fs_info->chunk_root->leafsize);
2256 return -EINVAL; 2798 return -EINVAL;
2257 } 2799 }
2258 2800
2259 if (root->nodesize > BTRFS_STRIPE_LEN) { 2801 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2260 /* 2802 /*
2261 * in this case scrub is unable to calculate the checksum 2803 * in this case scrub is unable to calculate the checksum
2262 * the way scrub is implemented. Do not handle this 2804 * the way scrub is implemented. Do not handle this
@@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2264 */ 2806 */
2265 printk(KERN_ERR 2807 printk(KERN_ERR
2266 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2808 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2267 root->nodesize, BTRFS_STRIPE_LEN); 2809 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2268 return -EINVAL; 2810 return -EINVAL;
2269 } 2811 }
2270 2812
2271 if (root->sectorsize != PAGE_SIZE) { 2813 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2272 /* not supported for data w/o checksums */ 2814 /* not supported for data w/o checksums */
2273 printk(KERN_ERR 2815 printk(KERN_ERR
2274 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2816 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2275 root->sectorsize, (unsigned long long)PAGE_SIZE); 2817 fs_info->chunk_root->sectorsize,
2818 (unsigned long long)PAGE_SIZE);
2276 return -EINVAL; 2819 return -EINVAL;
2277 } 2820 }
2278 2821
2279 ret = scrub_workers_get(root); 2822 if (fs_info->chunk_root->nodesize >
2823 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2824 fs_info->chunk_root->sectorsize >
2825 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2826 /*
2827 * would exhaust the array bounds of pagev member in
2828 * struct scrub_block
2829 */
2830 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2831 fs_info->chunk_root->nodesize,
2832 SCRUB_MAX_PAGES_PER_BLOCK,
2833 fs_info->chunk_root->sectorsize,
2834 SCRUB_MAX_PAGES_PER_BLOCK);
2835 return -EINVAL;
2836 }
2837
2838 ret = scrub_workers_get(fs_info, is_dev_replace);
2280 if (ret) 2839 if (ret)
2281 return ret; 2840 return ret;
2282 2841
2283 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2842 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2284 dev = btrfs_find_device(root, devid, NULL, NULL); 2843 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2285 if (!dev || dev->missing) { 2844 if (!dev || (dev->missing && !is_dev_replace)) {
2286 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2845 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2287 scrub_workers_put(root); 2846 scrub_workers_put(fs_info);
2288 return -ENODEV; 2847 return -ENODEV;
2289 } 2848 }
2290 mutex_lock(&fs_info->scrub_lock); 2849 mutex_lock(&fs_info->scrub_lock);
2291 2850
2292 if (!dev->in_fs_metadata) { 2851 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2293 mutex_unlock(&fs_info->scrub_lock); 2852 mutex_unlock(&fs_info->scrub_lock);
2294 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2853 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2295 scrub_workers_put(root); 2854 scrub_workers_put(fs_info);
2296 return -ENODEV; 2855 return -EIO;
2297 } 2856 }
2298 2857
2299 if (dev->scrub_device) { 2858 btrfs_dev_replace_lock(&fs_info->dev_replace);
2859 if (dev->scrub_device ||
2860 (!is_dev_replace &&
2861 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2862 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2300 mutex_unlock(&fs_info->scrub_lock); 2863 mutex_unlock(&fs_info->scrub_lock);
2301 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2864 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2302 scrub_workers_put(root); 2865 scrub_workers_put(fs_info);
2303 return -EINPROGRESS; 2866 return -EINPROGRESS;
2304 } 2867 }
2305 sdev = scrub_setup_dev(dev); 2868 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2306 if (IS_ERR(sdev)) { 2869 sctx = scrub_setup_ctx(dev, is_dev_replace);
2870 if (IS_ERR(sctx)) {
2307 mutex_unlock(&fs_info->scrub_lock); 2871 mutex_unlock(&fs_info->scrub_lock);
2308 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2872 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2309 scrub_workers_put(root); 2873 scrub_workers_put(fs_info);
2310 return PTR_ERR(sdev); 2874 return PTR_ERR(sctx);
2311 } 2875 }
2312 sdev->readonly = readonly; 2876 sctx->readonly = readonly;
2313 dev->scrub_device = sdev; 2877 dev->scrub_device = sctx;
2314 2878
2315 atomic_inc(&fs_info->scrubs_running); 2879 atomic_inc(&fs_info->scrubs_running);
2316 mutex_unlock(&fs_info->scrub_lock); 2880 mutex_unlock(&fs_info->scrub_lock);
2317 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2881 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2318 2882
2319 down_read(&fs_info->scrub_super_lock); 2883 if (!is_dev_replace) {
2320 ret = scrub_supers(sdev); 2884 down_read(&fs_info->scrub_super_lock);
2321 up_read(&fs_info->scrub_super_lock); 2885 ret = scrub_supers(sctx, dev);
2886 up_read(&fs_info->scrub_super_lock);
2887 }
2322 2888
2323 if (!ret) 2889 if (!ret)
2324 ret = scrub_enumerate_chunks(sdev, start, end); 2890 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2891 is_dev_replace);
2325 2892
2326 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2893 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2327 atomic_dec(&fs_info->scrubs_running); 2894 atomic_dec(&fs_info->scrubs_running);
2328 wake_up(&fs_info->scrub_pause_wait); 2895 wake_up(&fs_info->scrub_pause_wait);
2329 2896
2330 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2897 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2331 2898
2332 if (progress) 2899 if (progress)
2333 memcpy(progress, &sdev->stat, sizeof(*progress)); 2900 memcpy(progress, &sctx->stat, sizeof(*progress));
2334 2901
2335 mutex_lock(&fs_info->scrub_lock); 2902 mutex_lock(&fs_info->scrub_lock);
2336 dev->scrub_device = NULL; 2903 dev->scrub_device = NULL;
2337 mutex_unlock(&fs_info->scrub_lock); 2904 mutex_unlock(&fs_info->scrub_lock);
2338 2905
2339 scrub_free_dev(sdev); 2906 scrub_free_ctx(sctx);
2340 scrub_workers_put(root); 2907 scrub_workers_put(fs_info);
2341 2908
2342 return ret; 2909 return ret;
2343} 2910}
@@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
2377 up_write(&root->fs_info->scrub_super_lock); 2944 up_write(&root->fs_info->scrub_super_lock);
2378} 2945}
2379 2946
2380int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2947int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2381{ 2948{
2382
2383 mutex_lock(&fs_info->scrub_lock); 2949 mutex_lock(&fs_info->scrub_lock);
2384 if (!atomic_read(&fs_info->scrubs_running)) { 2950 if (!atomic_read(&fs_info->scrubs_running)) {
2385 mutex_unlock(&fs_info->scrub_lock); 2951 mutex_unlock(&fs_info->scrub_lock);
@@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2399 return 0; 2965 return 0;
2400} 2966}
2401 2967
2402int btrfs_scrub_cancel(struct btrfs_root *root) 2968int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2969 struct btrfs_device *dev)
2403{ 2970{
2404 return __btrfs_scrub_cancel(root->fs_info); 2971 struct scrub_ctx *sctx;
2405}
2406
2407int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2408{
2409 struct btrfs_fs_info *fs_info = root->fs_info;
2410 struct scrub_dev *sdev;
2411 2972
2412 mutex_lock(&fs_info->scrub_lock); 2973 mutex_lock(&fs_info->scrub_lock);
2413 sdev = dev->scrub_device; 2974 sctx = dev->scrub_device;
2414 if (!sdev) { 2975 if (!sctx) {
2415 mutex_unlock(&fs_info->scrub_lock); 2976 mutex_unlock(&fs_info->scrub_lock);
2416 return -ENOTCONN; 2977 return -ENOTCONN;
2417 } 2978 }
2418 atomic_inc(&sdev->cancel_req); 2979 atomic_inc(&sctx->cancel_req);
2419 while (dev->scrub_device) { 2980 while (dev->scrub_device) {
2420 mutex_unlock(&fs_info->scrub_lock); 2981 mutex_unlock(&fs_info->scrub_lock);
2421 wait_event(fs_info->scrub_pause_wait, 2982 wait_event(fs_info->scrub_pause_wait,
@@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2438 * does not go away in cancel_dev. FIXME: find a better solution 2999 * does not go away in cancel_dev. FIXME: find a better solution
2439 */ 3000 */
2440 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3001 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2441 dev = btrfs_find_device(root, devid, NULL, NULL); 3002 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2442 if (!dev) { 3003 if (!dev) {
2443 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3004 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2444 return -ENODEV; 3005 return -ENODEV;
2445 } 3006 }
2446 ret = btrfs_scrub_cancel_dev(root, dev); 3007 ret = btrfs_scrub_cancel_dev(fs_info, dev);
2447 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3008 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2448 3009
2449 return ret; 3010 return ret;
@@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2453 struct btrfs_scrub_progress *progress) 3014 struct btrfs_scrub_progress *progress)
2454{ 3015{
2455 struct btrfs_device *dev; 3016 struct btrfs_device *dev;
2456 struct scrub_dev *sdev = NULL; 3017 struct scrub_ctx *sctx = NULL;
2457 3018
2458 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3019 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2459 dev = btrfs_find_device(root, devid, NULL, NULL); 3020 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
2460 if (dev) 3021 if (dev)
2461 sdev = dev->scrub_device; 3022 sctx = dev->scrub_device;
2462 if (sdev) 3023 if (sctx)
2463 memcpy(progress, &sdev->stat, sizeof(*progress)); 3024 memcpy(progress, &sctx->stat, sizeof(*progress));
2464 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3025 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2465 3026
2466 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; 3027 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3028}
3029
3030static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3031 u64 extent_logical, u64 extent_len,
3032 u64 *extent_physical,
3033 struct btrfs_device **extent_dev,
3034 int *extent_mirror_num)
3035{
3036 u64 mapped_length;
3037 struct btrfs_bio *bbio = NULL;
3038 int ret;
3039
3040 mapped_length = extent_len;
3041 ret = btrfs_map_block(fs_info, READ, extent_logical,
3042 &mapped_length, &bbio, 0);
3043 if (ret || !bbio || mapped_length < extent_len ||
3044 !bbio->stripes[0].dev->bdev) {
3045 kfree(bbio);
3046 return;
3047 }
3048
3049 *extent_physical = bbio->stripes[0].physical;
3050 *extent_mirror_num = bbio->mirror_num;
3051 *extent_dev = bbio->stripes[0].dev;
3052 kfree(bbio);
3053}
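
scrub_remap_extent() delegates the logical-to-physical translation to the block mapper and keeps only stripe 0 of the answer; on any surprise (error, short mapping, stripe without a bdev) it frees the mapping and returns with the out parameters untouched. The following stand-alone C sketch models that contract; map_block() is an invented stub standing in for btrfs_map_block(), not the real call:

#include <stdio.h>

struct stripe { unsigned long long physical; int devid; };
struct bio_map { int nr_stripes; int mirror; struct stripe s[4]; };

/* Invented stub mapper: pretend the range maps 1:1 onto device 1. */
static int map_block(unsigned long long logical, unsigned long long *len,
		     struct bio_map *out)
{
	if (!*len)
		return -1;
	out->nr_stripes = 1;
	out->mirror = 1;
	out->s[0].physical = logical;
	out->s[0].devid = 1;
	return 0;
}

/* Mirrors the contract above: on any surprise, return without
 * touching the out parameters, like the early kfree+return path. */
static int remap_extent(unsigned long long logical, unsigned long long len,
			unsigned long long *physical, int *devid, int *mirror)
{
	unsigned long long mapped = len;
	struct bio_map m;

	if (map_block(logical, &mapped, &m) || mapped < len)
		return -1;		/* failed or short mapping */
	*physical = m.s[0].physical;	/* stripe 0 only */
	*devid = m.s[0].devid;
	*mirror = m.mirror;
	return 0;
}

int main(void)
{
	unsigned long long phys;
	int devid, mirror;

	if (!remap_extent(65536, 4096, &phys, &devid, &mirror))
		printf("physical %llu on dev %d (mirror %d)\n",
		       phys, devid, mirror);
	return 0;
}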
3054
3055static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3056 struct scrub_wr_ctx *wr_ctx,
3057 struct btrfs_fs_info *fs_info,
3058 struct btrfs_device *dev,
3059 int is_dev_replace)
3060{
3061 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3062
3063 mutex_init(&wr_ctx->wr_lock);
3064 wr_ctx->wr_curr_bio = NULL;
3065 if (!is_dev_replace)
3066 return 0;
3067
3068 WARN_ON(!dev->bdev);
3069 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3070 bio_get_nr_vecs(dev->bdev));
3071 wr_ctx->tgtdev = dev;
3072 atomic_set(&wr_ctx->flush_all_writes, 0);
3073 return 0;
3074}
3075
3076static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3077{
3078 mutex_lock(&wr_ctx->wr_lock);
3079 kfree(wr_ctx->wr_curr_bio);
3080 wr_ctx->wr_curr_bio = NULL;
3081 mutex_unlock(&wr_ctx->wr_lock);
3082}
3083
3084static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3085 int mirror_num, u64 physical_for_dev_replace)
3086{
3087 struct scrub_copy_nocow_ctx *nocow_ctx;
3088 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3089
3090 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3091 if (!nocow_ctx) {
3092 spin_lock(&sctx->stat_lock);
3093 sctx->stat.malloc_errors++;
3094 spin_unlock(&sctx->stat_lock);
3095 return -ENOMEM;
3096 }
3097
3098 scrub_pending_trans_workers_inc(sctx);
3099
3100 nocow_ctx->sctx = sctx;
3101 nocow_ctx->logical = logical;
3102 nocow_ctx->len = len;
3103 nocow_ctx->mirror_num = mirror_num;
3104 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3105 nocow_ctx->work.func = copy_nocow_pages_worker;
3106 btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3107 &nocow_ctx->work);
3108
3109 return 0;
3110}
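
copy_nocow_pages() itself is only a handoff: the context is allocated, the pending-workers count is bumped before the work item is queued, and ownership of the allocation passes to the worker, which later frees it and drops the count that the scrub teardown waits on. A minimal userspace analogue of that lifecycle, assuming pthreads stand in for the btrfs worker pool (all names invented):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct nocow_work {
	unsigned long long logical;
	unsigned long long len;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;
static int workers_pending;

static void *nocow_worker(void *arg)
{
	struct nocow_work *w = arg;

	printf("copy %llu bytes at logical %llu\n", w->len, w->logical);
	free(w);			/* the worker owns the context */

	pthread_mutex_lock(&lock);
	if (--workers_pending == 0)
		pthread_cond_signal(&idle);	/* cf. sctx->list_wait */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	struct nocow_work *w = calloc(1, sizeof(*w));
	pthread_t t;

	if (!w)
		return 1;		/* cf. stat.malloc_errors++ */
	w->logical = 8192;
	w->len = 4096;

	pthread_mutex_lock(&lock);
	workers_pending++;		/* account *before* queueing */
	pthread_mutex_unlock(&lock);

	pthread_create(&t, NULL, nocow_worker, w);

	pthread_mutex_lock(&lock);
	while (workers_pending)		/* cf. wait_event(...) */
		pthread_cond_wait(&idle, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}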
3111
3112static void copy_nocow_pages_worker(struct btrfs_work *work)
3113{
3114 struct scrub_copy_nocow_ctx *nocow_ctx =
3115 container_of(work, struct scrub_copy_nocow_ctx, work);
3116 struct scrub_ctx *sctx = nocow_ctx->sctx;
3117 u64 logical = nocow_ctx->logical;
3118 u64 len = nocow_ctx->len;
3119 int mirror_num = nocow_ctx->mirror_num;
3120 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3121 int ret;
3122 struct btrfs_trans_handle *trans = NULL;
3123 struct btrfs_fs_info *fs_info;
3124 struct btrfs_path *path;
3125 struct btrfs_root *root;
3126 int not_written = 0;
3127
3128 fs_info = sctx->dev_root->fs_info;
3129 root = fs_info->extent_root;
3130
3131 path = btrfs_alloc_path();
3132 if (!path) {
3133 spin_lock(&sctx->stat_lock);
3134 sctx->stat.malloc_errors++;
3135 spin_unlock(&sctx->stat_lock);
3136 not_written = 1;
3137 goto out;
3138 }
3139
3140 trans = btrfs_join_transaction(root);
3141 if (IS_ERR(trans)) {
3142 not_written = 1;
3143 goto out;
3144 }
3145
3146 ret = iterate_inodes_from_logical(logical, fs_info, path,
3147 copy_nocow_pages_for_inode,
3148 nocow_ctx);
3149 if (ret != 0 && ret != -ENOENT) {
3150 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3151 (unsigned long long)logical,
3152 (unsigned long long)physical_for_dev_replace,
3153 (unsigned long long)len,
3154 (unsigned long long)mirror_num, ret);
3155 not_written = 1;
3156 goto out;
3157 }
3158
3159out:
3160 if (trans && !IS_ERR(trans))
3161 btrfs_end_transaction(trans, root);
3162 if (not_written)
3163 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3164 num_uncorrectable_read_errors);
3165
3166 btrfs_free_path(path);
3167 kfree(nocow_ctx);
3168
3169 scrub_pending_trans_workers_dec(sctx);
3170}
3171
3172static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3173{
3174 unsigned long index;
3175 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3176 int ret = 0;
3177 struct btrfs_key key;
3178 struct inode *inode = NULL;
3179 struct btrfs_root *local_root;
3180 u64 physical_for_dev_replace;
3181 u64 len;
3182 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3183
3184 key.objectid = root;
3185 key.type = BTRFS_ROOT_ITEM_KEY;
3186 key.offset = (u64)-1;
3187 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3188 if (IS_ERR(local_root))
3189 return PTR_ERR(local_root);
3190
3191 key.type = BTRFS_INODE_ITEM_KEY;
3192 key.objectid = inum;
3193 key.offset = 0;
3194 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3195 if (IS_ERR(inode))
3196 return PTR_ERR(inode);
3197
3198 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3199 len = nocow_ctx->len;
3200 while (len >= PAGE_CACHE_SIZE) {
3201 struct page *page = NULL;
3202 int ret_sub;
3203
3204 index = offset >> PAGE_CACHE_SHIFT;
3205
3206 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3207 if (!page) {
3208 pr_err("find_or_create_page() failed\n");
3209 ret = -ENOMEM;
3210 goto next_page;
3211 }
3212
3213 if (PageUptodate(page)) {
3214 if (PageDirty(page))
3215 goto next_page;
3216 } else {
3217 ClearPageError(page);
3218 ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3219 io_tree,
3220 page, btrfs_get_extent,
3221 nocow_ctx->mirror_num);
3222 if (ret_sub) {
3223 ret = ret_sub;
3224 goto next_page;
3225 }
3226 wait_on_page_locked(page);
3227 if (!PageUptodate(page)) {
3228 ret = -EIO;
3229 goto next_page;
3230 }
3231 }
3232 ret_sub = write_page_nocow(nocow_ctx->sctx,
3233 physical_for_dev_replace, page);
3234 if (ret_sub) {
3235 ret = ret_sub;
3236 goto next_page;
3237 }
3238
3239next_page:
3240 if (page) {
3241 unlock_page(page);
3242 put_page(page);
3243 }
3244 offset += PAGE_CACHE_SIZE;
3245 physical_for_dev_replace += PAGE_CACHE_SIZE;
3246 len -= PAGE_CACHE_SIZE;
3247 }
3248
3249 if (inode)
3250 iput(inode);
3251 return ret;
3252}
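
Note how the page loop above funnels every outcome through the next_page label, so unlock_page() and put_page() run no matter how the iteration ends, while ret keeps the first error and the walk continues to the next page. The same control shape in plain, runnable C, with a heap buffer standing in for the locked page (illustrative names):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stub: "read" a chunk; fails on the third chunk to show the flow. */
static int read_chunk(unsigned char *buf, size_t len, size_t idx)
{
	if (idx == 2)
		return -5;		/* cf. -EIO */
	memset(buf, (int)idx, len);
	return 0;
}

static int copy_chunks(size_t nr, size_t len)
{
	int ret = 0;
	size_t i;

	for (i = 0; i < nr; i++) {
		unsigned char *buf = malloc(len);  /* cf. find_or_create_page */
		int err;

		if (!buf) {
			ret = -12;	/* cf. -ENOMEM */
			goto next;
		}
		err = read_chunk(buf, len, i);
		if (err) {
			ret = err;	/* remember the error, keep walking */
			goto next;
		}
		/* ... write the chunk to its new home here ... */
next:
		free(buf);		/* cf. unlock_page() + put_page() */
	}
	return ret;
}

int main(void)
{
	printf("copy_chunks: %d\n", copy_chunks(4, 4096));
	return 0;
}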
3253
3254static int write_page_nocow(struct scrub_ctx *sctx,
3255 u64 physical_for_dev_replace, struct page *page)
3256{
3257 struct bio *bio;
3258 struct btrfs_device *dev;
3259 int ret;
3260 DECLARE_COMPLETION_ONSTACK(compl);
3261
3262 dev = sctx->wr_ctx.tgtdev;
3263 if (!dev)
3264 return -EIO;
3265 if (!dev->bdev) {
3266 printk_ratelimited(KERN_WARNING
3267 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3268 return -EIO;
3269 }
3270 bio = bio_alloc(GFP_NOFS, 1);
3271 if (!bio) {
3272 spin_lock(&sctx->stat_lock);
3273 sctx->stat.malloc_errors++;
3274 spin_unlock(&sctx->stat_lock);
3275 return -ENOMEM;
3276 }
3277 bio->bi_private = &compl;
3278 bio->bi_end_io = scrub_complete_bio_end_io;
3279 bio->bi_size = 0;
3280 bio->bi_sector = physical_for_dev_replace >> 9;
3281 bio->bi_bdev = dev->bdev;
3282 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3283 if (ret != PAGE_CACHE_SIZE) {
3284leave_with_eio:
3285 bio_put(bio);
3286 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3287 return -EIO;
3288 }
3289 btrfsic_submit_bio(WRITE_SYNC, bio);
3290 wait_for_completion(&compl);
3291
3292 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3293 goto leave_with_eio;
3294
3295 bio_put(bio);
3296 return 0;
2467} 3297}
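
write_page_nocow() converts the asynchronous bio API into a synchronous write: a completion lives on the stack, the end_io callback fires complete(), and the submitter blocks in wait_for_completion() before checking BIO_UPTODATE. A small userspace model of that one-shot completion, assuming a pthread plays the role of the I/O completion context (invented names, not kernel code):

#include <pthread.h>
#include <stdio.h>

/* One-shot completion, cf. DECLARE_COMPLETION_ONSTACK(). */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	int done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = 0;
}

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void *fake_io(void *arg)	/* cf. scrub_complete_bio_end_io() */
{
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion compl;
	pthread_t t;

	init_completion(&compl);
	pthread_create(&t, NULL, fake_io, &compl); /* cf. btrfsic_submit_bio */
	wait_for_completion(&compl);	/* block until the "bio" ends */
	pthread_join(t, NULL);
	printf("write done\n");
	return 0;
}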
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e78b297b0b00..54454542ad40 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx)
4397 if (!path) 4397 if (!path)
4398 return -ENOMEM; 4398 return -ENOMEM;
4399 4399
4400 spin_lock(&send_root->root_times_lock); 4400 spin_lock(&send_root->root_item_lock);
4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item); 4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
4402 spin_unlock(&send_root->root_times_lock); 4402 spin_unlock(&send_root->root_item_lock);
4403 4403
4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
4405 key.type = BTRFS_INODE_ITEM_KEY; 4405 key.type = BTRFS_INODE_ITEM_KEY;
@@ -4422,9 +4422,9 @@ join_trans:
4422 * Make sure the tree has not changed after re-joining. We detect this 4422 * Make sure the tree has not changed after re-joining. We detect this
4423 * by comparing start_ctransid and ctransid. They should always match. 4423 * by comparing start_ctransid and ctransid. They should always match.
4424 */ 4424 */
4425 spin_lock(&send_root->root_times_lock); 4425 spin_lock(&send_root->root_item_lock);
4426 ctransid = btrfs_root_ctransid(&send_root->root_item); 4426 ctransid = btrfs_root_ctransid(&send_root->root_item);
4427 spin_unlock(&send_root->root_times_lock); 4427 spin_unlock(&send_root->root_item_lock);
4428 4428
4429 if (ctransid != start_ctransid) { 4429 if (ctransid != start_ctransid) {
4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to " 4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14c2064..99545df1b86c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
55#include "export.h" 55#include "export.h"
56#include "compression.h" 56#include "compression.h"
57#include "rcu-string.h" 57#include "rcu-string.h"
58#include "dev-replace.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/btrfs.h> 61#include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
116 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
117 sb->s_flags |= MS_RDONLY; 118 sb->s_flags |= MS_RDONLY;
118 printk(KERN_INFO "btrfs is forced readonly\n"); 119 printk(KERN_INFO "btrfs is forced readonly\n");
119 __btrfs_scrub_cancel(fs_info); 120 /*
121 * Note that a running device replace operation is not
122 * canceled here although there is no way to update
123 * the progress. It would add the risk of a deadlock,
124 * therefore the canceling is omitted. The only penalty
125 * is that some I/O remains active until the procedure
126 * completes. The next time when the filesystem is
127 * mounted writeable again, the device replace
128 * operation continues.
129 */
120// WARN_ON(1); 130// WARN_ON(1);
121 } 131 }
122} 132}
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1186 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1196 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1187 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1197 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1188 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1198 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1189 btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1199 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
1200 new_pool_size);
1190} 1201}
1191 1202
1192static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1203static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1215 return 0; 1226 return 0;
1216 1227
1217 if (*flags & MS_RDONLY) { 1228 if (*flags & MS_RDONLY) {
1229 /*
1230 * this also happens on 'umount -rf' or on shutdown, when
1231 * the filesystem is busy.
1232 */
1218 sb->s_flags |= MS_RDONLY; 1233 sb->s_flags |= MS_RDONLY;
1219 1234
1235 btrfs_dev_replace_suspend_for_unmount(fs_info);
1236 btrfs_scrub_cancel(fs_info);
1237
1220 ret = btrfs_commit_super(root); 1238 ret = btrfs_commit_super(root);
1221 if (ret) 1239 if (ret)
1222 goto restore; 1240 goto restore;
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1226 goto restore; 1244 goto restore;
1227 } 1245 }
1228 1246
1247 if (fs_info->fs_devices->missing_devices >
1248 fs_info->num_tolerated_disk_barrier_failures &&
1249 !(*flags & MS_RDONLY)) {
1250 printk(KERN_WARNING
1251 "Btrfs: too many missing devices, writeable remount is not allowed\n");
1252 ret = -EACCES;
1253 goto restore;
1254 }
1255
1229 if (btrfs_super_log_root(fs_info->super_copy) != 0) { 1256 if (btrfs_super_log_root(fs_info->super_copy) != 0) {
1230 ret = -EINVAL; 1257 ret = -EINVAL;
1231 goto restore; 1258 goto restore;
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1244 if (ret) 1271 if (ret)
1245 goto restore; 1272 goto restore;
1246 1273
1274 ret = btrfs_resume_dev_replace_async(fs_info);
1275 if (ret) {
1276 pr_warn("btrfs: failed to resume dev_replace\n");
1277 goto restore;
1278 }
1247 sb->s_flags &= ~MS_RDONLY; 1279 sb->s_flags &= ~MS_RDONLY;
1248 } 1280 }
1249 1281
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1336 min_stripe_size = BTRFS_STRIPE_LEN; 1368 min_stripe_size = BTRFS_STRIPE_LEN;
1337 1369
1338 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1370 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1339 if (!device->in_fs_metadata || !device->bdev) 1371 if (!device->in_fs_metadata || !device->bdev ||
1372 device->is_tgtdev_for_dev_replace)
1340 continue; 1373 continue;
1341 1374
1342 avail_space = device->total_bytes - device->bytes_used; 1375 avail_space = device->total_bytes - device->bytes_used;
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)
1647 if (err) 1680 if (err)
1648 goto free_ordered_data; 1681 goto free_ordered_data;
1649 1682
1650 err = btrfs_interface_init(); 1683 err = btrfs_auto_defrag_init();
1651 if (err) 1684 if (err)
1652 goto free_delayed_inode; 1685 goto free_delayed_inode;
1653 1686
1687 err = btrfs_interface_init();
1688 if (err)
1689 goto free_auto_defrag;
1690
1654 err = register_filesystem(&btrfs_fs_type); 1691 err = register_filesystem(&btrfs_fs_type);
1655 if (err) 1692 if (err)
1656 goto unregister_ioctl; 1693 goto unregister_ioctl;
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)
1662 1699
1663unregister_ioctl: 1700unregister_ioctl:
1664 btrfs_interface_exit(); 1701 btrfs_interface_exit();
1702free_auto_defrag:
1703 btrfs_auto_defrag_exit();
1665free_delayed_inode: 1704free_delayed_inode:
1666 btrfs_delayed_inode_exit(); 1705 btrfs_delayed_inode_exit();
1667free_ordered_data: 1706free_ordered_data:
@@ -1681,6 +1720,7 @@ free_compress:
1681static void __exit exit_btrfs_fs(void) 1720static void __exit exit_btrfs_fs(void)
1682{ 1721{
1683 btrfs_destroy_cachep(); 1722 btrfs_destroy_cachep();
1723 btrfs_auto_defrag_exit();
1684 btrfs_delayed_inode_exit(); 1724 btrfs_delayed_inode_exit();
1685 ordered_data_exit(); 1725 ordered_data_exit();
1686 extent_map_exit(); 1726 extent_map_exit();
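
The init_btrfs_fs() hunk shows the maintenance cost of the goto-ladder style: wiring btrfs_auto_defrag_init() in between two existing steps means adding a free_auto_defrag unwind label for later failures, plus the mirror-image btrfs_auto_defrag_exit() call in exit_btrfs_fs(), in exact reverse order. A compact, runnable sketch of that invariant with stub init/exit pairs (invented names):

#include <stdio.h>

static int  init_a(void) { return 0; }
static void exit_a(void) { puts("exit_a"); }
static int  init_b(void) { return 0; }	/* the newly inserted step */
static void exit_b(void) { puts("exit_b"); }
static int  init_c(void) { return -1; }	/* pretend this fails */
static void exit_c(void) { puts("exit_c"); }

static int init_all(void)
{
	int err;

	err = init_a();
	if (err)
		return err;
	err = init_b();
	if (err)
		goto free_a;
	err = init_c();
	if (err)
		goto free_b;	/* new label: everything before c unwinds */
	return 0;

free_b:
	exit_b();
free_a:
	exit_a();
	return err;
}

static void exit_all(void)	/* teardown in exact reverse order */
{
	exit_c();
	exit_b();
	exit_a();
}

int main(void)
{
	if (init_all())
		puts("init failed, unwound cleanly");
	else
		exit_all();
	return 0;
}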
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04bbfb1052eb..87fac9a21ea5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,6 +30,7 @@
30#include "tree-log.h" 30#include "tree-log.h"
31#include "inode-map.h" 31#include "inode-map.h"
32#include "volumes.h" 32#include "volumes.h"
33#include "dev-replace.h"
33 34
34#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
35 36
@@ -145,16 +146,12 @@ loop:
145 * the log must never go across transaction boundaries. 146 * the log must never go across transaction boundaries.
146 */ 147 */
147 smp_mb(); 148 smp_mb();
148 if (!list_empty(&fs_info->tree_mod_seq_list)) { 149 if (!list_empty(&fs_info->tree_mod_seq_list))
149 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 150 WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
150 "creating a fresh transaction\n"); 151 "creating a fresh transaction\n");
151 WARN_ON(1); 152 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
152 } 153 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
153 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
154 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
155 "creating a fresh transaction\n"); 154 "creating a fresh transaction\n");
156 WARN_ON(1);
157 }
158 atomic_set(&fs_info->tree_mod_seq, 0); 155 atomic_set(&fs_info->tree_mod_seq, 0);
159 156
160 spin_lock_init(&cur_trans->commit_lock); 157 spin_lock_init(&cur_trans->commit_lock);
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
295 return 0; 292 return 0;
296} 293}
297 294
298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 295static struct btrfs_trans_handle *
299 u64 num_items, int type, 296start_transaction(struct btrfs_root *root, u64 num_items, int type,
300 int noflush) 297 enum btrfs_reserve_flush_enum flush)
301{ 298{
302 struct btrfs_trans_handle *h; 299 struct btrfs_trans_handle *h;
303 struct btrfs_transaction *cur_trans; 300 struct btrfs_transaction *cur_trans;
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
312 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 309 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
313 h = current->journal_info; 310 h = current->journal_info;
314 h->use_count++; 311 h->use_count++;
312 WARN_ON(h->use_count > 2);
315 h->orig_rsv = h->block_rsv; 313 h->orig_rsv = h->block_rsv;
316 h->block_rsv = NULL; 314 h->block_rsv = NULL;
317 goto got_it; 315 goto got_it;
@@ -331,14 +329,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
331 } 329 }
332 330
333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 331 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
334 if (noflush) 332 ret = btrfs_block_rsv_add(root,
335 ret = btrfs_block_rsv_add_noflush(root, 333 &root->fs_info->trans_block_rsv,
336 &root->fs_info->trans_block_rsv, 334 num_bytes, flush);
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
342 if (ret) 335 if (ret)
343 return ERR_PTR(ret); 336 return ERR_PTR(ret);
344 } 337 }
@@ -422,13 +415,15 @@ got_it:
422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 415struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
423 int num_items) 416 int num_items)
424{ 417{
425 return start_transaction(root, num_items, TRANS_START, 0); 418 return start_transaction(root, num_items, TRANS_START,
419 BTRFS_RESERVE_FLUSH_ALL);
426} 420}
427 421
428struct btrfs_trans_handle *btrfs_start_transaction_noflush( 422struct btrfs_trans_handle *btrfs_start_transaction_lflush(
429 struct btrfs_root *root, int num_items) 423 struct btrfs_root *root, int num_items)
430{ 424{
431 return start_transaction(root, num_items, TRANS_START, 1); 425 return start_transaction(root, num_items, TRANS_START,
426 BTRFS_RESERVE_FLUSH_LIMIT);
432} 427}
433 428
434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 429struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
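
The start_transaction() rework replaces the old int noflush flag, and the separate _noflush entry point, with one enum threaded down to the block-reservation call, so each caller names its flush policy (FLUSH_ALL for plain starts, FLUSH_LIMIT for the new _lflush variant). A sketch of that shape with invented stand-ins for the reservation internals:

#include <stdio.h>

enum reserve_flush {
	RESERVE_NO_FLUSH,	/* cf. BTRFS_RESERVE_NO_FLUSH */
	RESERVE_FLUSH_LIMIT,	/* cf. BTRFS_RESERVE_FLUSH_LIMIT */
	RESERVE_FLUSH_ALL,	/* cf. BTRFS_RESERVE_FLUSH_ALL */
};

static int try_reserve(unsigned long long bytes)
{
	return bytes > 1024 ? -28 /* cf. -ENOSPC */ : 0;
}

static void flush_space(int bounded)
{
	printf("flushing (%s)\n", bounded ? "bounded" : "everything");
}

/* One entry point; the policy is data, not a function variant. */
static int rsv_add(unsigned long long bytes, enum reserve_flush flush)
{
	int ret = try_reserve(bytes);

	if (!ret || flush == RESERVE_NO_FLUSH)
		return ret;
	flush_space(flush == RESERVE_FLUSH_LIMIT);
	return try_reserve(bytes);
}

int main(void)
{
	printf("%d\n", rsv_add(4096, RESERVE_FLUSH_ALL));
	printf("%d\n", rsv_add(512, RESERVE_NO_FLUSH));
	return 0;
}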
@@ -461,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
461int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 456int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
462{ 457{
463 struct btrfs_transaction *cur_trans = NULL, *t; 458 struct btrfs_transaction *cur_trans = NULL, *t;
464 int ret; 459 int ret = 0;
465 460
466 ret = 0;
467 if (transid) { 461 if (transid) {
468 if (transid <= root->fs_info->last_trans_committed) 462 if (transid <= root->fs_info->last_trans_committed)
469 goto out; 463 goto out;
470 464
465 ret = -EINVAL;
471 /* find specified transaction */ 466 /* find specified transaction */
472 spin_lock(&root->fs_info->trans_lock); 467 spin_lock(&root->fs_info->trans_lock);
473 list_for_each_entry(t, &root->fs_info->trans_list, list) { 468 list_for_each_entry(t, &root->fs_info->trans_list, list) {
474 if (t->transid == transid) { 469 if (t->transid == transid) {
475 cur_trans = t; 470 cur_trans = t;
476 atomic_inc(&cur_trans->use_count); 471 atomic_inc(&cur_trans->use_count);
472 ret = 0;
477 break; 473 break;
478 } 474 }
479 if (t->transid > transid) 475 if (t->transid > transid) {
476 ret = 0;
480 break; 477 break;
478 }
481 } 479 }
482 spin_unlock(&root->fs_info->trans_lock); 480 spin_unlock(&root->fs_info->trans_lock);
483 ret = -EINVAL; 481 /* The specified transaction doesn't exist */
484 if (!cur_trans) 482 if (!cur_trans)
485 goto out; /* bad transid */ 483 goto out;
486 } else { 484 } else {
487 /* find newest transaction that is committing | committed */ 485 /* find newest transaction that is committing | committed */
488 spin_lock(&root->fs_info->trans_lock); 486 spin_lock(&root->fs_info->trans_lock);
@@ -502,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
502 } 500 }
503 501
504 wait_for_commit(root, cur_trans); 502 wait_for_commit(root, cur_trans);
505
506 put_transaction(cur_trans); 503 put_transaction(cur_trans);
507 ret = 0;
508out: 504out:
509 return ret; 505 return ret;
510} 506}
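
The btrfs_wait_for_commit() rewrite inverts the error handling: ret now defaults to -EINVAL before the search and is cleared only on the non-error exits, either the transaction is found and pinned under the lock, or the sorted walk passes the requested transid, which can only mean it already committed; the trailing ret = 0 after wait_for_commit() becomes unnecessary. A self-contained sketch of that find-and-pin pattern (invented types; -22 stands in for -EINVAL):

#include <pthread.h>
#include <stdio.h>

struct txn {
	unsigned long long id;
	int refs;
	struct txn *next;	/* list sorted by ascending id */
};

static pthread_mutex_t trans_lock = PTHREAD_MUTEX_INITIALIZER;

static struct txn *find_txn(struct txn *head, unsigned long long id, int *ret)
{
	struct txn *t, *found = NULL;

	*ret = -22;			/* -EINVAL until proven otherwise */
	pthread_mutex_lock(&trans_lock);
	for (t = head; t; t = t->next) {
		if (t->id == id) {
			found = t;
			t->refs++;	/* pin before dropping the lock */
			*ret = 0;
			break;
		}
		if (t->id > id) {	/* walked past it: already committed */
			*ret = 0;
			break;
		}
	}
	pthread_mutex_unlock(&trans_lock);
	return found;			/* NULL with *ret == 0 means "done" */
}

int main(void)
{
	struct txn t2 = { 2, 1, NULL }, t1 = { 1, 1, &t2 };
	int ret;

	find_txn(&t1, 2, &ret);		/* found: ret == 0 */
	printf("lookup 2: %d\n", ret);
	find_txn(&t1, 9, &ret);		/* off the end: ret == -22 */
	printf("lookup 9: %d\n", ret);
	return 0;
}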
@@ -851,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
851 return ret; 847 return ret;
852 848
853 ret = btrfs_run_dev_stats(trans, root->fs_info); 849 ret = btrfs_run_dev_stats(trans, root->fs_info);
854 BUG_ON(ret); 850 WARN_ON(ret);
851 ret = btrfs_run_dev_replace(trans, root->fs_info);
852 WARN_ON(ret);
855 853
856 ret = btrfs_run_qgroups(trans, root->fs_info); 854 ret = btrfs_run_qgroups(trans, root->fs_info);
857 BUG_ON(ret); 855 BUG_ON(ret);
@@ -874,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
874 switch_commit_root(fs_info->extent_root); 872 switch_commit_root(fs_info->extent_root);
875 up_write(&fs_info->extent_commit_sem); 873 up_write(&fs_info->extent_commit_sem);
876 874
875 btrfs_after_dev_replace_commit(fs_info);
876
877 return 0; 877 return 0;
878} 878}
879 879
@@ -958,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
958 struct btrfs_fs_info *info = root->fs_info; 958 struct btrfs_fs_info *info = root->fs_info;
959 struct btrfs_trans_handle *trans; 959 struct btrfs_trans_handle *trans;
960 int ret; 960 int ret;
961 unsigned long nr;
962 961
963 if (xchg(&root->defrag_running, 1)) 962 if (xchg(&root->defrag_running, 1))
964 return 0; 963 return 0;
@@ -970,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
970 969
971 ret = btrfs_defrag_leaves(trans, root, cacheonly); 970 ret = btrfs_defrag_leaves(trans, root, cacheonly);
972 971
973 nr = trans->blocks_used;
974 btrfs_end_transaction(trans, root); 972 btrfs_end_transaction(trans, root);
975 btrfs_btree_balance_dirty(info->tree_root, nr); 973 btrfs_btree_balance_dirty(info->tree_root);
976 cond_resched(); 974 cond_resched();
977 975
978 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 976 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -1032,8 +1030,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1030 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
1033 1031
1034 if (to_reserve > 0) { 1032 if (to_reserve > 0) {
1035 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, 1033 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
1036 to_reserve); 1034 to_reserve,
1035 BTRFS_RESERVE_NO_FLUSH);
1037 if (ret) { 1036 if (ret) {
1038 pending->error = ret; 1037 pending->error = ret;
1039 goto no_free_objectid; 1038 goto no_free_objectid;
@@ -1191,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1191 parent_inode, &key, 1190 parent_inode, &key,
1192 BTRFS_FT_DIR, index); 1191 BTRFS_FT_DIR, index);
1193 /* We have checked the name at the beginning, so it is impossible. */ 1192 /* We have checked the name at the beginning, so it is impossible. */
1194 BUG_ON(ret == -EEXIST); 1193 BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
1195 if (ret) { 1194 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret); 1195 btrfs_abort_transaction(trans, root, ret);
1197 goto fail; 1196 goto fail;
@@ -1309,9 +1308,10 @@ static void do_async_commit(struct work_struct *work)
1309 * We've got freeze protection passed with the transaction. 1308 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it. 1309 * Tell lockdep about it.
1311 */ 1310 */
1312 rwsem_acquire_read( 1311 if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1312 rwsem_acquire_read(
1314 0, 1, _THIS_IP_); 1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1314 0, 1, _THIS_IP_);
1315 1315
1316 current->journal_info = ac->newtrans; 1316 current->journal_info = ac->newtrans;
1317 1317
@@ -1349,8 +1349,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1349 * Tell lockdep we've released the freeze rwsem, since the 1349 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it. 1350 * async commit thread will be the one to unlock it.
1351 */ 1351 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1352 if (trans->type < TRANS_JOIN_NOLOCK)
1353 1, _THIS_IP_); 1353 rwsem_release(
1354 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1355 1, _THIS_IP_);
1354 1356
1355 schedule_delayed_work(&ac->work, 0); 1357 schedule_delayed_work(&ac->work, 0);
1356 1358
@@ -1400,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1400 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1402 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1401} 1403}
1402 1404
1405static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1406 struct btrfs_root *root)
1407{
1408 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1409 int snap_pending = 0;
1410 int ret;
1411
1412 if (!flush_on_commit) {
1413 spin_lock(&root->fs_info->trans_lock);
1414 if (!list_empty(&trans->transaction->pending_snapshots))
1415 snap_pending = 1;
1416 spin_unlock(&root->fs_info->trans_lock);
1417 }
1418
1419 if (flush_on_commit || snap_pending) {
1420 btrfs_start_delalloc_inodes(root, 1);
1421 btrfs_wait_ordered_extents(root, 1);
1422 }
1423
1424 ret = btrfs_run_delayed_items(trans, root);
1425 if (ret)
1426 return ret;
1427
1428 /*
1429 * running the delayed items may have added new refs. account
1430 * them now so that they hinder processing of more delayed refs
1431 * as little as possible.
1432 */
1433 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1434
1435 /*
1436 * rename doesn't use btrfs_join_transaction, so once we
1437 * set the transaction to blocked above, we aren't going
1438 * to get any new ordered operations. We can safely run
1439 * it here and know for sure that nothing new will be added
1440 * to the list
1441 */
1442 btrfs_run_ordered_operations(root, 1);
1443
1444 return 0;
1445}
1446
1403/* 1447/*
1404 * btrfs_transaction state sequence: 1448 * btrfs_transaction state sequence:
1405 * in_commit = 0, blocked = 0 (initial) 1449 * in_commit = 0, blocked = 0 (initial)
@@ -1414,15 +1458,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1414 struct btrfs_transaction *cur_trans = trans->transaction; 1458 struct btrfs_transaction *cur_trans = trans->transaction;
1415 struct btrfs_transaction *prev_trans = NULL; 1459 struct btrfs_transaction *prev_trans = NULL;
1416 DEFINE_WAIT(wait); 1460 DEFINE_WAIT(wait);
1417 int ret = -EIO; 1461 int ret;
1418 int should_grow = 0; 1462 int should_grow = 0;
1419 unsigned long now = get_seconds(); 1463 unsigned long now = get_seconds();
1420 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1421 1464
1422 btrfs_run_ordered_operations(root, 0); 1465 ret = btrfs_run_ordered_operations(root, 0);
1466 if (ret) {
1467 btrfs_abort_transaction(trans, root, ret);
1468 goto cleanup_transaction;
1469 }
1423 1470
1424 if (cur_trans->aborted) 1471 if (cur_trans->aborted) {
1472 ret = cur_trans->aborted;
1425 goto cleanup_transaction; 1473 goto cleanup_transaction;
1474 }
1426 1475
1427 /* make a pass through all the delayed refs we have so far 1476 /* make a pass through all the delayed refs we have so far
1428 * any running procs may add more while we are here 1477 * any running procs may add more while we are here
@@ -1490,39 +1539,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1490 should_grow = 1; 1539 should_grow = 1;
1491 1540
1492 do { 1541 do {
1493 int snap_pending = 0;
1494
1495 joined = cur_trans->num_joined; 1542 joined = cur_trans->num_joined;
1496 if (!list_empty(&trans->transaction->pending_snapshots))
1497 snap_pending = 1;
1498 1543
1499 WARN_ON(cur_trans != trans->transaction); 1544 WARN_ON(cur_trans != trans->transaction);
1500 1545
1501 if (flush_on_commit || snap_pending) { 1546 ret = btrfs_flush_all_pending_stuffs(trans, root);
1502 btrfs_start_delalloc_inodes(root, 1);
1503 btrfs_wait_ordered_extents(root, 1);
1504 }
1505
1506 ret = btrfs_run_delayed_items(trans, root);
1507 if (ret) 1547 if (ret)
1508 goto cleanup_transaction; 1548 goto cleanup_transaction;
1509 1549
1510 /*
1511 * running the delayed items may have added new refs. account
1512 * them now so that they hinder processing of more delayed refs
1513 * as little as possible.
1514 */
1515 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1516
1517 /*
1518 * rename doesn't use btrfs_join_transaction, so once we
1519 * set the transaction to blocked above, we aren't going
1520 * to get any new ordered operations. We can safely run
1521 * it here and know for sure that nothing new will be added
1522 * to the list
1523 */
1524 btrfs_run_ordered_operations(root, 1);
1525
1526 prepare_to_wait(&cur_trans->writer_wait, &wait, 1550 prepare_to_wait(&cur_trans->writer_wait, &wait,
1527 TASK_UNINTERRUPTIBLE); 1551 TASK_UNINTERRUPTIBLE);
1528 1552
@@ -1535,6 +1559,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1535 } while (atomic_read(&cur_trans->num_writers) > 1 || 1559 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1536 (should_grow && cur_trans->num_joined != joined)); 1560 (should_grow && cur_trans->num_joined != joined));
1537 1561
1562 ret = btrfs_flush_all_pending_stuffs(trans, root);
1563 if (ret)
1564 goto cleanup_transaction;
1565
1538 /* 1566 /*
1539 * Ok now we need to make sure to block out any other joins while we 1567 * Ok now we need to make sure to block out any other joins while we
1540 * commit the transaction. We could have started a join before setting 1568 * commit the transaction. We could have started a join before setting
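
Worth noting in the commit path: the flushing work is factored into btrfs_flush_all_pending_stuffs() and invoked twice, once per pass of the writer-wait loop and once more after it, presumably because the last writer to leave can queue new work between the final in-loop flush and the moment num_writers drops to one. A userspace analogue of that loop, assuming pthreads (invented names):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int num_writers = 2;	/* us + one straggler */

static void flush_pending(void)	/* cf. btrfs_flush_all_pending_stuffs() */
{
	puts("flush pending work");
}

static void *straggler(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	num_writers--;		/* cf. the other writer detaching */
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, straggler, NULL);

	pthread_mutex_lock(&lock);
	while (num_writers > 1) {
		pthread_mutex_unlock(&lock);
		flush_pending();	/* do work between checks */
		pthread_mutex_lock(&lock);
		if (num_writers > 1)	/* re-check before sleeping */
			pthread_cond_wait(&wake, &lock);
	}
	pthread_mutex_unlock(&lock);

	flush_pending();	/* final pass: more work may have been queued */
	pthread_join(t, NULL);
	return 0;
}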
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 80961947a6b2..0e8aa1e6c287 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 105 struct btrfs_root *root);
106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
107 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush( 108struct btrfs_trans_handle *btrfs_start_transaction_lflush(
109 struct btrfs_root *root, int num_items); 109 struct btrfs_root *root, int num_items);
110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 81e407d9677a..83186c7e45d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2952 struct btrfs_inode_item *item, 2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only) 2953 struct inode *inode, int log_inode_only)
2954{ 2954{
2955 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 2955 struct btrfs_map_token token;
2956 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 2956
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2957 btrfs_init_map_token(&token);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982 2958
2983 if (log_inode_only) { 2959 if (log_inode_only) {
2984 /* set the generation to zero so the recover code 2960 /* set the generation to zero so the recover code
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2986 * just to say 'this inode exists' and a logging 2962 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values' 2963 * to say 'update this inode with these values'
2988 */ 2964 */
2989 btrfs_set_inode_generation(leaf, item, 0); 2965 btrfs_set_token_inode_generation(leaf, item, 0, &token);
2990 btrfs_set_inode_size(leaf, item, 0); 2966 btrfs_set_token_inode_size(leaf, item, 0, &token);
2991 } else { 2967 } else {
2992 btrfs_set_inode_generation(leaf, item, 2968 btrfs_set_token_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation); 2969 BTRFS_I(inode)->generation,
2994 btrfs_set_inode_size(leaf, item, inode->i_size); 2970 &token);
2995 } 2971 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
2972 }
2973
2974 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
2975 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
2976 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
2977 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2978
2979 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2980 inode->i_atime.tv_sec, &token);
2981 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2982 inode->i_atime.tv_nsec, &token);
2983
2984 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2985 inode->i_mtime.tv_sec, &token);
2986 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2987 inode->i_mtime.tv_nsec, &token);
2988
2989 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2990 inode->i_ctime.tv_sec, &token);
2991 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
2992 inode->i_ctime.tv_nsec, &token);
2993
2994 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
2995 &token);
2996
2997 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
2998 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
2999 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3000 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3001 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3002}
2996 3003
3004static int log_inode_item(struct btrfs_trans_handle *trans,
3005 struct btrfs_root *log, struct btrfs_path *path,
3006 struct inode *inode)
3007{
3008 struct btrfs_inode_item *inode_item;
3009 struct btrfs_key key;
3010 int ret;
3011
3012 memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3013 ret = btrfs_insert_empty_item(trans, log, path, &key,
3014 sizeof(*inode_item));
3015 if (ret && ret != -EEXIST)
3016 return ret;
3017 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3018 struct btrfs_inode_item);
3019 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3020 btrfs_release_path(path);
3021 return 0;
2997} 3022}
2998 3023
2999static noinline int copy_items(struct btrfs_trans_handle *trans, 3024static noinline int copy_items(struct btrfs_trans_handle *trans,
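
fill_inode_item() now funnels every field through the btrfs_set_token_*() setters. The token exists for locality: consecutive writes that land on the same extent-buffer page reuse one cached mapping instead of re-mapping per field. An illustrative userspace model of that caching, with invented types and a flat buffer in place of a real extent buffer:

#include <stdio.h>
#include <string.h>

static unsigned char page[4096];	/* stands in for one metadata page */
static int map_calls;

struct map_token {
	unsigned char *kaddr;		/* cached mapping, NULL = cold */
	size_t start, len;		/* range the mapping covers */
};

static unsigned char *token_map(struct map_token *tok, size_t off, size_t n)
{
	if (!tok->kaddr || off < tok->start ||
	    off + n > tok->start + tok->len) {
		map_calls++;		/* cf. an actual kmap of the page */
		tok->kaddr = page;
		tok->start = 0;
		tok->len = sizeof(page);
	}
	return tok->kaddr + (off - tok->start);
}

static void set_u64(struct map_token *tok, size_t off, unsigned long long v)
{
	memcpy(token_map(tok, off, sizeof(v)), &v, sizeof(v));
}

int main(void)
{
	struct map_token tok = { 0 };

	set_u64(&tok, 0, 1);	/* maps once ... */
	set_u64(&tok, 8, 2);	/* ... then hits the cache */
	set_u64(&tok, 16, 3);
	printf("map calls: %d\n", map_calls);	/* 1, not 3 */
	return 0;
}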
@@ -3130,151 +3155,234 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3130 return 0; 3155 return 0;
3131} 3156}
3132 3157
3133struct log_args { 3158static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
3134 struct extent_buffer *src; 3159 struct btrfs_root *root, struct inode *inode,
3135 u64 next_offset; 3160 struct extent_map *em,
3136 int start_slot; 3161 struct btrfs_path *path)
3137 int nr; 3162{
3138}; 3163 struct btrfs_file_extent_item *fi;
3164 struct extent_buffer *leaf;
3165 struct btrfs_key key, new_key;
3166 struct btrfs_map_token token;
3167 u64 extent_end;
3168 u64 extent_offset = 0;
3169 int extent_type;
3170 int del_slot = 0;
3171 int del_nr = 0;
3172 int ret = 0;
3173
3174 while (1) {
3175 btrfs_init_map_token(&token);
3176 leaf = path->nodes[0];
3177 path->slots[0]++;
3178 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3179 if (del_nr) {
3180 ret = btrfs_del_items(trans, root, path,
3181 del_slot, del_nr);
3182 if (ret)
3183 return ret;
3184 del_nr = 0;
3185 }
3186
3187 ret = btrfs_next_leaf_write(trans, root, path, 1);
3188 if (ret < 0)
3189 return ret;
3190 if (ret > 0)
3191 return 0;
3192 leaf = path->nodes[0];
3193 }
3194
3195 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3196 if (key.objectid != btrfs_ino(inode) ||
3197 key.type != BTRFS_EXTENT_DATA_KEY ||
3198 key.offset >= em->start + em->len)
3199 break;
3200
3201 fi = btrfs_item_ptr(leaf, path->slots[0],
3202 struct btrfs_file_extent_item);
3203 extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
3204 if (extent_type == BTRFS_FILE_EXTENT_REG ||
3205 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
3206 extent_offset = btrfs_token_file_extent_offset(leaf,
3207 fi, &token);
3208 extent_end = key.offset +
3209 btrfs_token_file_extent_num_bytes(leaf, fi,
3210 &token);
3211 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3212 extent_end = key.offset +
3213 btrfs_file_extent_inline_len(leaf, fi);
3214 } else {
3215 BUG();
3216 }
3217
3218 if (extent_end <= em->len + em->start) {
3219 if (!del_nr) {
3220 del_slot = path->slots[0];
3221 }
3222 del_nr++;
3223 continue;
3224 }
3225
3226 /*
3227 * Ok so we'll ignore previous items if we log a new extent,
3228 * which can lead to overlapping extents, so if we have an
3229 * existing extent we want to adjust we _have_ to check the next
3230 * guy to make sure we even need this extent anymore; this keeps
3231 * us from panicking in set_item_key_safe.
3232 */
3233 if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
3234 struct btrfs_key tmp_key;
3235
3236 btrfs_item_key_to_cpu(leaf, &tmp_key,
3237 path->slots[0] + 1);
3238 if (tmp_key.objectid == btrfs_ino(inode) &&
3239 tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
3240 tmp_key.offset <= em->start + em->len) {
3241 if (!del_nr)
3242 del_slot = path->slots[0];
3243 del_nr++;
3244 continue;
3245 }
3246 }
3247
3248 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
3249 memcpy(&new_key, &key, sizeof(new_key));
3250 new_key.offset = em->start + em->len;
3251 btrfs_set_item_key_safe(trans, root, path, &new_key);
3252 extent_offset += em->start + em->len - key.offset;
3253 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
3254 &token);
3255 btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
3256 (em->start + em->len),
3257 &token);
3258 btrfs_mark_buffer_dirty(leaf);
3259 }
3260
3261 if (del_nr)
3262 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
3263
3264 return ret;
3265}
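
drop_adjacent_extents() collects doomed leaf slots in del_slot/del_nr and removes each contiguous run with a single btrfs_del_items() call instead of one tree operation per item. The same bookkeeping over a plain array, as a runnable sketch (invented names):

#include <stdio.h>
#include <string.h>

static void delete_range(int *a, size_t *n, size_t slot, size_t nr)
{
	/* cf. one btrfs_del_items() call covering the whole run */
	memmove(a + slot, a + slot + nr, (*n - slot - nr) * sizeof(*a));
	*n -= nr;
}

static void drop_matching(int *a, size_t *n, int (*doomed)(int))
{
	size_t i = 0, del_slot = 0, del_nr = 0;

	while (i < *n) {
		if (doomed(a[i])) {
			if (!del_nr)
				del_slot = i;	/* start of a new run */
			del_nr++;
			i++;
			continue;
		}
		if (del_nr) {		/* run ended: delete it in one go */
			delete_range(a, n, del_slot, del_nr);
			i = del_slot;	/* survivors shifted down */
			del_nr = 0;
			continue;
		}
		i++;
	}
	if (del_nr)			/* trailing run */
		delete_range(a, n, del_slot, del_nr);
}

static int is_odd(int v) { return v & 1; }

int main(void)
{
	int a[] = { 1, 3, 4, 5, 7, 8 };
	size_t i, n = 6;

	drop_matching(a, &n, is_odd);
	for (i = 0; i < n; i++)
		printf("%d ", a[i]);	/* prints: 4 8 */
	printf("\n");
	return 0;
}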
3139 3266
3140static int log_one_extent(struct btrfs_trans_handle *trans, 3267static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root, 3268 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path, 3269 struct extent_map *em, struct btrfs_path *path)
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{ 3270{
3145 struct btrfs_root *log = root->log_root; 3271 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi; 3272 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf;
3274 struct list_head ordered_sums;
3275 struct btrfs_map_token token;
3147 struct btrfs_key key; 3276 struct btrfs_key key;
3148 u64 start = em->mod_start; 3277 u64 csum_offset = em->mod_start - em->start;
3149 u64 search_start = start; 3278 u64 csum_len = em->mod_len;
3150 u64 len = em->mod_len; 3279 u64 extent_offset = em->start - em->orig_start;
3151 u64 num_bytes; 3280 u64 block_len;
3152 int nritems;
3153 int ret; 3281 int ret;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3154 3283
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) { 3284 INIT_LIST_HEAD(&ordered_sums);
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, 3285 btrfs_init_map_token(&token);
3157 start + len, NULL, 0); 3286 key.objectid = btrfs_ino(inode);
3158 if (ret) 3287 key.type = BTRFS_EXTENT_DATA_KEY;
3159 return ret; 3288 key.offset = em->start;
3289 path->really_keep_locks = 1;
3290
3291 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3292 if (ret && ret != -EEXIST) {
3293 path->really_keep_locks = 0;
3294 return ret;
3160 } 3295 }
3296 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item);
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3302 skip_csum = true;
3303 btrfs_set_token_file_extent_type(leaf, fi,
3304 BTRFS_FILE_EXTENT_PREALLOC,
3305 &token);
3306 } else {
3307 btrfs_set_token_file_extent_type(leaf, fi,
3308 BTRFS_FILE_EXTENT_REG,
3309 &token);
3310 if (em->block_start == 0)
3311 skip_csum = true;
3312 }
3313
3314 block_len = max(em->block_len, em->orig_block_len);
3315 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3316 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3317 em->block_start,
3318 &token);
3319 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3320 &token);
3321 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3322 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3323 em->block_start -
3324 extent_offset, &token);
3325 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3326 &token);
3327 } else {
3328 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3329 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3330 &token);
3331 }
3332
3333 btrfs_set_token_file_extent_offset(leaf, fi,
3334 em->start - em->orig_start,
3335 &token);
3336 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3337 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
3338 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3339 &token);
3340 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3341 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3342 btrfs_mark_buffer_dirty(leaf);
3161 3343
3162 while (len) { 3344 /*
3163 if (args->nr) 3345 * Have to check the extent to the right of us to make sure it doesn't
3164 goto next_slot; 3346 * fall in our current range. We're ok if the previous extent is in our
3165again: 3347 * range since the recovery stuff will run us in key order and thus just
3166 key.objectid = btrfs_ino(inode); 3348 * drop the part we overwrote.
3167 key.type = BTRFS_EXTENT_DATA_KEY; 3349 */
3168 key.offset = search_start; 3350 ret = drop_adjacent_extents(trans, log, inode, em, path);
3169 3351 btrfs_release_path(path);
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3352 path->really_keep_locks = 0;
3171 if (ret < 0) 3353 if (ret) {
3172 return ret; 3354 return ret;
3173 3355 }
3174 if (ret) {
3175 /*
3176 * A rare case where we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191
3192 path->slots[0]--;
3193 btrfs_item_key_to_cpu(path->nodes[0], &key,
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201 3356
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3357 if (skip_csum)
3203 struct btrfs_file_extent_item); 3358 return 0;
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
3205 fi);
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok, envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250 3359
3251 if (path->slots[0] < nritems) { 3360 /* block start is already adjusted for the file extent offset. */
3252 if (len) 3361 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3253 goto next_slot; 3362 em->block_start + csum_offset,
3254 break; 3363 em->block_start + csum_offset +
3255 } 3364 csum_len - 1, &ordered_sums, 0);
3365 if (ret)
3366 return ret;
3256 3367
3257 if (args->nr) { 3368 while (!list_empty(&ordered_sums)) {
3258 ret = copy_items(trans, inode, dst_path, args->src, 3369 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3259 args->start_slot, args->nr, 3370 struct btrfs_ordered_sum,
3260 LOG_INODE_ALL); 3371 list);
3261 if (ret) 3372 if (!ret)
3262 return ret; 3373 ret = btrfs_csum_file_blocks(trans, log, sums);
3263 args->nr = 0; 3374 list_del(&sums->list);
3264 btrfs_release_path(path); 3375 kfree(sums);
3265 }
3266 } 3376 }
3267 3377
3268 return 0; 3378 return ret;
3269} 3379}
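
The ordered_sums drain at the end of log_one_extent() uses a sticky error: ret is only assigned while it is still zero, so the first btrfs_csum_file_blocks() failure is preserved, yet every list node is still unlinked and freed, leaving nothing behind on the error path. Reduced to a singly linked list in plain C (illustrative names):

#include <stdio.h>
#include <stdlib.h>

struct sum {
	int payload;
	struct sum *next;
};

static int emit(struct sum *s)	/* cf. btrfs_csum_file_blocks() */
{
	return s->payload == 2 ? -5 : 0;	/* fail on one entry */
}

static int drain(struct sum **head)
{
	int ret = 0;

	while (*head) {
		struct sum *s = *head;

		*head = s->next;	/* cf. list_del() */
		if (!ret)
			ret = emit(s);	/* first error wins, rest skipped */
		free(s);		/* freed on every path: no leak */
	}
	return ret;
}

int main(void)
{
	struct sum *head = NULL;
	int i;

	for (i = 0; i < 4; i++) {
		struct sum *s = malloc(sizeof(*s));

		if (!s)
			break;
		s->payload = i;
		s->next = head;
		head = s;
	}
	printf("drain: %d\n", drain(&head));	/* -5, list fully freed */
	return 0;
}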
3270 3380
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3381static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root, 3382 struct btrfs_root *root,
3273 struct inode *inode, 3383 struct inode *inode,
3274 struct btrfs_path *path, 3384 struct btrfs_path *path)
3275 struct btrfs_path *dst_path)
3276{ 3385{
3277 struct log_args args;
3278 struct extent_map *em, *n; 3386 struct extent_map *em, *n;
3279 struct list_head extents; 3387 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3388 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
@@ -3283,8 +3391,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3283 3391
3284 INIT_LIST_HEAD(&extents); 3392 INIT_LIST_HEAD(&extents);
3285 3393
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock); 3394 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed; 3395 test_gen = root->fs_info->last_trans_committed;
3290 3396
@@ -3317,34 +3423,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3317 3423
3318 write_unlock(&tree->lock); 3424 write_unlock(&tree->lock);
3319 3425
3320 /* 3426 ret = log_one_extent(trans, inode, root, em, path);
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em); 3427 free_extent_map(em);
3340 write_lock(&tree->lock); 3428 write_lock(&tree->lock);
3341 } 3429 }
3342 WARN_ON(!list_empty(&extents)); 3430 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock); 3431 write_unlock(&tree->lock);
3344 3432
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path); 3433 btrfs_release_path(path);
3349 return ret; 3434 return ret;
3350} 3435}
@@ -3400,7 +3485,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3400 3485
3401 3486
3402 /* today the code can only do partial logging of directories */ 3487 /* today the code can only do partial logging of directories */
3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3488 if (S_ISDIR(inode->i_mode) ||
3489 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3490 &BTRFS_I(inode)->runtime_flags) &&
3491 inode_only == LOG_INODE_EXISTS))
3404 max_key.type = BTRFS_XATTR_ITEM_KEY; 3492 max_key.type = BTRFS_XATTR_ITEM_KEY;
3405 else 3493 else
3406 max_key.type = (u8)-1; 3494 max_key.type = (u8)-1;
@@ -3432,14 +3520,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3432 } else { 3520 } else {
3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3521 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) { 3522 &BTRFS_I(inode)->runtime_flags)) {
3523 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3524 &BTRFS_I(inode)->runtime_flags);
3435 ret = btrfs_truncate_inode_items(trans, log, 3525 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0); 3526 inode, 0, 0);
3437 } else { 3527 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3438 fast_search = true; 3528 &BTRFS_I(inode)->runtime_flags)) {
3529 if (inode_only == LOG_INODE_ALL)
3530 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY; 3531 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino, 3532 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY); 3533 max_key.type);
3534 } else {
3535 if (inode_only == LOG_INODE_ALL)
3536 fast_search = true;
3537 ret = log_inode_item(trans, log, dst_path, inode);
3538 if (ret) {
3539 err = ret;
3540 goto out_unlock;
3541 }
3542 goto log_extents;
3442 } 3543 }
3544
3443 } 3545 }
3444 if (ret) { 3546 if (ret) {
3445 err = ret; 3547 err = ret;
@@ -3518,11 +3620,10 @@ next_slot:
3518 ins_nr = 0; 3620 ins_nr = 0;
3519 } 3621 }
3520 3622
3623log_extents:
3521 if (fast_search) { 3624 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path); 3625 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path, 3626 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3525 dst_path);
3526 if (ret) { 3627 if (ret) {
3527 err = ret; 3628 err = ret;
3528 goto out_unlock; 3629 goto out_unlock;
@@ -3531,8 +3632,10 @@ next_slot:
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3632 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n; 3633 struct extent_map *em, *n;
3533 3634
3635 write_lock(&tree->lock);
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3636 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list); 3637 list_del_init(&em->list);
3638 write_unlock(&tree->lock);
3536 } 3639 }
3537 3640
3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3641 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0f5ebb72a5ea..5cce6aa74012 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,7 +25,6 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <asm/div64.h>
29#include "compat.h" 28#include "compat.h"
30#include "ctree.h" 29#include "ctree.h"
31#include "extent_map.h" 30#include "extent_map.h"
@@ -36,6 +35,8 @@
36#include "async-thread.h" 35#include "async-thread.h"
37#include "check-integrity.h" 36#include "check-integrity.h"
38#include "rcu-string.h" 37#include "rcu-string.h"
38#include "math.h"
39#include "dev-replace.h"
39 40
40static int init_first_rw_device(struct btrfs_trans_handle *trans, 41static int init_first_rw_device(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 42 struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
71 kfree(fs_devices); 72 kfree(fs_devices);
72} 73}
73 74
75static void btrfs_kobject_uevent(struct block_device *bdev,
76 enum kobject_action action)
77{
78 int ret;
79
80 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
81 if (ret)
82 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
83 action,
84 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
85 &disk_to_dev(bdev->bd_disk)->kobj);
86}
87
74void btrfs_cleanup_fs_uuids(void) 88void btrfs_cleanup_fs_uuids(void)
75{ 89{
76 struct btrfs_fs_devices *fs_devices; 90 struct btrfs_fs_devices *fs_devices;
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
108 return NULL; 122 return NULL;
109} 123}
110 124
125static int
126btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
127 int flush, struct block_device **bdev,
128 struct buffer_head **bh)
129{
130 int ret;
131
132 *bdev = blkdev_get_by_path(device_path, flags, holder);
133
134 if (IS_ERR(*bdev)) {
135 ret = PTR_ERR(*bdev);
136 printk(KERN_INFO "btrfs: open %s failed\n", device_path);
137 goto error;
138 }
139
140 if (flush)
141 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
142 ret = set_blocksize(*bdev, 4096);
143 if (ret) {
144 blkdev_put(*bdev, flags);
145 goto error;
146 }
147 invalidate_bdev(*bdev);
148 *bh = btrfs_read_dev_super(*bdev);
149 if (!*bh) {
150 ret = -EINVAL;
151 blkdev_put(*bdev, flags);
152 goto error;
153 }
154
155 return 0;
156
157error:
158 *bdev = NULL;
159 *bh = NULL;
160 return ret;
161}
162
111static void requeue_list(struct btrfs_pending_bios *pending_bios, 163static void requeue_list(struct btrfs_pending_bios *pending_bios,
112 struct bio *head, struct bio *tail) 164 struct bio *head, struct bio *tail)
113{ 165{
@@ -467,7 +519,8 @@ error:
467 return ERR_PTR(-ENOMEM); 519 return ERR_PTR(-ENOMEM);
468} 520}
469 521
470void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 522void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
523 struct btrfs_fs_devices *fs_devices, int step)
471{ 524{
472 struct btrfs_device *device, *next; 525 struct btrfs_device *device, *next;
473 526
@@ -480,8 +533,9 @@ again:
480 /* This is the initialized path, it is safe to release the devices. */ 533 /* This is the initialized path, it is safe to release the devices. */
481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 534 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
482 if (device->in_fs_metadata) { 535 if (device->in_fs_metadata) {
483 if (!latest_transid || 536 if (!device->is_tgtdev_for_dev_replace &&
484 device->generation > latest_transid) { 537 (!latest_transid ||
538 device->generation > latest_transid)) {
485 latest_devid = device->devid; 539 latest_devid = device->devid;
486 latest_transid = device->generation; 540 latest_transid = device->generation;
487 latest_bdev = device->bdev; 541 latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
489 continue; 543 continue;
490 } 544 }
491 545
546 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
547 /*
548 * In the first step, keep the device which has
549 * the correct fsid and the devid that is used
550 * for the dev_replace procedure.
551 * In the second step, the dev_replace state is
552 * read from the device tree and it is known
553 * whether the procedure is really active or
554 * not, which means whether this device is
555 * used or whether it should be removed.
556 */
557 if (step == 0 || device->is_tgtdev_for_dev_replace) {
558 continue;
559 }
560 }
492 if (device->bdev) { 561 if (device->bdev) {
493 blkdev_put(device->bdev, device->mode); 562 blkdev_put(device->bdev, device->mode);
494 device->bdev = NULL; 563 device->bdev = NULL;
@@ -497,7 +566,8 @@ again:
497 if (device->writeable) { 566 if (device->writeable) {
498 list_del_init(&device->dev_alloc_list); 567 list_del_init(&device->dev_alloc_list);
499 device->writeable = 0; 568 device->writeable = 0;
500 fs_devices->rw_devices--; 569 if (!device->is_tgtdev_for_dev_replace)
570 fs_devices->rw_devices--;
501 } 571 }
502 list_del_init(&device->dev_list); 572 list_del_init(&device->dev_list);
503 fs_devices->num_devices--; 573 fs_devices->num_devices--;
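
The step-0/step-1 logic in the hunk above boils down to a small keep/drop table: devices present in the metadata always survive, and the candidate replace target survives pass 0 unconditionally but survives pass 1 only if the on-disk replace state confirms it is the active target. Rendered as a standalone predicate; DEV_REPLACE_DEVID and struct dev are local stand-ins for the sketch.

#include <stdio.h>

#define DEV_REPLACE_DEVID ((unsigned long long)-1)   /* assumed sentinel id */

struct dev {
        unsigned long long devid;
        int in_fs_metadata;
        int is_tgtdev_for_dev_replace;
};

static int keep_device(const struct dev *d, int step)
{
        if (d->in_fs_metadata)
                return 1;
        /* step 0: replace state not read yet, keep the candidate target;
         * step 1: keep it only if it is really the active target */
        if (d->devid == DEV_REPLACE_DEVID &&
            (step == 0 || d->is_tgtdev_for_dev_replace))
                return 1;
        return 0;
}

int main(void)
{
        struct dev stale_tgt = { DEV_REPLACE_DEVID, 0, 0 };

        printf("step 0 keeps stale target: %d\n", keep_device(&stale_tgt, 0));
        printf("step 1 drops stale target: %d\n", keep_device(&stale_tgt, 1));
        return 0;
}
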
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
555 if (device->bdev) 625 if (device->bdev)
556 fs_devices->open_devices--; 626 fs_devices->open_devices--;
557 627
558 if (device->writeable) { 628 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
559 list_del_init(&device->dev_alloc_list); 629 list_del_init(&device->dev_alloc_list);
560 fs_devices->rw_devices--; 630 fs_devices->rw_devices--;
561 } 631 }
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
637 if (!device->name) 707 if (!device->name)
638 continue; 708 continue;
639 709
640 bdev = blkdev_get_by_path(device->name->str, flags, holder);
641 if (IS_ERR(bdev)) {
642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
643 goto error;
644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
646 invalidate_bdev(bdev);
647 set_blocksize(bdev, 4096);
648
649 bh = btrfs_read_dev_super(bdev);
650 if (!bh)
651 goto error_close;
710 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
711 &bdev, &bh);
712 if (ret)
713 continue;
652 714
653 disk_super = (struct btrfs_super_block *)bh->b_data; 715 disk_super = (struct btrfs_super_block *)bh->b_data;
654 devid = btrfs_stack_device_id(&disk_super->dev_item); 716 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
687 fs_devices->rotating = 1; 749 fs_devices->rotating = 1;
688 750
689 fs_devices->open_devices++; 751 fs_devices->open_devices++;
690 if (device->writeable) { 752 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
691 fs_devices->rw_devices++; 753 fs_devices->rw_devices++;
692 list_add(&device->dev_alloc_list, 754 list_add(&device->dev_alloc_list,
693 &fs_devices->alloc_list); 755 &fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
697 759
698error_brelse: 760error_brelse:
699 brelse(bh); 761 brelse(bh);
700error_close:
701 blkdev_put(bdev, flags); 762 blkdev_put(bdev, flags);
702error:
703 continue; 763 continue;
704 } 764 }
705 if (fs_devices->open_devices == 0) { 765 if (fs_devices->open_devices == 0) {
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
744 u64 total_devices; 804 u64 total_devices;
745 805
746 flags |= FMODE_EXCL; 806 flags |= FMODE_EXCL;
747 bdev = blkdev_get_by_path(path, flags, holder);
748
749 if (IS_ERR(bdev)) {
750 ret = PTR_ERR(bdev);
751 goto error;
752 }
753
754 mutex_lock(&uuid_mutex); 807 mutex_lock(&uuid_mutex);
755 ret = set_blocksize(bdev, 4096);
756 if (ret)
757 goto error_close;
758 bh = btrfs_read_dev_super(bdev);
759 if (!bh) {
760 ret = -EINVAL;
761 goto error_close;
762 }
808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
809 if (ret)
810 goto error;
763 disk_super = (struct btrfs_super_block *)bh->b_data; 811 disk_super = (struct btrfs_super_block *)bh->b_data;
764 devid = btrfs_stack_device_id(&disk_super->dev_item); 812 devid = btrfs_stack_device_id(&disk_super->dev_item);
765 transid = btrfs_super_generation(disk_super); 813 transid = btrfs_super_generation(disk_super);
766 total_devices = btrfs_super_num_devices(disk_super); 814 total_devices = btrfs_super_num_devices(disk_super);
767 if (disk_super->label[0]) 815 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
768 printk(KERN_INFO "device label %s ", disk_super->label); 818 printk(KERN_INFO "device label %s ", disk_super->label);
769 else 819 } else {
770 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 }
771 printk(KERN_CONT "devid %llu transid %llu %s\n", 822 printk(KERN_CONT "devid %llu transid %llu %s\n",
772 (unsigned long long)devid, (unsigned long long)transid, path); 823 (unsigned long long)devid, (unsigned long long)transid, path);
773 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 824 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
774 if (!ret && fs_devices_ret) 825 if (!ret && fs_devices_ret)
775 (*fs_devices_ret)->total_devices = total_devices; 826 (*fs_devices_ret)->total_devices = total_devices;
776 brelse(bh); 827 brelse(bh);
777error_close:
778 mutex_unlock(&uuid_mutex);
779 blkdev_put(bdev, flags); 828 blkdev_put(bdev, flags);
780error: 829error:
830 mutex_unlock(&uuid_mutex);
781 return ret; 831 return ret;
782} 832}
783 833
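
The label change in btrfs_scan_one_device() above guards against a super block whose label field fills the whole buffer with no terminating NUL, which would let printk's %s run past it. The same defensive clamp in isolation, with the 256-byte size taken from BTRFS_LABEL_SIZE.

#include <stdio.h>
#include <string.h>

#define LABEL_SIZE 256                  /* BTRFS_LABEL_SIZE in the real code */

static void print_label(char label[LABEL_SIZE])
{
        if (label[0]) {
                if (label[LABEL_SIZE - 1])
                        label[LABEL_SIZE - 1] = '\0';   /* force termination */
                printf("device label %s\n", label);
        } else {
                printf("device label is empty\n");
        }
}

int main(void)
{
        char evil[LABEL_SIZE];

        memset(evil, 'A', sizeof(evil));        /* no NUL anywhere */
        print_label(evil);                      /* still safe to print */
        return 0;
}
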
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
796 846
797 *length = 0; 847 *length = 0;
798 848
799 if (start >= device->total_bytes) 849 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
800 return 0; 850 return 0;
801 851
802 path = btrfs_alloc_path(); 852 path = btrfs_alloc_path();
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
913 max_hole_size = 0; 963 max_hole_size = 0;
914 hole_size = 0; 964 hole_size = 0;
915 965
916 if (search_start >= search_end) { 966 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
917 ret = -ENOSPC; 967 ret = -ENOSPC;
918 goto error; 968 goto error;
919 } 969 }
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1096 struct btrfs_key key; 1146 struct btrfs_key key;
1097 1147
1098 WARN_ON(!device->in_fs_metadata); 1148 WARN_ON(!device->in_fs_metadata);
1149 WARN_ON(device->is_tgtdev_for_dev_replace);
1099 path = btrfs_alloc_path(); 1150 path = btrfs_alloc_path();
1100 if (!path) 1151 if (!path)
1101 return -ENOMEM; 1152 return -ENOMEM;
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1330 root->fs_info->avail_system_alloc_bits | 1381 root->fs_info->avail_system_alloc_bits |
1331 root->fs_info->avail_metadata_alloc_bits; 1382 root->fs_info->avail_metadata_alloc_bits;
1332 1383
1333 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1384 num_devices = root->fs_info->fs_devices->num_devices;
1334 root->fs_info->fs_devices->num_devices <= 4) { 1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1386 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1387 WARN_ON(num_devices < 1);
1388 num_devices--;
1389 }
1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1391
1392 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1335 printk(KERN_ERR "btrfs: unable to go below four devices " 1393 printk(KERN_ERR "btrfs: unable to go below four devices "
1336 "on raid10\n"); 1394 "on raid10\n");
1337 ret = -EINVAL; 1395 ret = -EINVAL;
1338 goto out; 1396 goto out;
1339 } 1397 }
1340 1398
1341 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1399 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1342 root->fs_info->fs_devices->num_devices <= 2) {
1343 printk(KERN_ERR "btrfs: unable to go below two " 1400 printk(KERN_ERR "btrfs: unable to go below two "
1344 "devices on raid1\n"); 1401 "devices on raid1\n");
1345 ret = -EINVAL; 1402 ret = -EINVAL;
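
Both this removal path and the balance path further down apply the same correction: while a replace runs, the target device inflates num_devices by one, so the RAID-level minimums are checked against the decremented count. The rule as a standalone function; replace_running stands in for btrfs_dev_replace_is_ongoing() called under the dev_replace lock.

#include <stdio.h>

static unsigned long long effective_devices(unsigned long long num_devices,
                                            int replace_running)
{
        if (replace_running && num_devices >= 1)
                num_devices--;          /* don't count the replace target */
        return num_devices;
}

static int can_remove_device(unsigned long long num_devices,
                             int replace_running, int raid10, int raid1)
{
        unsigned long long n = effective_devices(num_devices, replace_running);

        if (raid10 && n <= 4)
                return 0;       /* removal would take raid10 below four */
        if (raid1 && n <= 2)
                return 0;       /* removal would take raid1 below two */
        return 1;
}

int main(void)
{
        /* 5 devices, but one is a replace target: raid10 removal refused */
        printf("%d\n", can_remove_device(5, 1, 1, 0));  /* 0 */
        printf("%d\n", can_remove_device(5, 0, 1, 0));  /* 1 */
        return 0;
}
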
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1357 * is held. 1414 * is held.
1358 */ 1415 */
1359 list_for_each_entry(tmp, devices, dev_list) { 1416 list_for_each_entry(tmp, devices, dev_list) {
1360 if (tmp->in_fs_metadata && !tmp->bdev) { 1417 if (tmp->in_fs_metadata &&
1418 !tmp->is_tgtdev_for_dev_replace &&
1419 !tmp->bdev) {
1361 device = tmp; 1420 device = tmp;
1362 break; 1421 break;
1363 } 1422 }
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1371 goto out; 1430 goto out;
1372 } 1431 }
1373 } else { 1432 } else {
1374 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1375 root->fs_info->bdev_holder);
1376 if (IS_ERR(bdev)) {
1377 ret = PTR_ERR(bdev);
1378 goto out;
1379 }
1380
1381 set_blocksize(bdev, 4096);
1382 invalidate_bdev(bdev);
1383 bh = btrfs_read_dev_super(bdev);
1384 if (!bh) {
1385 ret = -EINVAL;
1386 goto error_close;
1387 }
1433 ret = btrfs_get_bdev_and_sb(device_path,
1434 FMODE_READ | FMODE_EXCL,
1435 root->fs_info->bdev_holder, 0,
1436 &bdev, &bh);
1437 if (ret)
1438 goto out;
1388 disk_super = (struct btrfs_super_block *)bh->b_data; 1439 disk_super = (struct btrfs_super_block *)bh->b_data;
1389 devid = btrfs_stack_device_id(&disk_super->dev_item); 1440 devid = btrfs_stack_device_id(&disk_super->dev_item);
1390 dev_uuid = disk_super->dev_item.uuid; 1441 dev_uuid = disk_super->dev_item.uuid;
1391 device = btrfs_find_device(root, devid, dev_uuid, 1442 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1392 disk_super->fsid); 1443 disk_super->fsid);
1393 if (!device) { 1444 if (!device) {
1394 ret = -ENOENT; 1445 ret = -ENOENT;
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1396 } 1447 }
1397 } 1448 }
1398 1449
1450 if (device->is_tgtdev_for_dev_replace) {
1451 pr_err("btrfs: unable to remove the dev_replace target dev\n");
1452 ret = -EINVAL;
1453 goto error_brelse;
1454 }
1455
1399 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1456 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1400 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1457 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1401 "device\n"); 1458 "device\n");
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1415 if (ret) 1472 if (ret)
1416 goto error_undo; 1473 goto error_undo;
1417 1474
1475 /*
1476 * TODO: the superblock still includes this device in its num_devices
1477 * counter although write_all_supers() is not locked out. This
1478 * could give a filesystem state which requires a degraded mount.
1479 */
1418 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1480 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1419 if (ret) 1481 if (ret)
1420 goto error_undo; 1482 goto error_undo;
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1425 spin_unlock(&root->fs_info->free_chunk_lock); 1487 spin_unlock(&root->fs_info->free_chunk_lock);
1426 1488
1427 device->in_fs_metadata = 0; 1489 device->in_fs_metadata = 0;
1428 btrfs_scrub_cancel_dev(root, device); 1490 btrfs_scrub_cancel_dev(root->fs_info, device);
1429 1491
1430 /* 1492 /*
1431 * the device list mutex makes sure that we don't change 1493 * the device list mutex makes sure that we don't change
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1482 * at this point, the device is zero sized. We want to 1544 * at this point, the device is zero sized. We want to
1483 * remove it from the devices list and zero out the old super 1545 * remove it from the devices list and zero out the old super
1484 */ 1546 */
1485 if (clear_super) { 1547 if (clear_super && disk_super) {
1486 /* make sure this device isn't detected as part of 1548 /* make sure this device isn't detected as part of
1487 * the FS anymore 1549 * the FS anymore
1488 */ 1550 */
@@ -1493,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1493 1555
1494 ret = 0; 1556 ret = 0;
1495 1557
1558 /* Notify udev that device has changed */
1559 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1560
1496error_brelse: 1561error_brelse:
1497 brelse(bh); 1562 brelse(bh);
1498error_close:
1499 if (bdev) 1563 if (bdev)
1500 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1564 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1501out: 1565out:
@@ -1512,6 +1576,112 @@ error_undo:
1512 goto error_brelse; 1576 goto error_brelse;
1513} 1577}
1514 1578
1579void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1580 struct btrfs_device *srcdev)
1581{
1582 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1583 list_del_rcu(&srcdev->dev_list);
1584 list_del_rcu(&srcdev->dev_alloc_list);
1585 fs_info->fs_devices->num_devices--;
1586 if (srcdev->missing) {
1587 fs_info->fs_devices->missing_devices--;
1588 fs_info->fs_devices->rw_devices++;
1589 }
1590 if (srcdev->can_discard)
1591 fs_info->fs_devices->num_can_discard--;
1592 if (srcdev->bdev)
1593 fs_info->fs_devices->open_devices--;
1594
1595 call_rcu(&srcdev->rcu, free_device);
1596}
1597
1598void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1599 struct btrfs_device *tgtdev)
1600{
1601 struct btrfs_device *next_device;
1602
1603 WARN_ON(!tgtdev);
1604 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1605 if (tgtdev->bdev) {
1606 btrfs_scratch_superblock(tgtdev);
1607 fs_info->fs_devices->open_devices--;
1608 }
1609 fs_info->fs_devices->num_devices--;
1610 if (tgtdev->can_discard)
1611 fs_info->fs_devices->num_can_discard++;
1612
1613 next_device = list_entry(fs_info->fs_devices->devices.next,
1614 struct btrfs_device, dev_list);
1615 if (tgtdev->bdev == fs_info->sb->s_bdev)
1616 fs_info->sb->s_bdev = next_device->bdev;
1617 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1618 fs_info->fs_devices->latest_bdev = next_device->bdev;
1619 list_del_rcu(&tgtdev->dev_list);
1620
1621 call_rcu(&tgtdev->rcu, free_device);
1622
1623 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1624}
1625
1626int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1627 struct btrfs_device **device)
1628{
1629 int ret = 0;
1630 struct btrfs_super_block *disk_super;
1631 u64 devid;
1632 u8 *dev_uuid;
1633 struct block_device *bdev;
1634 struct buffer_head *bh;
1635
1636 *device = NULL;
1637 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1638 root->fs_info->bdev_holder, 0, &bdev, &bh);
1639 if (ret)
1640 return ret;
1641 disk_super = (struct btrfs_super_block *)bh->b_data;
1642 devid = btrfs_stack_device_id(&disk_super->dev_item);
1643 dev_uuid = disk_super->dev_item.uuid;
1644 *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1645 disk_super->fsid);
1646 brelse(bh);
1647 if (!*device)
1648 ret = -ENOENT;
1649 blkdev_put(bdev, FMODE_READ);
1650 return ret;
1651}
1652
1653int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1654 char *device_path,
1655 struct btrfs_device **device)
1656{
1657 *device = NULL;
1658 if (strcmp(device_path, "missing") == 0) {
1659 struct list_head *devices;
1660 struct btrfs_device *tmp;
1661
1662 devices = &root->fs_info->fs_devices->devices;
1663 /*
1664 * It is safe to read the devices since the volume_mutex
1665 * is held by the caller.
1666 */
1667 list_for_each_entry(tmp, devices, dev_list) {
1668 if (tmp->in_fs_metadata && !tmp->bdev) {
1669 *device = tmp;
1670 break;
1671 }
1672 }
1673
1674 if (!*device) {
1675 pr_err("btrfs: no missing device found\n");
1676 return -ENOENT;
1677 }
1678
1679 return 0;
1680 } else {
1681 return btrfs_find_device_by_path(root, device_path, device);
1682 }
1683}
1684
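
btrfs_find_device_missing_or_by_path() gives callers one entry point: the literal argument "missing" selects the first device that the metadata records but that has no backing block device, and anything else resolves as a path. The dispatch shape in a self-contained sketch; the structs and helpers below are placeholders, not btrfs functions.

#include <stdio.h>
#include <string.h>

struct device { const char *path; int in_fs_metadata; int has_bdev; };

static struct device *find_missing(struct device *devs, int n)
{
        for (int i = 0; i < n; i++)
                if (devs[i].in_fs_metadata && !devs[i].has_bdev)
                        return &devs[i];
        return NULL;
}

static struct device *find_by_path(struct device *devs, int n, const char *p)
{
        for (int i = 0; i < n; i++)
                if (strcmp(devs[i].path, p) == 0)
                        return &devs[i];
        return NULL;
}

static struct device *lookup(struct device *devs, int n, const char *arg)
{
        if (strcmp(arg, "missing") == 0)
                return find_missing(devs, n);
        return find_by_path(devs, n, arg);
}

int main(void)
{
        struct device devs[] = {
                { "/dev/sda", 1, 1 },
                { "/dev/sdb", 1, 0 },   /* in metadata, no backing bdev */
        };
        struct device *d = lookup(devs, 2, "missing");

        printf("missing -> %s\n", d ? d->path : "(none)");
        return 0;
}
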
1515/* 1685/*
1516 * does all the dirty work required for changing file system's UUID. 1686 * does all the dirty work required for changing file system's UUID.
1517 */ 1687 */
@@ -1630,7 +1800,8 @@ next_slot:
1630 read_extent_buffer(leaf, fs_uuid, 1800 read_extent_buffer(leaf, fs_uuid,
1631 (unsigned long)btrfs_device_fsid(dev_item), 1801 (unsigned long)btrfs_device_fsid(dev_item),
1632 BTRFS_UUID_SIZE); 1802 BTRFS_UUID_SIZE);
1633 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1803 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1804 fs_uuid);
1634 BUG_ON(!device); /* Logic error */ 1805 BUG_ON(!device); /* Logic error */
1635 1806
1636 if (device->fs_devices->seeding) { 1807 if (device->fs_devices->seeding) {
@@ -1678,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1678 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1849 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1679 1850
1680 devices = &root->fs_info->fs_devices->devices; 1851 devices = &root->fs_info->fs_devices->devices;
1681 /* 1852
1682 * we have the volume lock, so we don't need the extra 1853 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1683 * device list mutex while reading the list here.
1684 */
1685 list_for_each_entry(device, devices, dev_list) { 1854 list_for_each_entry(device, devices, dev_list) {
1686 if (device->bdev == bdev) { 1855 if (device->bdev == bdev) {
1687 ret = -EEXIST; 1856 ret = -EEXIST;
1857 mutex_unlock(
1858 &root->fs_info->fs_devices->device_list_mutex);
1688 goto error; 1859 goto error;
1689 } 1860 }
1690 } 1861 }
1862 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1691 1863
1692 device = kzalloc(sizeof(*device), GFP_NOFS); 1864 device = kzalloc(sizeof(*device), GFP_NOFS);
1693 if (!device) { 1865 if (!device) {
@@ -1737,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1737 device->dev_root = root->fs_info->dev_root; 1909 device->dev_root = root->fs_info->dev_root;
1738 device->bdev = bdev; 1910 device->bdev = bdev;
1739 device->in_fs_metadata = 1; 1911 device->in_fs_metadata = 1;
1912 device->is_tgtdev_for_dev_replace = 0;
1740 device->mode = FMODE_EXCL; 1913 device->mode = FMODE_EXCL;
1741 set_blocksize(device->bdev, 4096); 1914 set_blocksize(device->bdev, 4096);
1742 1915
@@ -1844,6 +2017,98 @@ error:
1844 return ret; 2017 return ret;
1845} 2018}
1846 2019
2020int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2021 struct btrfs_device **device_out)
2022{
2023 struct request_queue *q;
2024 struct btrfs_device *device;
2025 struct block_device *bdev;
2026 struct btrfs_fs_info *fs_info = root->fs_info;
2027 struct list_head *devices;
2028 struct rcu_string *name;
2029 int ret = 0;
2030
2031 *device_out = NULL;
2032 if (fs_info->fs_devices->seeding)
2033 return -EINVAL;
2034
2035 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2036 fs_info->bdev_holder);
2037 if (IS_ERR(bdev))
2038 return PTR_ERR(bdev);
2039
2040 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2041
2042 devices = &fs_info->fs_devices->devices;
2043 list_for_each_entry(device, devices, dev_list) {
2044 if (device->bdev == bdev) {
2045 ret = -EEXIST;
2046 goto error;
2047 }
2048 }
2049
2050 device = kzalloc(sizeof(*device), GFP_NOFS);
2051 if (!device) {
2052 ret = -ENOMEM;
2053 goto error;
2054 }
2055
2056 name = rcu_string_strdup(device_path, GFP_NOFS);
2057 if (!name) {
2058 kfree(device);
2059 ret = -ENOMEM;
2060 goto error;
2061 }
2062 rcu_assign_pointer(device->name, name);
2063
2064 q = bdev_get_queue(bdev);
2065 if (blk_queue_discard(q))
2066 device->can_discard = 1;
2067 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2068 device->writeable = 1;
2069 device->work.func = pending_bios_fn;
2070 generate_random_uuid(device->uuid);
2071 device->devid = BTRFS_DEV_REPLACE_DEVID;
2072 spin_lock_init(&device->io_lock);
2073 device->generation = 0;
2074 device->io_width = root->sectorsize;
2075 device->io_align = root->sectorsize;
2076 device->sector_size = root->sectorsize;
2077 device->total_bytes = i_size_read(bdev->bd_inode);
2078 device->disk_total_bytes = device->total_bytes;
2079 device->dev_root = fs_info->dev_root;
2080 device->bdev = bdev;
2081 device->in_fs_metadata = 1;
2082 device->is_tgtdev_for_dev_replace = 1;
2083 device->mode = FMODE_EXCL;
2084 set_blocksize(device->bdev, 4096);
2085 device->fs_devices = fs_info->fs_devices;
2086 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2087 fs_info->fs_devices->num_devices++;
2088 fs_info->fs_devices->open_devices++;
2089 if (device->can_discard)
2090 fs_info->fs_devices->num_can_discard++;
2091 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2092
2093 *device_out = device;
2094 return ret;
2095
2096error:
2097 blkdev_put(bdev, FMODE_EXCL);
2098 return ret;
2099}
2100
2101void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2102 struct btrfs_device *tgtdev)
2103{
2104 WARN_ON(fs_info->fs_devices->rw_devices == 0);
2105 tgtdev->io_width = fs_info->dev_root->sectorsize;
2106 tgtdev->io_align = fs_info->dev_root->sectorsize;
2107 tgtdev->sector_size = fs_info->dev_root->sectorsize;
2108 tgtdev->dev_root = fs_info->dev_root;
2109 tgtdev->in_fs_metadata = 1;
2110}
2111
1847static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2112static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1848 struct btrfs_device *device) 2113 struct btrfs_device *device)
1849{ 2114{
@@ -1900,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1900 2165
1901 if (!device->writeable) 2166 if (!device->writeable)
1902 return -EACCES; 2167 return -EACCES;
1903 if (new_size <= device->total_bytes) 2168 if (new_size <= device->total_bytes ||
2169 device->is_tgtdev_for_dev_replace)
1904 return -EINVAL; 2170 return -EINVAL;
1905 2171
1906 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2172 btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2338,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,
2338 return 1; 2604 return 1;
2339} 2605}
2340 2606
2341static u64 div_factor_fine(u64 num, int factor)
2342{
2343 if (factor <= 0)
2344 return 0;
2345 if (factor >= 100)
2346 return num;
2347
2348 num *= factor;
2349 do_div(num, 100);
2350 return num;
2351}
2352
2353static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2607static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2354 struct btrfs_balance_args *bargs) 2608 struct btrfs_balance_args *bargs)
2355{ 2609{
@@ -2514,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root,
2514 return 1; 2768 return 1;
2515} 2769}
2516 2770
2517static u64 div_factor(u64 num, int factor)
2518{
2519 if (factor == 10)
2520 return num;
2521 num *= factor;
2522 do_div(num, 10);
2523 return num;
2524}
2525
2526static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2771static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2527{ 2772{
2528 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2773 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2550,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2550 size_to_free = div_factor(old_size, 1); 2795 size_to_free = div_factor(old_size, 1);
2551 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2796 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2552 if (!device->writeable || 2797 if (!device->writeable ||
2553 device->total_bytes - device->bytes_used > size_to_free) 2798 device->total_bytes - device->bytes_used > size_to_free ||
2799 device->is_tgtdev_for_dev_replace)
2554 continue; 2800 continue;
2555 2801
2556 ret = btrfs_shrink_device(device, old_size - size_to_free); 2802 ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2728,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2728 u64 allowed; 2974 u64 allowed;
2729 int mixed = 0; 2975 int mixed = 0;
2730 int ret; 2976 int ret;
2977 u64 num_devices;
2731 2978
2732 if (btrfs_fs_closing(fs_info) || 2979 if (btrfs_fs_closing(fs_info) ||
2733 atomic_read(&fs_info->balance_pause_req) || 2980 atomic_read(&fs_info->balance_pause_req) ||
@@ -2756,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2756 } 3003 }
2757 } 3004 }
2758 3005
3006 num_devices = fs_info->fs_devices->num_devices;
3007 btrfs_dev_replace_lock(&fs_info->dev_replace);
3008 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3009 BUG_ON(num_devices < 1);
3010 num_devices--;
3011 }
3012 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2759 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3013 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2760 if (fs_info->fs_devices->num_devices == 1) 3014 if (num_devices == 1)
2761 allowed |= BTRFS_BLOCK_GROUP_DUP; 3015 allowed |= BTRFS_BLOCK_GROUP_DUP;
2762 else if (fs_info->fs_devices->num_devices < 4) 3016 else if (num_devices < 4)
2763 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3017 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2764 else 3018 else
2765 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3019 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2902,6 +3156,7 @@ static int balance_kthread(void *data)
2902 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3156 ret = btrfs_balance(fs_info->balance_ctl, NULL);
2903 } 3157 }
2904 3158
3159 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
2905 mutex_unlock(&fs_info->balance_mutex); 3160 mutex_unlock(&fs_info->balance_mutex);
2906 mutex_unlock(&fs_info->volume_mutex); 3161 mutex_unlock(&fs_info->volume_mutex);
2907 3162
@@ -2924,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
2924 return 0; 3179 return 0;
2925 } 3180 }
2926 3181
3182 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
2927 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3183 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
2928 if (IS_ERR(tsk)) 3184 if (IS_ERR(tsk))
2929 return PTR_ERR(tsk); 3185 return PTR_ERR(tsk);
@@ -3080,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3080 u64 old_size = device->total_bytes; 3336 u64 old_size = device->total_bytes;
3081 u64 diff = device->total_bytes - new_size; 3337 u64 diff = device->total_bytes - new_size;
3082 3338
3083 if (new_size >= device->total_bytes) 3339 if (device->is_tgtdev_for_dev_replace)
3084 return -EINVAL; 3340 return -EINVAL;
3085 3341
3086 path = btrfs_alloc_path(); 3342 path = btrfs_alloc_path();
@@ -3235,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3235 return 0; 3491 return 0;
3236} 3492}
3237 3493
3494struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3495 { 2, 1, 0, 4, 2, 2 /* raid10 */ },
3496 { 1, 1, 2, 2, 2, 2 /* raid1 */ },
3497 { 1, 2, 1, 1, 1, 2 /* dup */ },
3498 { 1, 1, 0, 2, 1, 1 /* raid0 */ },
3499 { 1, 1, 0, 1, 1, 1 /* single */ },
3500};
3501
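
btrfs_raid_array indexes the per-profile parameters by RAID type, replacing the if/else ladder that the next hunk deletes; the column order matches struct btrfs_raid_attr as declared in the volumes.h part of this diff. A standalone rendering (the enum ordering raid10/raid1/dup/raid0/single is an assumption of the sketch).

#include <stdio.h>

enum raid_index { RAID10, RAID1, DUP, RAID0, SINGLE, NR_RAID_TYPES };

struct raid_attr {
        int sub_stripes, dev_stripes, devs_max, devs_min,
            devs_increment, ncopies;
};

static const struct raid_attr raid_array[NR_RAID_TYPES] = {
        [RAID10] = { 2, 1, 0, 4, 2, 2 },
        [RAID1]  = { 1, 1, 2, 2, 2, 2 },
        [DUP]    = { 1, 2, 1, 1, 1, 2 },
        [RAID0]  = { 1, 1, 0, 2, 1, 1 },
        [SINGLE] = { 1, 1, 0, 1, 1, 1 },
};

int main(void)
{
        const struct raid_attr *a = &raid_array[RAID1];

        /* devs_max == 0 means "as many devices as possible" */
        printf("raid1: min %d devs, %d copies\n", a->devs_min, a->ncopies);
        return 0;
}
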
3238static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3502static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3239 struct btrfs_root *extent_root, 3503 struct btrfs_root *extent_root,
3240 struct map_lookup **map_ret, 3504 struct map_lookup **map_ret,
@@ -3264,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3264 int ndevs; 3528 int ndevs;
3265 int i; 3529 int i;
3266 int j; 3530 int j;
3531 int index;
3267 3532
3268 BUG_ON(!alloc_profile_is_valid(type, 0)); 3533 BUG_ON(!alloc_profile_is_valid(type, 0));
3269 3534
3270 if (list_empty(&fs_devices->alloc_list)) 3535 if (list_empty(&fs_devices->alloc_list))
3271 return -ENOSPC; 3536 return -ENOSPC;
3272 3537
3273 sub_stripes = 1; 3538 index = __get_raid_index(type);
3274 dev_stripes = 1;
3275 devs_increment = 1;
3276 ncopies = 1;
3277 devs_max = 0; /* 0 == as many as possible */
3278 devs_min = 1;
3279 3539
3280 /* 3540 sub_stripes = btrfs_raid_array[index].sub_stripes;
3281 * define the properties of each RAID type. 3541 dev_stripes = btrfs_raid_array[index].dev_stripes;
3282 * FIXME: move this to a global table and use it in all RAID 3542 devs_max = btrfs_raid_array[index].devs_max;
3283 * calculation code 3543 devs_min = btrfs_raid_array[index].devs_min;
3284 */ 3544 devs_increment = btrfs_raid_array[index].devs_increment;
3285 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3545 ncopies = btrfs_raid_array[index].ncopies;
3286 dev_stripes = 2;
3287 ncopies = 2;
3288 devs_max = 1;
3289 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3290 devs_min = 2;
3291 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3292 devs_increment = 2;
3293 ncopies = 2;
3294 devs_max = 2;
3295 devs_min = 2;
3296 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3297 sub_stripes = 2;
3298 devs_increment = 2;
3299 ncopies = 2;
3300 devs_min = 4;
3301 } else {
3302 devs_max = 1;
3303 }
3304 3546
3305 if (type & BTRFS_BLOCK_GROUP_DATA) { 3547 if (type & BTRFS_BLOCK_GROUP_DATA) {
3306 max_stripe_size = 1024 * 1024 * 1024; 3548 max_stripe_size = 1024 * 1024 * 1024;
@@ -3347,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3347 cur = cur->next; 3589 cur = cur->next;
3348 3590
3349 if (!device->writeable) { 3591 if (!device->writeable) {
3350 printk(KERN_ERR 3592 WARN(1, KERN_ERR
3351 "btrfs: read-only device in alloc_list\n"); 3593 "btrfs: read-only device in alloc_list\n");
3352 WARN_ON(1);
3353 continue; 3594 continue;
3354 } 3595 }
3355 3596
3356 if (!device->in_fs_metadata) 3597 if (!device->in_fs_metadata ||
3598 device->is_tgtdev_for_dev_replace)
3357 continue; 3599 continue;
3358 3600
3359 if (device->total_bytes > device->bytes_used) 3601 if (device->total_bytes > device->bytes_used)
@@ -3382,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3382 devices_info[ndevs].total_avail = total_avail; 3624 devices_info[ndevs].total_avail = total_avail;
3383 devices_info[ndevs].dev = device; 3625 devices_info[ndevs].dev = device;
3384 ++ndevs; 3626 ++ndevs;
3627 WARN_ON(ndevs > fs_devices->rw_devices);
3385 } 3628 }
3386 3629
3387 /* 3630 /*
@@ -3740,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3740 } 3983 }
3741} 3984}
3742 3985
3743int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3986int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
3744{ 3987{
3988 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3745 struct extent_map *em; 3989 struct extent_map *em;
3746 struct map_lookup *map; 3990 struct map_lookup *map;
3747 struct extent_map_tree *em_tree = &map_tree->map_tree; 3991 struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3761,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3761 else 4005 else
3762 ret = 1; 4006 ret = 1;
3763 free_extent_map(em); 4007 free_extent_map(em);
4008
4009 btrfs_dev_replace_lock(&fs_info->dev_replace);
4010 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4011 ret++;
4012 btrfs_dev_replace_unlock(&fs_info->dev_replace);
4013
3764 return ret; 4014 return ret;
3765} 4015}
3766 4016
3767static int find_live_mirror(struct map_lookup *map, int first, int num, 4017static int find_live_mirror(struct btrfs_fs_info *fs_info,
3768 int optimal) 4018 struct map_lookup *map, int first, int num,
4019 int optimal, int dev_replace_is_ongoing)
3769{ 4020{
3770 int i;
3771 if (map->stripes[optimal].dev->bdev)
3772 return optimal;
3773 for (i = first; i < first + num; i++) {
3774 if (map->stripes[i].dev->bdev)
3775 return i;
3776 }
4021 int i;
4022 int tolerance;
4023 struct btrfs_device *srcdev;
4024
4025 if (dev_replace_is_ongoing &&
4026 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4027 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4028 srcdev = fs_info->dev_replace.srcdev;
4029 else
4030 srcdev = NULL;
4031
4032 /*
4033 * try to avoid the drive that is the source drive for a
4034 * dev-replace procedure, only choose it if no other non-missing
4035 * mirror is available
4036 */
4037 for (tolerance = 0; tolerance < 2; tolerance++) {
4038 if (map->stripes[optimal].dev->bdev &&
4039 (tolerance || map->stripes[optimal].dev != srcdev))
4040 return optimal;
4041 for (i = first; i < first + num; i++) {
4042 if (map->stripes[i].dev->bdev &&
4043 (tolerance || map->stripes[i].dev != srcdev))
4044 return i;
4045 }
4046 }
4047
3777 /* we couldn't find one that doesn't fail. Just return something 4048 /* we couldn't find one that doesn't fail. Just return something
3778 * and the io error handling code will clean up eventually 4049 * and the io error handling code will clean up eventually
3779 */ 4050 */
3780 return optimal; 4051 return optimal;
3781} 4052}
3782 4053
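
The rewritten find_live_mirror() makes mirror choice a two-pass search: pass 0 accepts only live stripes that are not the replace source, pass 1 relaxes that and takes any live stripe, so the source drive is read only as a last resort. The same selection over plain arrays, with indices standing in for device pointers.

#include <stdio.h>

static int pick_mirror(const int live[], int n, int optimal, int srcdev)
{
        for (int tolerance = 0; tolerance < 2; tolerance++) {
                if (live[optimal] && (tolerance || optimal != srcdev))
                        return optimal;
                for (int i = 0; i < n; i++)
                        if (live[i] && (tolerance || i != srcdev))
                                return i;
        }
        return optimal;         /* let the IO error path sort it out */
}

int main(void)
{
        int live[2] = { 1, 1 };

        /* stripe 0 is preferred but is the replace source: pass 0
         * steers the read to stripe 1 instead */
        printf("picked %d\n", pick_mirror(live, 2, 0, 0));
        return 0;
}
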
3783static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4054static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
3784 u64 logical, u64 *length, 4055 u64 logical, u64 *length,
3785 struct btrfs_bio **bbio_ret, 4056 struct btrfs_bio **bbio_ret,
3786 int mirror_num) 4057 int mirror_num)
3787{ 4058{
3788 struct extent_map *em; 4059 struct extent_map *em;
3789 struct map_lookup *map; 4060 struct map_lookup *map;
4061 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3790 struct extent_map_tree *em_tree = &map_tree->map_tree; 4062 struct extent_map_tree *em_tree = &map_tree->map_tree;
3791 u64 offset; 4063 u64 offset;
3792 u64 stripe_offset; 4064 u64 stripe_offset;
@@ -3800,6 +4072,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3800 int num_stripes; 4072 int num_stripes;
3801 int max_errors = 0; 4073 int max_errors = 0;
3802 struct btrfs_bio *bbio = NULL; 4074 struct btrfs_bio *bbio = NULL;
4075 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4076 int dev_replace_is_ongoing = 0;
4077 int num_alloc_stripes;
4078 int patch_the_first_stripe_for_dev_replace = 0;
4079 u64 physical_to_patch_in_first_stripe = 0;
3803 4080
3804 read_lock(&em_tree->lock); 4081 read_lock(&em_tree->lock);
3805 em = lookup_extent_mapping(em_tree, logical, *length); 4082 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -3816,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3816 map = (struct map_lookup *)em->bdev; 4093 map = (struct map_lookup *)em->bdev;
3817 offset = logical - em->start; 4094 offset = logical - em->start;
3818 4095
3819 if (mirror_num > map->num_stripes)
3820 mirror_num = 0;
3821
3822 stripe_nr = offset; 4096 stripe_nr = offset;
3823 /* 4097 /*
3824 * stripe_nr counts the total number of stripes we have to stride 4098 * stripe_nr counts the total number of stripes we have to stride
@@ -3845,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3845 if (!bbio_ret) 4119 if (!bbio_ret)
3846 goto out; 4120 goto out;
3847 4121
4122 btrfs_dev_replace_lock(dev_replace);
4123 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4124 if (!dev_replace_is_ongoing)
4125 btrfs_dev_replace_unlock(dev_replace);
4126
4127 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4128 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4129 dev_replace->tgtdev != NULL) {
4130 /*
4131 * in dev-replace case, for repair case (that's the only
4132 * case where the mirror is selected explicitly when
4133 * calling btrfs_map_block), blocks left of the left cursor
4134 * can also be read from the target drive.
4135 * For REQ_GET_READ_MIRRORS, the target drive is added as
4136 * the last one to the array of stripes. For READ, it also
4137 * needs to be supported using the same mirror number.
4138 * If the requested block is not left of the left cursor,
4139 * EIO is returned. This can happen because btrfs_num_copies()
4140 * returns one more in the dev-replace case.
4141 */
4142 u64 tmp_length = *length;
4143 struct btrfs_bio *tmp_bbio = NULL;
4144 int tmp_num_stripes;
4145 u64 srcdev_devid = dev_replace->srcdev->devid;
4146 int index_srcdev = 0;
4147 int found = 0;
4148 u64 physical_of_found = 0;
4149
4150 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4151 logical, &tmp_length, &tmp_bbio, 0);
4152 if (ret) {
4153 WARN_ON(tmp_bbio != NULL);
4154 goto out;
4155 }
4156
4157 tmp_num_stripes = tmp_bbio->num_stripes;
4158 if (mirror_num > tmp_num_stripes) {
4159 /*
4160 * REQ_GET_READ_MIRRORS does not contain this
4161 * mirror, that means that the requested area
4162 * is not left of the left cursor
4163 */
4164 ret = -EIO;
4165 kfree(tmp_bbio);
4166 goto out;
4167 }
4168
4169 /*
4170 * process the rest of the function using the mirror_num
4171 * of the source drive. Therefore look it up first.
4172 * At the end, patch the device pointer to the one of the
4173 * target drive.
4174 */
4175 for (i = 0; i < tmp_num_stripes; i++) {
4176 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4177 /*
4178 * In case of DUP, in order to keep it
4179 * simple, only add the mirror with the
4180 * lowest physical address
4181 */
4182 if (found &&
4183 physical_of_found <=
4184 tmp_bbio->stripes[i].physical)
4185 continue;
4186 index_srcdev = i;
4187 found = 1;
4188 physical_of_found =
4189 tmp_bbio->stripes[i].physical;
4190 }
4191 }
4192
4193 if (found) {
4194 mirror_num = index_srcdev + 1;
4195 patch_the_first_stripe_for_dev_replace = 1;
4196 physical_to_patch_in_first_stripe = physical_of_found;
4197 } else {
4198 WARN_ON(1);
4199 ret = -EIO;
4200 kfree(tmp_bbio);
4201 goto out;
4202 }
4203
4204 kfree(tmp_bbio);
4205 } else if (mirror_num > map->num_stripes) {
4206 mirror_num = 0;
4207 }
4208
3848 num_stripes = 1; 4209 num_stripes = 1;
3849 stripe_index = 0; 4210 stripe_index = 0;
3850 stripe_nr_orig = stripe_nr; 4211 stripe_nr_orig = stripe_nr;
@@ -3859,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3859 stripe_nr_end - stripe_nr_orig); 4220 stripe_nr_end - stripe_nr_orig);
3860 stripe_index = do_div(stripe_nr, map->num_stripes); 4221 stripe_index = do_div(stripe_nr, map->num_stripes);
3861 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4222 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3862 if (rw & (REQ_WRITE | REQ_DISCARD)) 4223 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
3863 num_stripes = map->num_stripes; 4224 num_stripes = map->num_stripes;
3864 else if (mirror_num) 4225 else if (mirror_num)
3865 stripe_index = mirror_num - 1; 4226 stripe_index = mirror_num - 1;
3866 else { 4227 else {
3867 stripe_index = find_live_mirror(map, 0, 4228 stripe_index = find_live_mirror(fs_info, map, 0,
3868 map->num_stripes, 4229 map->num_stripes,
3869 current->pid % map->num_stripes); 4230 current->pid % map->num_stripes,
4231 dev_replace_is_ongoing);
3870 mirror_num = stripe_index + 1; 4232 mirror_num = stripe_index + 1;
3871 } 4233 }
3872 4234
3873 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4235 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3874 if (rw & (REQ_WRITE | REQ_DISCARD)) { 4236 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
3875 num_stripes = map->num_stripes; 4237 num_stripes = map->num_stripes;
3876 } else if (mirror_num) { 4238 } else if (mirror_num) {
3877 stripe_index = mirror_num - 1; 4239 stripe_index = mirror_num - 1;
@@ -3885,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3885 stripe_index = do_div(stripe_nr, factor); 4247 stripe_index = do_div(stripe_nr, factor);
3886 stripe_index *= map->sub_stripes; 4248 stripe_index *= map->sub_stripes;
3887 4249
3888 if (rw & REQ_WRITE) 4250 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
3889 num_stripes = map->sub_stripes; 4251 num_stripes = map->sub_stripes;
3890 else if (rw & REQ_DISCARD) 4252 else if (rw & REQ_DISCARD)
3891 num_stripes = min_t(u64, map->sub_stripes * 4253 num_stripes = min_t(u64, map->sub_stripes *
@@ -3895,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3895 stripe_index += mirror_num - 1; 4257 stripe_index += mirror_num - 1;
3896 else { 4258 else {
3897 int old_stripe_index = stripe_index; 4259 int old_stripe_index = stripe_index;
3898 stripe_index = find_live_mirror(map, stripe_index, 4260 stripe_index = find_live_mirror(fs_info, map,
4261 stripe_index,
3899 map->sub_stripes, stripe_index + 4262 map->sub_stripes, stripe_index +
3900 current->pid % map->sub_stripes); 4263 current->pid % map->sub_stripes,
4264 dev_replace_is_ongoing);
3901 mirror_num = stripe_index - old_stripe_index + 1; 4265 mirror_num = stripe_index - old_stripe_index + 1;
3902 } 4266 }
3903 } else { 4267 } else {
@@ -3911,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3911 } 4275 }
3912 BUG_ON(stripe_index >= map->num_stripes); 4276 BUG_ON(stripe_index >= map->num_stripes);
3913 4277
3914 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 4278 num_alloc_stripes = num_stripes;
4279 if (dev_replace_is_ongoing) {
4280 if (rw & (REQ_WRITE | REQ_DISCARD))
4281 num_alloc_stripes <<= 1;
4282 if (rw & REQ_GET_READ_MIRRORS)
4283 num_alloc_stripes++;
4284 }
4285 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
3915 if (!bbio) { 4286 if (!bbio) {
3916 ret = -ENOMEM; 4287 ret = -ENOMEM;
3917 goto out; 4288 goto out;
@@ -3998,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3998 } 4369 }
3999 } 4370 }
4000 4371
4001 if (rw & REQ_WRITE) { 4372 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4002 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4373 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4003 BTRFS_BLOCK_GROUP_RAID10 | 4374 BTRFS_BLOCK_GROUP_RAID10 |
4004 BTRFS_BLOCK_GROUP_DUP)) { 4375 BTRFS_BLOCK_GROUP_DUP)) {
@@ -4006,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
4006 } 4377 }
4007 } 4378 }
4008 4379
4380 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
4381 dev_replace->tgtdev != NULL) {
4382 int index_where_to_add;
4383 u64 srcdev_devid = dev_replace->srcdev->devid;
4384
4385 /*
4386 * duplicate the write operations while the dev replace
4387 * procedure is running. Since the copying of the old disk
4388 * to the new disk takes place at run time while the
4389 * filesystem is mounted writable, the regular write
4390 * operations to the old disk have to be duplicated to go
4391 * to the new disk as well.
4392 * Note that device->missing is handled by the caller, and
4393 * that the write to the old disk is already set up in the
4394 * stripes array.
4395 */
4396 index_where_to_add = num_stripes;
4397 for (i = 0; i < num_stripes; i++) {
4398 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4399 /* write to new disk, too */
4400 struct btrfs_bio_stripe *new =
4401 bbio->stripes + index_where_to_add;
4402 struct btrfs_bio_stripe *old =
4403 bbio->stripes + i;
4404
4405 new->physical = old->physical;
4406 new->length = old->length;
4407 new->dev = dev_replace->tgtdev;
4408 index_where_to_add++;
4409 max_errors++;
4410 }
4411 }
4412 num_stripes = index_where_to_add;
4413 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
4414 dev_replace->tgtdev != NULL) {
4415 u64 srcdev_devid = dev_replace->srcdev->devid;
4416 int index_srcdev = 0;
4417 int found = 0;
4418 u64 physical_of_found = 0;
4419
4420 /*
4421 * During the dev-replace procedure, the target drive can
4422 * also be used to read data in case it is needed to repair
4423 * a corrupt block elsewhere. This is possible if the
4424 * requested area is left of the left cursor. In this area,
4425 * the target drive is a full copy of the source drive.
4426 */
4427 for (i = 0; i < num_stripes; i++) {
4428 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4429 /*
4430 * In case of DUP, in order to keep it
4431 * simple, only add the mirror with the
4432 * lowest physical address
4433 */
4434 if (found &&
4435 physical_of_found <=
4436 bbio->stripes[i].physical)
4437 continue;
4438 index_srcdev = i;
4439 found = 1;
4440 physical_of_found = bbio->stripes[i].physical;
4441 }
4442 }
4443 if (found) {
4444 u64 length = map->stripe_len;
4445
4446 if (physical_of_found + length <=
4447 dev_replace->cursor_left) {
4448 struct btrfs_bio_stripe *tgtdev_stripe =
4449 bbio->stripes + num_stripes;
4450
4451 tgtdev_stripe->physical = physical_of_found;
4452 tgtdev_stripe->length =
4453 bbio->stripes[index_srcdev].length;
4454 tgtdev_stripe->dev = dev_replace->tgtdev;
4455
4456 num_stripes++;
4457 }
4458 }
4459 }
4460
4009 *bbio_ret = bbio; 4461 *bbio_ret = bbio;
4010 bbio->num_stripes = num_stripes; 4462 bbio->num_stripes = num_stripes;
4011 bbio->max_errors = max_errors; 4463 bbio->max_errors = max_errors;
4012 bbio->mirror_num = mirror_num; 4464 bbio->mirror_num = mirror_num;
4465
4466 /*
4467 * this is the case that REQ_READ && dev_replace_is_ongoing &&
4468 * mirror_num == num_stripes + 1 && dev_replace target drive is
4469 * available as a mirror
4470 */
4471 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
4472 WARN_ON(num_stripes > 1);
4473 bbio->stripes[0].dev = dev_replace->tgtdev;
4474 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4475 bbio->mirror_num = map->num_stripes + 1;
4476 }
4013out: 4477out:
4478 if (dev_replace_is_ongoing)
4479 btrfs_dev_replace_unlock(dev_replace);
4014 free_extent_map(em); 4480 free_extent_map(em);
4015 return ret; 4481 return ret;
4016} 4482}
4017 4483
4018int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4484int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4019 u64 logical, u64 *length, 4485 u64 logical, u64 *length,
4020 struct btrfs_bio **bbio_ret, int mirror_num) 4486 struct btrfs_bio **bbio_ret, int mirror_num)
4021{ 4487{
4022 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 4488 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4023 mirror_num); 4489 mirror_num);
4024} 4490}
4025 4491
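
The first dev-replace branch in __btrfs_map_block() above implements write fan-out: each stripe that targets the source device gets a twin appended that points at the target device, and max_errors rises with every twin because the extra copy is best-effort until the replace commits. Reduced to arrays, with devids standing in for device pointers and cap standing in for the over-allocation done via num_alloc_stripes.

#include <stdio.h>

struct stripe { int devid; unsigned long long physical; };

static int duplicate_writes(struct stripe *s, int num, int cap,
                            int srcdev, int tgtdev, int *max_errors)
{
        int add = num;

        for (int i = 0; i < num && add < cap; i++) {
                if (s[i].devid != srcdev)
                        continue;
                s[add] = s[i];          /* same physical, new device */
                s[add].devid = tgtdev;
                add++;
                (*max_errors)++;
        }
        return add;                     /* new num_stripes */
}

int main(void)
{
        struct stripe s[4] = { { 1, 0 }, { 2, 0 } };
        int max_errors = 0;
        int n = duplicate_writes(s, 2, 4, /*srcdev=*/1, /*tgtdev=*/9,
                                 &max_errors);

        printf("stripes %d (dev 1 mirrored to dev 9), max_errors %d\n",
               n, max_errors);
        return 0;
}
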
@@ -4238,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
4238 &device->work); 4704 &device->work);
4239} 4705}
4240 4706
4707static int bio_size_ok(struct block_device *bdev, struct bio *bio,
4708 sector_t sector)
4709{
4710 struct bio_vec *prev;
4711 struct request_queue *q = bdev_get_queue(bdev);
4712 unsigned short max_sectors = queue_max_sectors(q);
4713 struct bvec_merge_data bvm = {
4714 .bi_bdev = bdev,
4715 .bi_sector = sector,
4716 .bi_rw = bio->bi_rw,
4717 };
4718
4719 if (bio->bi_vcnt == 0) {
4720 WARN_ON(1);
4721 return 1;
4722 }
4723
4724 prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
4725 if ((bio->bi_size >> 9) > max_sectors)
4726 return 0;
4727
4728 if (!q->merge_bvec_fn)
4729 return 1;
4730
4731 bvm.bi_size = bio->bi_size - prev->bv_len;
4732 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
4733 return 0;
4734 return 1;
4735}
4736
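
bio_size_ok() asks two questions: does the bio fit the queue's per-request sector budget, and, if the driver installed merge_bvec_fn, does that callback still accept the final vector. A userspace model with a plain function pointer in place of the request-queue hook; the byte/sector arithmetic follows the code above.

#include <stdio.h>

struct queue {
        unsigned short max_sectors;
        int (*merge_fn)(unsigned int bio_bytes_without_last,
                        unsigned int last_len);
};

static int bio_size_ok(const struct queue *q, unsigned int bio_bytes,
                       unsigned int last_vec_len)
{
        if ((bio_bytes >> 9) > q->max_sectors)
                return 0;
        if (!q->merge_fn)
                return 1;
        /* merge callback returning less than the last vector rejects it */
        return q->merge_fn(bio_bytes - last_vec_len, last_vec_len)
               >= (int)last_vec_len;
}

int main(void)
{
        struct queue q = { .max_sectors = 256, .merge_fn = NULL };

        printf("%d\n", bio_size_ok(&q, 128 << 9, 4096));   /* fits: 1 */
        printf("%d\n", bio_size_ok(&q, 512 << 9, 4096));   /* too big: 0 */
        return 0;
}
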
4737static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4738 struct bio *bio, u64 physical, int dev_nr,
4739 int rw, int async)
4740{
4741 struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
4742
4743 bio->bi_private = bbio;
4744 bio->bi_private = merge_stripe_index_into_bio_private(
4745 bio->bi_private, (unsigned int)dev_nr);
4746 bio->bi_end_io = btrfs_end_bio;
4747 bio->bi_sector = physical >> 9;
4748#ifdef DEBUG
4749 {
4750 struct rcu_string *name;
4751
4752 rcu_read_lock();
4753 name = rcu_dereference(dev->name);
4754 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4755 "(%s id %llu), size=%u\n", rw,
4756 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4757 name->str, dev->devid, bio->bi_size);
4758 rcu_read_unlock();
4759 }
4760#endif
4761 bio->bi_bdev = dev->bdev;
4762 if (async)
4763 schedule_bio(root, dev, rw, bio);
4764 else
4765 btrfsic_submit_bio(rw, bio);
4766}
4767
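
submit_stripe_bio() reuses merge_stripe_index_into_bio_private(), defined earlier in this file; judging by its name and use, it folds the stripe index into spare low bits of the private pointer. The general pointer-tagging idiom it relies on, sketched with two low bits, which assumes at least 4-byte alignment of the pointee; this is an illustration of the idiom, not the btrfs implementation.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static void *tag_ptr(void *p, unsigned int idx)
{
        assert(((uintptr_t)p & 3) == 0 && idx < 4);
        return (void *)((uintptr_t)p | idx);
}

static void *untag_ptr(void *p, unsigned int *idx)
{
        *idx = (uintptr_t)p & 3;
        return (void *)((uintptr_t)p & ~(uintptr_t)3);
}

int main(void)
{
        static long long bbio;          /* stand-in, at least 4-byte aligned */
        unsigned int idx;
        void *restored = untag_ptr(tag_ptr(&bbio, 3), &idx);

        printf("idx %u, restored ok: %d\n", idx, restored == (void *)&bbio);
        return 0;
}
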
4768static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4769 struct bio *first_bio, struct btrfs_device *dev,
4770 int dev_nr, int rw, int async)
4771{
4772 struct bio_vec *bvec = first_bio->bi_io_vec;
4773 struct bio *bio;
4774 int nr_vecs = bio_get_nr_vecs(dev->bdev);
4775 u64 physical = bbio->stripes[dev_nr].physical;
4776
4777again:
4778 bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
4779 if (!bio)
4780 return -ENOMEM;
4781
4782 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
4783 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
4784 bvec->bv_offset) < bvec->bv_len) {
4785 u64 len = bio->bi_size;
4786
4787 atomic_inc(&bbio->stripes_pending);
4788 submit_stripe_bio(root, bbio, bio, physical, dev_nr,
4789 rw, async);
4790 physical += len;
4791 goto again;
4792 }
4793 bvec++;
4794 }
4795
4796 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
4797 return 0;
4798}
4799
4800static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
4801{
4802 atomic_inc(&bbio->error);
4803 if (atomic_dec_and_test(&bbio->stripes_pending)) {
4804 bio->bi_private = bbio->private;
4805 bio->bi_end_io = bbio->end_io;
4806 bio->bi_bdev = (struct block_device *)
4807 (unsigned long)bbio->mirror_num;
4808 bio->bi_sector = logical >> 9;
4809 kfree(bbio);
4810 bio_endio(bio, -EIO);
4811 }
4812}
4813
4241int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 4814int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4242 int mirror_num, int async_submit) 4815 int mirror_num, int async_submit)
4243{ 4816{
4244 struct btrfs_mapping_tree *map_tree;
4245 struct btrfs_device *dev; 4817 struct btrfs_device *dev;
4246 struct bio *first_bio = bio; 4818 struct bio *first_bio = bio;
4247 u64 logical = (u64)bio->bi_sector << 9; 4819 u64 logical = (u64)bio->bi_sector << 9;
@@ -4253,12 +4825,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4253 struct btrfs_bio *bbio = NULL; 4825 struct btrfs_bio *bbio = NULL;
4254 4826
4255 length = bio->bi_size; 4827 length = bio->bi_size;
4256 map_tree = &root->fs_info->mapping_tree;
4257 map_length = length; 4828 map_length = length;
4258 4829
4259 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 4830 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4260 mirror_num); 4831 mirror_num);
4261 if (ret) /* -ENOMEM */ 4832 if (ret)
4262 return ret; 4833 return ret;
4263 4834
4264 total_devs = bbio->num_stripes; 4835 total_devs = bbio->num_stripes;
@@ -4276,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4276 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4847 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4277 4848
4278 while (dev_nr < total_devs) { 4849 while (dev_nr < total_devs) {
4850 dev = bbio->stripes[dev_nr].dev;
4851 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
4852 bbio_error(bbio, first_bio, logical);
4853 dev_nr++;
4854 continue;
4855 }
4856
4857 /*
4858 * Check and see if we're ok with this bio based on its size
4859 * and offset with the given device.
4860 */
4861 if (!bio_size_ok(dev->bdev, first_bio,
4862 bbio->stripes[dev_nr].physical >> 9)) {
4863 ret = breakup_stripe_bio(root, bbio, first_bio, dev,
4864 dev_nr, rw, async_submit);
4865 BUG_ON(ret);
4866 dev_nr++;
4867 continue;
4868 }
4869
4279 if (dev_nr < total_devs - 1) { 4870 if (dev_nr < total_devs - 1) {
4280 bio = bio_clone(first_bio, GFP_NOFS); 4871 bio = bio_clone(first_bio, GFP_NOFS);
4281 BUG_ON(!bio); /* -ENOMEM */ 4872 BUG_ON(!bio); /* -ENOMEM */
4282 } else { 4873 } else {
4283 bio = first_bio; 4874 bio = first_bio;
4284 } 4875 }
4285 bio->bi_private = bbio;
4286 bio->bi_private = merge_stripe_index_into_bio_private(
4287 bio->bi_private, (unsigned int)dev_nr);
4288 bio->bi_end_io = btrfs_end_bio;
4289 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4290 dev = bbio->stripes[dev_nr].dev;
4291 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4292#ifdef DEBUG
4293 struct rcu_string *name;
4294
4295 rcu_read_lock();
4296 name = rcu_dereference(dev->name);
4297 pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
4298 "(%s id %llu), size=%u\n", rw,
4299 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4300 name->str, dev->devid, bio->bi_size);
4301 rcu_read_unlock();
4302#endif
4303 bio->bi_bdev = dev->bdev;
4304 if (async_submit)
4305 schedule_bio(root, dev, rw, bio);
4306 else
4307 btrfsic_submit_bio(rw, bio);
4308 } else {
4309 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4310 bio->bi_sector = logical >> 9;
4311 bio_endio(bio, -EIO);
4312 }
4876
4877 submit_stripe_bio(root, bbio, bio,
4878 bbio->stripes[dev_nr].physical, dev_nr, rw,
4879 async_submit);
4313 dev_nr++; 4880 dev_nr++;
4314 } 4881 }
4315 return 0; 4882 return 0;
4316} 4883}
4317 4884
4318struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 4885struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
4319 u8 *uuid, u8 *fsid) 4886 u8 *uuid, u8 *fsid)
4320{ 4887{
4321 struct btrfs_device *device; 4888 struct btrfs_device *device;
4322 struct btrfs_fs_devices *cur_devices; 4889 struct btrfs_fs_devices *cur_devices;
4323 4890
4324 cur_devices = root->fs_info->fs_devices; 4891 cur_devices = fs_info->fs_devices;
4325 while (cur_devices) { 4892 while (cur_devices) {
4326 if (!fsid || 4893 if (!fsid ||
4327 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 4894 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4402,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4402 em->bdev = (struct block_device *)map; 4969 em->bdev = (struct block_device *)map;
4403 em->start = logical; 4970 em->start = logical;
4404 em->len = length; 4971 em->len = length;
4972 em->orig_start = 0;
4405 em->block_start = 0; 4973 em->block_start = 0;
4406 em->block_len = em->len; 4974 em->block_len = em->len;
4407 4975
@@ -4419,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4419 read_extent_buffer(leaf, uuid, (unsigned long) 4987 read_extent_buffer(leaf, uuid, (unsigned long)
4420 btrfs_stripe_dev_uuid_nr(chunk, i), 4988 btrfs_stripe_dev_uuid_nr(chunk, i),
4421 BTRFS_UUID_SIZE); 4989 BTRFS_UUID_SIZE);
4422 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 4990 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
4423 NULL); 4991 uuid, NULL);
4424 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 4992 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4425 kfree(map); 4993 kfree(map);
4426 free_extent_map(em); 4994 free_extent_map(em);
@@ -4461,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
4461 device->io_align = btrfs_device_io_align(leaf, dev_item); 5029 device->io_align = btrfs_device_io_align(leaf, dev_item);
4462 device->io_width = btrfs_device_io_width(leaf, dev_item); 5030 device->io_width = btrfs_device_io_width(leaf, dev_item);
4463 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 5031 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5032 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5033 device->is_tgtdev_for_dev_replace = 0;
4464 5034
4465 ptr = (unsigned long)btrfs_device_uuid(dev_item); 5035 ptr = (unsigned long)btrfs_device_uuid(dev_item);
4466 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 5036 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4538,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root,
4538 return ret; 5108 return ret;
4539 } 5109 }
4540 5110
4541 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 5111 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
4542 if (!device || !device->bdev) { 5112 if (!device || !device->bdev) {
4543 if (!btrfs_test_opt(root, DEGRADED)) 5113 if (!btrfs_test_opt(root, DEGRADED))
4544 return -EIO; 5114 return -EIO;
@@ -4571,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root,
4571 fill_device_from_item(leaf, dev_item, device); 5141 fill_device_from_item(leaf, dev_item, device);
4572 device->dev_root = root->fs_info->dev_root; 5142 device->dev_root = root->fs_info->dev_root;
4573 device->in_fs_metadata = 1; 5143 device->in_fs_metadata = 1;
4574 if (device->writeable) { 5144 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
4575 device->fs_devices->total_rw_bytes += device->total_bytes; 5145 device->fs_devices->total_rw_bytes += device->total_bytes;
4576 spin_lock(&root->fs_info->free_chunk_lock); 5146 spin_lock(&root->fs_info->free_chunk_lock);
4577 root->fs_info->free_chunk_space += device->total_bytes - 5147 root->fs_info->free_chunk_space += device->total_bytes -
@@ -4930,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4930 int i; 5500 int i;
4931 5501
4932 mutex_lock(&fs_devices->device_list_mutex); 5502 mutex_lock(&fs_devices->device_list_mutex);
4933 dev = btrfs_find_device(root, stats->devid, NULL, NULL); 5503 dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
4934 mutex_unlock(&fs_devices->device_list_mutex); 5504 mutex_unlock(&fs_devices->device_list_mutex);
4935 5505
4936 if (!dev) { 5506 if (!dev) {
@@ -4958,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4958 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 5528 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4959 return 0; 5529 return 0;
4960} 5530}
5531
5532int btrfs_scratch_superblock(struct btrfs_device *device)
5533{
5534 struct buffer_head *bh;
5535 struct btrfs_super_block *disk_super;
5536
5537 bh = btrfs_read_dev_super(device->bdev);
5538 if (!bh)
5539 return -EINVAL;
5540 disk_super = (struct btrfs_super_block *)bh->b_data;
5541
5542 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
5543 set_buffer_dirty(bh);
5544 sync_dirty_buffer(bh);
5545 brelse(bh);
5546
5547 return 0;
5548}
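
The new btrfs_scratch_superblock() above invalidates a device by clearing just the superblock magic rather than wiping the device. A minimal user-space sketch of the same idea follows; the 64KiB super offset and the 64 bytes of csum/fsid/bytenr/flags preceding the magic are assumptions about the on-disk layout, not something this patch states.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int scratch_super(const char *dev_path)
{
	char zeroes[8];
	int fd = open(dev_path, O_WRONLY);

	if (fd < 0)
		return -1;
	memset(zeroes, 0, sizeof(zeroes));
	/* 65536 (assumed super offset) + 64 (assumed magic offset) */
	if (pwrite(fd, zeroes, sizeof(zeroes), 65536 + 64) != sizeof(zeroes)) {
		close(fd);
		return -1;
	}
	fsync(fd);	/* make sure the zeroed magic reaches the media */
	close(fd);
	return 0;
}
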
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53c06af92e8d..d3c3939ac751 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_device {
50 int in_fs_metadata; 50 int in_fs_metadata;
51 int missing; 51 int missing;
52 int can_discard; 52 int can_discard;
53 int is_tgtdev_for_dev_replace;
53 54
54 spinlock_t io_lock; 55 spinlock_t io_lock;
55 56
@@ -88,7 +89,7 @@ struct btrfs_device {
88 u8 uuid[BTRFS_UUID_SIZE]; 89 u8 uuid[BTRFS_UUID_SIZE];
89 90
90 /* per-device scrub information */ 91 /* per-device scrub information */
91 struct scrub_dev *scrub_device; 92 struct scrub_ctx *scrub_device;
92 93
93 struct btrfs_work work; 94 struct btrfs_work work;
94 struct rcu_head rcu; 95 struct rcu_head rcu;
@@ -179,6 +180,15 @@ struct btrfs_device_info {
179 u64 total_avail; 180 u64 total_avail;
180}; 181};
181 182
183struct btrfs_raid_attr {
184 int sub_stripes; /* sub_stripes info for map */
185 int dev_stripes; /* stripes per dev */
186 int devs_max; /* max devs to use */
187 int devs_min; /* min devs needed */
188 int devs_increment; /* ndevs has to be a multiple of this */
189 int ncopies; /* how many copies the data has */
190};
191
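
The new btrfs_raid_attr lets per-profile constants become table-driven rather than scattered through switch statements. A hypothetical initializer showing how such a table could look; the index names and numbers here are illustrative assumptions, not quoted from the btrfs sources.

/* Illustrative values only -- indices and numbers are assumptions. */
enum { MY_RAID10, MY_RAID1, MY_RAID0, MY_SINGLE };

static const struct btrfs_raid_attr demo_raid_attr[] = {
	[MY_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, .devs_max = 0,
			.devs_min = 4, .devs_increment = 2, .ncopies = 2 },
	[MY_RAID1]  = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 2,
			.devs_min = 2, .devs_increment = 2, .ncopies = 2 },
	[MY_RAID0]  = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0,
			.devs_min = 2, .devs_increment = 1, .ncopies = 1 },
	[MY_SINGLE] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 1,
			.devs_min = 1, .devs_increment = 1, .ncopies = 1 },
};
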
182struct map_lookup { 192struct map_lookup {
183 u64 type; 193 u64 type;
184 int io_align; 194 int io_align;
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
248 struct btrfs_device *device, 258 struct btrfs_device *device,
249 u64 chunk_tree, u64 chunk_objectid, 259 u64 chunk_tree, u64 chunk_objectid,
250 u64 chunk_offset, u64 start, u64 num_bytes); 260 u64 chunk_offset, u64 start, u64 num_bytes);
251int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 261int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
252 u64 logical, u64 *length, 262 u64 logical, u64 *length,
253 struct btrfs_bio **bbio_ret, int mirror_num); 263 struct btrfs_bio **bbio_ret, int mirror_num);
254int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 264int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
267int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 277int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
268 struct btrfs_fs_devices **fs_devices_ret); 278 struct btrfs_fs_devices **fs_devices_ret);
269int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 279int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
270void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); 280void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
281 struct btrfs_fs_devices *fs_devices, int step);
282int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
283 char *device_path,
284 struct btrfs_device **device);
285int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
286 struct btrfs_device **device);
271int btrfs_add_device(struct btrfs_trans_handle *trans, 287int btrfs_add_device(struct btrfs_trans_handle *trans,
272 struct btrfs_root *root, 288 struct btrfs_root *root,
273 struct btrfs_device *device); 289 struct btrfs_device *device);
274int btrfs_rm_device(struct btrfs_root *root, char *device_path); 290int btrfs_rm_device(struct btrfs_root *root, char *device_path);
275void btrfs_cleanup_fs_uuids(void); 291void btrfs_cleanup_fs_uuids(void);
276int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 292int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
277int btrfs_grow_device(struct btrfs_trans_handle *trans, 293int btrfs_grow_device(struct btrfs_trans_handle *trans,
278 struct btrfs_device *device, u64 new_size); 294 struct btrfs_device *device, u64 new_size);
279struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 295struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
280 u8 *uuid, u8 *fsid); 296 u8 *uuid, u8 *fsid);
281int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 297int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
282int btrfs_init_new_device(struct btrfs_root *root, char *path); 298int btrfs_init_new_device(struct btrfs_root *root, char *path);
299int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
300 struct btrfs_device **device_out);
283int btrfs_balance(struct btrfs_balance_control *bctl, 301int btrfs_balance(struct btrfs_balance_control *bctl,
284 struct btrfs_ioctl_balance_args *bargs); 302 struct btrfs_ioctl_balance_args *bargs);
285int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); 303int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
296int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 314int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
297int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 315int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
298 struct btrfs_fs_info *fs_info); 316 struct btrfs_fs_info *fs_info);
317void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
318 struct btrfs_device *srcdev);
319void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
320 struct btrfs_device *tgtdev);
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device);
299 324
300static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
301 int index) 326 int index)
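
Several prototypes above switch from taking a btrfs_root to taking the btrfs_fs_info directly, so callers such as the dev-replace code no longer need a root in hand. A sketch of a caller under the new convention; the helper name is made up, and the locking mirrors the btrfs_get_dev_stats hunk earlier in this patch.

/* Hypothetical helper showing the post-patch calling convention. */
static int demo_lookup(struct btrfs_fs_info *fs_info, u64 devid)
{
	struct btrfs_device *dev;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return dev ? 0 : -ENODEV;
}
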
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3f4e2d69e83a..446a6848c554 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
122 */ 122 */
123 if (!value) 123 if (!value)
124 goto out; 124 goto out;
125 } else {
126 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
127 name, name_len, 0);
128 if (IS_ERR(di)) {
129 ret = PTR_ERR(di);
130 goto out;
131 }
132 if (!di && !value)
133 goto out;
134 btrfs_release_path(path);
125 } 135 }
126 136
127again: 137again:
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
198 208
199 inode_inc_iversion(inode); 209 inode_inc_iversion(inode);
200 inode->i_ctime = CURRENT_TIME; 210 inode->i_ctime = CURRENT_TIME;
211 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
201 ret = btrfs_update_inode(trans, root, inode); 212 ret = btrfs_update_inode(trans, root, inode);
202 BUG_ON(ret); 213 BUG_ON(ret);
203out: 214out:
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
265 276
266 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 277 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
267 if (verify_dir_item(root, leaf, di)) 278 if (verify_dir_item(root, leaf, di))
268 continue; 279 goto next;
269 280
270 name_len = btrfs_dir_name_len(leaf, di); 281 name_len = btrfs_dir_name_len(leaf, di);
271 total_size += name_len + 1; 282 total_size += name_len + 1;
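
The new else branch in do_setxattr() probes for an existing xattr before going further, so removing an attribute that does not exist returns early instead of walking into the delete path. A condensed restatement of that decision, wrapped in a demo helper (not kernel code):

/* Demo wrapper restating the hunk's logic; name and return codes are
 * for illustration only. */
static int demo_probe_existing(struct btrfs_root *root,
			       struct btrfs_path *path, u64 ino,
			       const char *name, u16 name_len,
			       const void *value)
{
	struct btrfs_dir_item *di;

	di = btrfs_lookup_xattr(NULL, root, path, ino, name, name_len, 0);
	if (IS_ERR(di))
		return PTR_ERR(di);	/* lookup failed outright */
	if (!di && !value)
		return 0;		/* removing an xattr that isn't there */
	btrfs_release_path(path);	/* found one (or have a value): replace */
	return 1;
}
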
diff --git a/fs/buffer.c b/fs/buffer.c
index b5f044283edb..c017a2dfb909 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46 46
47#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) 47#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 48
49inline void 49void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
50init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51{ 50{
52 bh->b_end_io = handler; 51 bh->b_end_io = handler;
53 bh->b_private = private; 52 bh->b_private = private;
@@ -555,7 +554,7 @@ void emergency_thaw_all(void)
555 */ 554 */
556int sync_mapping_buffers(struct address_space *mapping) 555int sync_mapping_buffers(struct address_space *mapping)
557{ 556{
558 struct address_space *buffer_mapping = mapping->assoc_mapping; 557 struct address_space *buffer_mapping = mapping->private_data;
559 558
560 if (buffer_mapping == NULL || list_empty(&mapping->private_list)) 559 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
561 return 0; 560 return 0;
@@ -588,10 +587,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
588 struct address_space *buffer_mapping = bh->b_page->mapping; 587 struct address_space *buffer_mapping = bh->b_page->mapping;
589 588
590 mark_buffer_dirty(bh); 589 mark_buffer_dirty(bh);
591 if (!mapping->assoc_mapping) { 590 if (!mapping->private_data) {
592 mapping->assoc_mapping = buffer_mapping; 591 mapping->private_data = buffer_mapping;
593 } else { 592 } else {
594 BUG_ON(mapping->assoc_mapping != buffer_mapping); 593 BUG_ON(mapping->private_data != buffer_mapping);
595 } 594 }
596 if (!bh->b_assoc_map) { 595 if (!bh->b_assoc_map) {
597 spin_lock(&buffer_mapping->private_lock); 596 spin_lock(&buffer_mapping->private_lock);
@@ -788,7 +787,7 @@ void invalidate_inode_buffers(struct inode *inode)
788 if (inode_has_buffers(inode)) { 787 if (inode_has_buffers(inode)) {
789 struct address_space *mapping = &inode->i_data; 788 struct address_space *mapping = &inode->i_data;
790 struct list_head *list = &mapping->private_list; 789 struct list_head *list = &mapping->private_list;
791 struct address_space *buffer_mapping = mapping->assoc_mapping; 790 struct address_space *buffer_mapping = mapping->private_data;
792 791
793 spin_lock(&buffer_mapping->private_lock); 792 spin_lock(&buffer_mapping->private_lock);
794 while (!list_empty(list)) 793 while (!list_empty(list))
@@ -811,7 +810,7 @@ int remove_inode_buffers(struct inode *inode)
811 if (inode_has_buffers(inode)) { 810 if (inode_has_buffers(inode)) {
812 struct address_space *mapping = &inode->i_data; 811 struct address_space *mapping = &inode->i_data;
813 struct list_head *list = &mapping->private_list; 812 struct list_head *list = &mapping->private_list;
814 struct address_space *buffer_mapping = mapping->assoc_mapping; 813 struct address_space *buffer_mapping = mapping->private_data;
815 814
816 spin_lock(&buffer_mapping->private_lock); 815 spin_lock(&buffer_mapping->private_lock);
817 while (!list_empty(list)) { 816 while (!list_empty(list)) {
@@ -850,13 +849,10 @@ try_again:
850 if (!bh) 849 if (!bh)
851 goto no_grow; 850 goto no_grow;
852 851
853 bh->b_bdev = NULL;
854 bh->b_this_page = head; 852 bh->b_this_page = head;
855 bh->b_blocknr = -1; 853 bh->b_blocknr = -1;
856 head = bh; 854 head = bh;
857 855
858 bh->b_state = 0;
859 atomic_set(&bh->b_count, 0);
860 bh->b_size = size; 856 bh->b_size = size;
861 857
862 /* Link the buffer to its page */ 858 /* Link the buffer to its page */
@@ -911,6 +907,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
911 attach_page_buffers(page, head); 907 attach_page_buffers(page, head);
912} 908}
913 909
910static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
911{
912 sector_t retval = ~((sector_t)0);
913 loff_t sz = i_size_read(bdev->bd_inode);
914
915 if (sz) {
916 unsigned int sizebits = blksize_bits(size);
917 retval = (sz >> sizebits);
918 }
919 return retval;
920}
921
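
blkdev_max_block() now takes the block size explicitly, so the limit reflects the caller's block size instead of a cached one. Worked numbers under an assumed 4GiB device, as a sketch: blksize_bits(4096) is 12, so the device holds 1048576 4KiB blocks; with 512-byte blocks the same device would yield 8388608.

/* Worked numbers for the shift above (device size is an assumption). */
static sector_t demo_end_block(void)
{
	loff_t sz = 4ULL << 30;				/* 4GiB device */
	unsigned int sizebits = blksize_bits(4096);	/* ilog2(4096) = 12 */

	return sz >> sizebits;				/* 1048576 blocks */
}
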
914/* 922/*
915 * Initialise the state of a blockdev page's buffers. 923 * Initialise the state of a blockdev page's buffers.
916 */ 924 */
@@ -921,7 +929,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
921 struct buffer_head *head = page_buffers(page); 929 struct buffer_head *head = page_buffers(page);
922 struct buffer_head *bh = head; 930 struct buffer_head *bh = head;
923 int uptodate = PageUptodate(page); 931 int uptodate = PageUptodate(page);
924 sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode)); 932 sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
925 933
926 do { 934 do {
927 if (!buffer_mapped(bh)) { 935 if (!buffer_mapped(bh)) {
@@ -1553,6 +1561,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1553EXPORT_SYMBOL(unmap_underlying_metadata); 1561EXPORT_SYMBOL(unmap_underlying_metadata);
1554 1562
1555/* 1563/*
1564 * Size is a power-of-two in the range 512..PAGE_SIZE,
1565 * and the case we care about most is PAGE_SIZE.
1566 *
1567 * So this *could* possibly be written with those
1568 * constraints in mind (relevant mostly if some
1569 * architecture has a slow bit-scan instruction)
1570 */
1571static inline int block_size_bits(unsigned int blocksize)
1572{
1573 return ilog2(blocksize);
1574}
1575
1576static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1577{
1578 BUG_ON(!PageLocked(page));
1579
1580 if (!page_has_buffers(page))
1581 create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1582 return page_buffers(page);
1583}
1584
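
create_page_buffers() snapshots i_blkbits through ACCESS_ONCE so one call cannot observe two different block sizes if the size changes concurrently, and the later hunks re-derive the geometry from the buffers themselves. The per-page pattern they converge on, in a demo helper (the 4KiB size in the comments is an assumption):

/* Demo of the per-page geometry derivation used by the hunks below. */
static void demo_block_geometry(struct buffer_head *head, pgoff_t index)
{
	unsigned int blocksize = head->b_size;		 /* e.g. 4096 */
	unsigned int bbits = block_size_bits(blocksize); /* ilog2(4096) = 12 */
	sector_t block = (sector_t)index << (PAGE_CACHE_SHIFT - bbits);

	(void)block;	/* first block covered by this page */
}
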
1585/*
1556 * NOTE! All mapped/uptodate combinations are valid: 1586 * NOTE! All mapped/uptodate combinations are valid:
1557 * 1587 *
1558 * Mapped Uptodate Meaning 1588 * Mapped Uptodate Meaning
@@ -1589,19 +1619,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1589 sector_t block; 1619 sector_t block;
1590 sector_t last_block; 1620 sector_t last_block;
1591 struct buffer_head *bh, *head; 1621 struct buffer_head *bh, *head;
1592 const unsigned blocksize = 1 << inode->i_blkbits; 1622 unsigned int blocksize, bbits;
1593 int nr_underway = 0; 1623 int nr_underway = 0;
1594 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 1624 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1595 WRITE_SYNC : WRITE); 1625 WRITE_SYNC : WRITE);
1596 1626
1597 BUG_ON(!PageLocked(page)); 1627 head = create_page_buffers(page, inode,
1598
1599 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1600
1601 if (!page_has_buffers(page)) {
1602 create_empty_buffers(page, blocksize,
1603 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1628 (1 << BH_Dirty)|(1 << BH_Uptodate));
1604 }
1605 1629
1606 /* 1630 /*
1607 * Be very careful. We have no exclusion from __set_page_dirty_buffers 1631 * Be very careful. We have no exclusion from __set_page_dirty_buffers
@@ -1613,9 +1637,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1613 * handle that here by just cleaning them. 1637 * handle that here by just cleaning them.
1614 */ 1638 */
1615 1639
1616 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1617 head = page_buffers(page);
1618 bh = head; 1640 bh = head;
1641 blocksize = bh->b_size;
1642 bbits = block_size_bits(blocksize);
1643
1644 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1645 last_block = (i_size_read(inode) - 1) >> bbits;
1619 1646
1620 /* 1647 /*
1621 * Get all the dirty buffers mapped to disk addresses and 1648 * Get all the dirty buffers mapped to disk addresses and
@@ -1806,12 +1833,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1806 BUG_ON(to > PAGE_CACHE_SIZE); 1833 BUG_ON(to > PAGE_CACHE_SIZE);
1807 BUG_ON(from > to); 1834 BUG_ON(from > to);
1808 1835
1809 blocksize = 1 << inode->i_blkbits; 1836 head = create_page_buffers(page, inode, 0);
1810 if (!page_has_buffers(page)) 1837 blocksize = head->b_size;
1811 create_empty_buffers(page, blocksize, 0); 1838 bbits = block_size_bits(blocksize);
1812 head = page_buffers(page);
1813 1839
1814 bbits = inode->i_blkbits;
1815 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); 1840 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1816 1841
1817 for(bh = head, block_start = 0; bh != head || !block_start; 1842 for(bh = head, block_start = 0; bh != head || !block_start;
@@ -1881,11 +1906,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1881 unsigned blocksize; 1906 unsigned blocksize;
1882 struct buffer_head *bh, *head; 1907 struct buffer_head *bh, *head;
1883 1908
1884 blocksize = 1 << inode->i_blkbits; 1909 bh = head = page_buffers(page);
1910 blocksize = bh->b_size;
1885 1911
1886 for(bh = head = page_buffers(page), block_start = 0; 1912 block_start = 0;
1887 bh != head || !block_start; 1913 do {
1888 block_start=block_end, bh = bh->b_this_page) {
1889 block_end = block_start + blocksize; 1914 block_end = block_start + blocksize;
1890 if (block_end <= from || block_start >= to) { 1915 if (block_end <= from || block_start >= to) {
1891 if (!buffer_uptodate(bh)) 1916 if (!buffer_uptodate(bh))
@@ -1895,7 +1920,10 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1895 mark_buffer_dirty(bh); 1920 mark_buffer_dirty(bh);
1896 } 1921 }
1897 clear_buffer_new(bh); 1922 clear_buffer_new(bh);
1898 } 1923
1924 block_start = block_end;
1925 bh = bh->b_this_page;
1926 } while (bh != head);
1899 1927
1900 /* 1928 /*
1901 * If this is a partial write which happened to make all buffers 1929 * If this is a partial write which happened to make all buffers
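
__block_commit_write() above now walks the buffers with the canonical do/while idiom: a page's buffer_heads form a circular singly linked list through b_this_page, so the walk starts at the head and stops when it wraps around. The for loop it replaces encoded the same wrap test less obviously. The bare idiom, as a sketch:

/* The canonical walk of a page's circular buffer_head ring. */
static void demo_for_each_bh(struct page *page)
{
	struct buffer_head *bh, *head;

	bh = head = page_buffers(page);
	do {
		/* per-buffer work goes here */
		bh = bh->b_this_page;
	} while (bh != head);
}
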
@@ -2020,7 +2048,6 @@ EXPORT_SYMBOL(generic_write_end);
2020int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, 2048int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2021 unsigned long from) 2049 unsigned long from)
2022{ 2050{
2023 struct inode *inode = page->mapping->host;
2024 unsigned block_start, block_end, blocksize; 2051 unsigned block_start, block_end, blocksize;
2025 unsigned to; 2052 unsigned to;
2026 struct buffer_head *bh, *head; 2053 struct buffer_head *bh, *head;
@@ -2029,13 +2056,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2029 if (!page_has_buffers(page)) 2056 if (!page_has_buffers(page))
2030 return 0; 2057 return 0;
2031 2058
2032 blocksize = 1 << inode->i_blkbits; 2059 head = page_buffers(page);
2060 blocksize = head->b_size;
2033 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); 2061 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2034 to = from + to; 2062 to = from + to;
2035 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) 2063 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2036 return 0; 2064 return 0;
2037 2065
2038 head = page_buffers(page);
2039 bh = head; 2066 bh = head;
2040 block_start = 0; 2067 block_start = 0;
2041 do { 2068 do {
@@ -2068,18 +2095,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
2068 struct inode *inode = page->mapping->host; 2095 struct inode *inode = page->mapping->host;
2069 sector_t iblock, lblock; 2096 sector_t iblock, lblock;
2070 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 2097 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2071 unsigned int blocksize; 2098 unsigned int blocksize, bbits;
2072 int nr, i; 2099 int nr, i;
2073 int fully_mapped = 1; 2100 int fully_mapped = 1;
2074 2101
2075 BUG_ON(!PageLocked(page)); 2102 head = create_page_buffers(page, inode, 0);
2076 blocksize = 1 << inode->i_blkbits; 2103 blocksize = head->b_size;
2077 if (!page_has_buffers(page)) 2104 bbits = block_size_bits(blocksize);
2078 create_empty_buffers(page, blocksize, 0);
2079 head = page_buffers(page);
2080 2105
2081 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2106 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2082 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; 2107 lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2083 bh = head; 2108 bh = head;
2084 nr = 0; 2109 nr = 0;
2085 i = 0; 2110 i = 0;
@@ -2864,6 +2889,55 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2864 bio_put(bio); 2889 bio_put(bio);
2865} 2890}
2866 2891
2892/*
2893 * This allows us to do IO even on the odd last sectors
2894 * of a device, even if the bh block size is some multiple
2895 * of the physical sector size.
2896 *
2897 * We'll just truncate the bio to the size of the device,
2898 * and clear the end of the buffer head manually.
2899 *
2900 * Truly out-of-range accesses will turn into actual IO
2901 * errors; this only handles the "we need to be able to
2902 * do IO at the final sector" case.
2903 */
2904static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2905{
2906 sector_t maxsector;
2907 unsigned bytes;
2908
2909 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2910 if (!maxsector)
2911 return;
2912
2913 /*
2914 * If the *whole* IO is past the end of the device,
2915 * let it through, and the IO layer will turn it into
2916 * an EIO.
2917 */
2918 if (unlikely(bio->bi_sector >= maxsector))
2919 return;
2920
2921 maxsector -= bio->bi_sector;
2922 bytes = bio->bi_size;
2923 if (likely((bytes >> 9) <= maxsector))
2924 return;
2925
2926 /* Uhhuh. We've got a bh that straddles the device size! */
2927 bytes = maxsector << 9;
2928
2929 /* Truncate the bio.. */
2930 bio->bi_size = bytes;
2931 bio->bi_io_vec[0].bv_len = bytes;
2932
2933 /* ..and clear the end of the buffer for reads */
2934 if ((rw & RW_MASK) == READ) {
2935 void *kaddr = kmap_atomic(bh->b_page);
2936 memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
2937 kunmap_atomic(kaddr);
2938 }
2939}
2940
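
A worked case for guard_bh_eod(), with assumed numbers: a 4097-sector device and a 1KiB buffer_head starting at sector 4096. The IO is not entirely past the end (4096 < 4097), but bytes >> 9 = 2 exceeds the 1 remaining sector, so the bio is truncated to 512 bytes and, for a read, the second half of the buffer is zeroed by hand:

/* Numbers for the case above (all assumed): 4097-sector device,
 * 1KiB buffer_head starting at the second-to-last sector. */
static unsigned demo_guard(void)
{
	sector_t maxsector = 4097;	/* i_size_read(...) >> 9 */
	sector_t start = 4096;		/* bio->bi_sector */
	unsigned bytes = 1024;		/* bio->bi_size: one 1KiB bh */

	if ((bytes >> 9) > maxsector - start)
		bytes = (maxsector - start) << 9;	/* truncated to 512 */
	return bytes;
}
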
2867int submit_bh(int rw, struct buffer_head * bh) 2941int submit_bh(int rw, struct buffer_head * bh)
2868{ 2942{
2869 struct bio *bio; 2943 struct bio *bio;
@@ -2900,6 +2974,9 @@ int submit_bh(int rw, struct buffer_head * bh)
2900 bio->bi_end_io = end_bio_bh_io_sync; 2974 bio->bi_end_io = end_bio_bh_io_sync;
2901 bio->bi_private = bh; 2975 bio->bi_private = bh;
2902 2976
2977 /* Take care of bh's that straddle the end of the device */
2978 guard_bh_eod(rw, bio, bh);
2979
2903 bio_get(bio); 2980 bio_get(bio);
2904 submit_bio(rw, bio); 2981 submit_bio(rw, bio);
2905 2982
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e5b77319c97b..8c1aabe93b67 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -454,7 +454,7 @@ static void reset_readdir(struct ceph_file_info *fi)
454 fi->flags &= ~CEPH_F_ATEND; 454 fi->flags &= ~CEPH_F_ATEND;
455} 455}
456 456
457static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) 457static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
458{ 458{
459 struct ceph_file_info *fi = file->private_data; 459 struct ceph_file_info *fi = file->private_data;
460 struct inode *inode = file->f_mapping->host; 460 struct inode *inode = file->f_mapping->host;
@@ -463,7 +463,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
463 463
464 mutex_lock(&inode->i_mutex); 464 mutex_lock(&inode->i_mutex);
465 retval = -EINVAL; 465 retval = -EINVAL;
466 switch (origin) { 466 switch (whence) {
467 case SEEK_END: 467 case SEEK_END:
468 offset += inode->i_size + 2; /* FIXME */ 468 offset += inode->i_size + 2; /* FIXME */
469 break; 469 break;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9349bb37a2fe..ca3ab3f9ca70 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,13 +56,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
56 struct ceph_nfs_confh *cfh = (void *)rawfh; 56 struct ceph_nfs_confh *cfh = (void *)rawfh;
57 int connected_handle_length = sizeof(*cfh)/4; 57 int connected_handle_length = sizeof(*cfh)/4;
58 int handle_length = sizeof(*fh)/4; 58 int handle_length = sizeof(*fh)/4;
59 struct dentry *dentry = d_find_alias(inode); 59 struct dentry *dentry;
60 struct dentry *parent; 60 struct dentry *parent;
61 61
62 /* don't re-export snaps */ 62 /* don't re-export snaps */
63 if (ceph_snap(inode) != CEPH_NOSNAP) 63 if (ceph_snap(inode) != CEPH_NOSNAP)
64 return -EINVAL; 64 return -EINVAL;
65 65
66 dentry = d_find_alias(inode);
67
66 /* if we found an alias, generate a connectable fh */ 68 /* if we found an alias, generate a connectable fh */
67 if (*max_len >= connected_handle_length && dentry) { 69 if (*max_len >= connected_handle_length && dentry) {
68 dout("encode_fh %p connectable\n", dentry); 70 dout("encode_fh %p connectable\n", dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5840d2aaed15..d4dfdcf76d7f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -797,7 +797,7 @@ out:
797/* 797/*
798 * llseek. be sure to verify file size on SEEK_END. 798 * llseek. be sure to verify file size on SEEK_END.
799 */ 799 */
800static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) 800static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
801{ 801{
802 struct inode *inode = file->f_mapping->host; 802 struct inode *inode = file->f_mapping->host;
803 int ret; 803 int ret;
@@ -805,7 +805,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
805 mutex_lock(&inode->i_mutex); 805 mutex_lock(&inode->i_mutex);
806 __ceph_do_pending_vmtruncate(inode); 806 __ceph_do_pending_vmtruncate(inode);
807 807
808 if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { 808 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
809 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 809 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
810 if (ret < 0) { 810 if (ret < 0) {
811 offset = ret; 811 offset = ret;
@@ -813,7 +813,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
813 } 813 }
814 } 814 }
815 815
816 switch (origin) { 816 switch (whence) {
817 case SEEK_END: 817 case SEEK_END:
818 offset += inode->i_size; 818 offset += inode->i_size;
819 break; 819 break;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2075ddfffa73..21ff76c22a17 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -122,9 +122,17 @@ config CIFS_ACL
122 Allows fetching CIFS/NTFS ACL from the server. The DACL blob 122 Allows fetching CIFS/NTFS ACL from the server. The DACL blob
123 is handed over to the application/caller. 123 is handed over to the application/caller.
124 124
125config CIFS_DEBUG
126 bool "Enable CIFS debugging routines"
127 default y
128 depends on CIFS
129 help
130 Enabling this option adds helpful debugging messages to
131 the cifs code which increases the size of the cifs module.
132 If unsure, say Y.
125config CIFS_DEBUG2 133config CIFS_DEBUG2
126 bool "Enable additional CIFS debugging routines" 134 bool "Enable additional CIFS debugging routines"
127 depends on CIFS 135 depends on CIFS_DEBUG
128 help 136 help
129 Enabling this option adds a few more debugging routines 137 Enabling this option adds a few more debugging routines
130 to the cifs code which slightly increases the size of 138 to the cifs code which slightly increases the size of
diff --git a/fs/cifs/README b/fs/cifs/README
index 22ab7b5b8da7..2d5622f60e11 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -480,7 +480,7 @@ A partial list of the supported mount options follows:
480 Unicode on the wire. 480 Unicode on the wire.
481 nomapchars Do not translate any of these seven characters (default). 481 nomapchars Do not translate any of these seven characters (default).
482 nocase Request case insensitive path name matching (case 482 nocase Request case insensitive path name matching (case
483 sensitive is the default if the server suports it). 483 sensitive is the default if the server supports it).
484 (mount option "ignorecase" is identical to "nocase") 484 (mount option "ignorecase" is identical to "nocase")
485 posixpaths If CIFS Unix extensions are supported, attempt to 485 posixpaths If CIFS Unix extensions are supported, attempt to
486 negotiate posix path name support which allows certain 486 negotiate posix path name support which allows certain
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index c0c68bb492d7..86e92ef2abc1 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -18,7 +18,6 @@
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * 19 *
20*/ 20*/
21#define CIFS_DEBUG /* BB temporary */
22 21
23#ifndef _H_CIFS_DEBUG 22#ifndef _H_CIFS_DEBUG
24#define _H_CIFS_DEBUG 23#define _H_CIFS_DEBUG
@@ -37,49 +36,43 @@ void dump_smb(void *, int);
37#define CIFS_RC 0x02 36#define CIFS_RC 0x02
38#define CIFS_TIMER 0x04 37#define CIFS_TIMER 0x04
39 38
39extern int cifsFYI;
40extern int cifsERROR;
41
40/* 42/*
41 * debug ON 43 * debug ON
42 * -------- 44 * --------
43 */ 45 */
44#ifdef CIFS_DEBUG 46#ifdef CONFIG_CIFS_DEBUG
45 47
46/* information message: e.g., configuration, major event */ 48/* information message: e.g., configuration, major event */
47extern int cifsFYI; 49#define cifsfyi(fmt, ...) \
48#define cifsfyi(fmt, arg...) \
49do { \ 50do { \
50 if (cifsFYI & CIFS_INFO) \ 51 if (cifsFYI & CIFS_INFO) \
51 printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \ 52 printk(KERN_DEBUG "%s: " fmt "\n", \
53 __FILE__, ##__VA_ARGS__); \
52} while (0) 54} while (0)
53 55
54#define cFYI(set, fmt, arg...) \ 56#define cFYI(set, fmt, ...) \
55do { \ 57do { \
56 if (set) \ 58 if (set) \
57 cifsfyi(fmt, ##arg); \ 59 cifsfyi(fmt, ##__VA_ARGS__); \
58} while (0) 60} while (0)
59 61
60#define cifswarn(fmt, arg...) \ 62#define cifswarn(fmt, ...) \
61 printk(KERN_WARNING fmt "\n", ##arg) 63 printk(KERN_WARNING fmt "\n", ##__VA_ARGS__)
62 64
63/* debug event message: */ 65/* error event message: e.g., i/o error */
64extern int cifsERROR; 66#define cifserror(fmt, ...) \
65
66#define cEVENT(fmt, arg...) \
67do { \ 67do { \
68 if (cifsERROR) \ 68 if (cifsERROR) \
69 printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \ 69 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
70} while (0)
71
72/* error event message: e.g., i/o error */
73#define cifserror(fmt, arg...) \
74do { \
75 if (cifsERROR) \
76 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
77} while (0) 70} while (0)
78 71
79#define cERROR(set, fmt, arg...) \ 72#define cERROR(set, fmt, ...) \
80do { \ 73do { \
81 if (set) \ 74 if (set) \
82 cifserror(fmt, ##arg); \ 75 cifserror(fmt, ##__VA_ARGS__); \
83} while (0) 76} while (0)
84 77
85/* 78/*
@@ -87,10 +80,27 @@ do { \
87 * --------- 80 * ---------
88 */ 81 */
89#else /* _CIFS_DEBUG */ 82#else /* _CIFS_DEBUG */
90#define cERROR(set, fmt, arg...) 83#define cifsfyi(fmt, ...) \
91#define cEVENT(fmt, arg...) 84do { \
92#define cFYI(set, fmt, arg...) 85 if (0) \
93#define cifserror(fmt, arg...) 86 printk(KERN_DEBUG "%s: " fmt "\n", \
87 __FILE__, ##__VA_ARGS__); \
88} while (0)
89#define cFYI(set, fmt, ...) \
90do { \
91 if (0 && set) \
92 cifsfyi(fmt, ##__VA_ARGS__); \
93} while (0)
94#define cifserror(fmt, ...) \
95do { \
96 if (0) \
97 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
98} while (0)
99#define cERROR(set, fmt, ...) \
100do { \
101 if (0 && set) \
102 cifserror(fmt, ##__VA_ARGS__); \
103} while (0)
94#endif /* _CIFS_DEBUG */ 104#endif /* _CIFS_DEBUG */
95 105
96#endif /* _H_CIFS_DEBUG */ 106#endif /* _H_CIFS_DEBUG */
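
With CONFIG_CIFS_DEBUG disabled, the macros above now expand to if (0) bodies instead of nothing, so format strings and arguments still pass through the compiler's printf checking and cannot bit-rot, while the optimizer removes the calls. The same trick in a self-contained user-space sketch (demo names):

#include <stdio.h>

/* Compile-out trick in miniature: format/argument checking survives
 * even when the message is compiled away. */
#ifdef DEMO_DEBUG
#define demo_dbg(fmt, ...) printf("demo: " fmt "\n", ##__VA_ARGS__)
#else
#define demo_dbg(fmt, ...)					\
do {								\
	if (0)							\
		printf("demo: " fmt "\n", ##__VA_ARGS__);	\
} while (0)
#endif

int main(void)
{
	demo_dbg("mounted %s as tree id %d", "//srv/share", 5);
	return 0;
}
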
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index fc783e264420..5cbd00e74067 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -42,135 +42,27 @@ static const struct cifs_sid sid_authusers = {
42/* group users */ 42/* group users */
43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
44 44
45const struct cred *root_cred; 45static const struct cred *root_cred;
46
47static void
48shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
49 int *nr_del)
50{
51 struct rb_node *node;
52 struct rb_node *tmp;
53 struct cifs_sid_id *psidid;
54
55 node = rb_first(root);
56 while (node) {
57 tmp = node;
58 node = rb_next(tmp);
59 psidid = rb_entry(tmp, struct cifs_sid_id, rbnode);
60 if (nr_to_scan == 0 || *nr_del == nr_to_scan)
61 ++(*nr_rem);
62 else {
63 if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE)
64 && psidid->refcount == 0) {
65 rb_erase(tmp, root);
66 ++(*nr_del);
67 } else
68 ++(*nr_rem);
69 }
70 }
71}
72
73/*
74 * Run idmap cache shrinker.
75 */
76static int
77cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
78{
79 int nr_to_scan = sc->nr_to_scan;
80 int nr_del = 0;
81 int nr_rem = 0;
82 struct rb_root *root;
83
84 root = &uidtree;
85 spin_lock(&siduidlock);
86 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
87 spin_unlock(&siduidlock);
88
89 root = &gidtree;
90 spin_lock(&sidgidlock);
91 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
92 spin_unlock(&sidgidlock);
93
94 root = &siduidtree;
95 spin_lock(&uidsidlock);
96 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
97 spin_unlock(&uidsidlock);
98
99 root = &sidgidtree;
100 spin_lock(&gidsidlock);
101 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
102 spin_unlock(&gidsidlock);
103
104 return nr_rem;
105}
106
107static void
108sid_rb_insert(struct rb_root *root, unsigned long cid,
109 struct cifs_sid_id **psidid, char *typestr)
110{
111 char *strptr;
112 struct rb_node *node = root->rb_node;
113 struct rb_node *parent = NULL;
114 struct rb_node **linkto = &(root->rb_node);
115 struct cifs_sid_id *lsidid;
116
117 while (node) {
118 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
119 parent = node;
120 if (cid > lsidid->id) {
121 linkto = &(node->rb_left);
122 node = node->rb_left;
123 }
124 if (cid < lsidid->id) {
125 linkto = &(node->rb_right);
126 node = node->rb_right;
127 }
128 }
129
130 (*psidid)->id = cid;
131 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
132 (*psidid)->refcount = 0;
133
134 sprintf((*psidid)->sidstr, "%s", typestr);
135 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
136 sprintf(strptr, "%ld", cid);
137
138 clear_bit(SID_ID_PENDING, &(*psidid)->state);
139 clear_bit(SID_ID_MAPPED, &(*psidid)->state);
140
141 rb_link_node(&(*psidid)->rbnode, parent, linkto);
142 rb_insert_color(&(*psidid)->rbnode, root);
143}
144
145static struct cifs_sid_id *
146sid_rb_search(struct rb_root *root, unsigned long cid)
147{
148 struct rb_node *node = root->rb_node;
149 struct cifs_sid_id *lsidid;
150
151 while (node) {
152 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
153 if (cid > lsidid->id)
154 node = node->rb_left;
155 else if (cid < lsidid->id)
156 node = node->rb_right;
157 else /* node found */
158 return lsidid;
159 }
160
161 return NULL;
162}
163
164static struct shrinker cifs_shrinker = {
165 .shrink = cifs_idmap_shrinker,
166 .seeks = DEFAULT_SEEKS,
167};
168 46
169static int 47static int
170cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) 48cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
171{ 49{
172 char *payload; 50 char *payload;
173 51
52 /*
53 * If the payload is less than or equal to the size of a pointer, then
54 * an allocation here is wasteful. Just copy the data directly to the
55 * payload.value union member instead.
56 *
57 * With this however, you must check the datalen before trying to
58 * dereference payload.data!
59 */
60 if (prep->datalen <= sizeof(key->payload)) {
61 key->payload.value = 0;
62 memcpy(&key->payload.value, prep->data, prep->datalen);
63 key->datalen = prep->datalen;
64 return 0;
65 }
174 payload = kmalloc(prep->datalen, GFP_KERNEL); 66 payload = kmalloc(prep->datalen, GFP_KERNEL);
175 if (!payload) 67 if (!payload)
176 return -ENOMEM; 68 return -ENOMEM;
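
cifs_idmap_key_instantiate() now stores payloads no larger than a pointer directly in the payload.value union member, avoiding a kmalloc for the common uid/gid-sized case; the destroy hook must then check datalen before freeing. The small-object embedding pattern in miniature, with demo types rather than the real key API:

#include <linux/slab.h>
#include <linux/string.h>

/* Demo types only; the real struct key payload differs. */
struct demo_payload {
	union {
		void *data;		/* heap copy for big payloads */
		unsigned long value;	/* inline copy for small ones */
	};
	unsigned short datalen;
};

static int demo_store(struct demo_payload *p, const void *src, size_t len)
{
	if (len <= sizeof(p->value)) {		/* fits inline in the union */
		p->value = 0;
		memcpy(&p->value, src, len);
	} else {
		p->data = kmemdup(src, len, GFP_KERNEL);
		if (!p->data)
			return -ENOMEM;
	}
	p->datalen = len;
	return 0;
}

static void demo_release(struct demo_payload *p)
{
	if (p->datalen > sizeof(p->value))	/* only big payloads were allocated */
		kfree(p->data);
}
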
@@ -184,10 +76,11 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
184static inline void 76static inline void
185cifs_idmap_key_destroy(struct key *key) 77cifs_idmap_key_destroy(struct key *key)
186{ 78{
187 kfree(key->payload.data); 79 if (key->datalen > sizeof(key->payload))
80 kfree(key->payload.data);
188} 81}
189 82
190struct key_type cifs_idmap_key_type = { 83static struct key_type cifs_idmap_key_type = {
191 .name = "cifs.idmap", 84 .name = "cifs.idmap",
192 .instantiate = cifs_idmap_key_instantiate, 85 .instantiate = cifs_idmap_key_instantiate,
193 .destroy = cifs_idmap_key_destroy, 86 .destroy = cifs_idmap_key_destroy,
@@ -195,214 +88,174 @@ struct key_type cifs_idmap_key_type = {
195 .match = user_match, 88 .match = user_match,
196}; 89};
197 90
198static void 91static char *
199sid_to_str(struct cifs_sid *sidptr, char *sidstr) 92sid_to_key_str(struct cifs_sid *sidptr, unsigned int type)
200{ 93{
201 int i; 94 int i, len;
202 unsigned long saval; 95 unsigned int saval;
203 char *strptr; 96 char *sidstr, *strptr;
97 unsigned long long id_auth_val;
98
99 /* 3 bytes for prefix */
100 sidstr = kmalloc(3 + SID_STRING_BASE_SIZE +
101 (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth),
102 GFP_KERNEL);
103 if (!sidstr)
104 return sidstr;
204 105
205 strptr = sidstr; 106 strptr = sidstr;
107 len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g',
108 sidptr->revision);
109 strptr += len;
110
111 /* The authority field is a single 48-bit number */
112 id_auth_val = (unsigned long long)sidptr->authority[5];
113 id_auth_val |= (unsigned long long)sidptr->authority[4] << 8;
114 id_auth_val |= (unsigned long long)sidptr->authority[3] << 16;
115 id_auth_val |= (unsigned long long)sidptr->authority[2] << 24;
116 id_auth_val |= (unsigned long long)sidptr->authority[1] << 32;
117 id_auth_val |= (unsigned long long)sidptr->authority[0] << 48;
206 118
207 sprintf(strptr, "%s", "S"); 119 /*
208 strptr = sidstr + strlen(sidstr); 120 * MS-DTYP states that if the authority is >= 2^32, then it should be
209 121 * expressed as a hex value.
210 sprintf(strptr, "-%d", sidptr->revision); 122 */
211 strptr = sidstr + strlen(sidstr); 123 if (id_auth_val <= UINT_MAX)
124 len = sprintf(strptr, "-%llu", id_auth_val);
125 else
126 len = sprintf(strptr, "-0x%llx", id_auth_val);
212 127
213 for (i = 0; i < 6; ++i) { 128 strptr += len;
214 if (sidptr->authority[i]) {
215 sprintf(strptr, "-%d", sidptr->authority[i]);
216 strptr = sidstr + strlen(sidstr);
217 }
218 }
219 129
220 for (i = 0; i < sidptr->num_subauth; ++i) { 130 for (i = 0; i < sidptr->num_subauth; ++i) {
221 saval = le32_to_cpu(sidptr->sub_auth[i]); 131 saval = le32_to_cpu(sidptr->sub_auth[i]);
222 sprintf(strptr, "-%ld", saval); 132 len = sprintf(strptr, "-%u", saval);
223 strptr = sidstr + strlen(sidstr); 133 strptr += len;
224 } 134 }
135
136 return sidstr;
225} 137}
226 138
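
sid_to_key_str() produces the key description used for the upcall, for example "os:S-1-5-32-544" for an owner SID or "gs:S-1-5-21-1-2-3-513" for a group SID (example SIDs, not from the patch). The 48-bit authority is assembled from six big-endian bytes and printed in hex only when it exceeds UINT_MAX, per MS-DTYP. A loop equivalent to the unrolled shifts above:

/* Assembling the 48-bit id_auth value from its six big-endian bytes,
 * equivalent to the unrolled shifts above (bytes are illustrative). */
static unsigned long long demo_id_auth(const unsigned char authority[6])
{
	unsigned long long val = 0;
	int i;

	for (i = 0; i < 6; ++i)
		val = (val << 8) | authority[i];
	return val;	/* {0,0,0,0,0,5} -> 5, printed as "-5" */
}
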
227static void 139/*
228id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, 140 * if the two SIDs (roughly equivalent to a UUID for a user or group) are
229 struct cifs_sid_id **psidid, char *typestr) 141 * the same returns zero, if they do not match returns non-zero.
142 */
143static int
144compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
230{ 145{
231 int rc; 146 int i;
232 char *strptr; 147 int num_subauth, num_sat, num_saw;
233 struct rb_node *node = root->rb_node;
234 struct rb_node *parent = NULL;
235 struct rb_node **linkto = &(root->rb_node);
236 struct cifs_sid_id *lsidid;
237
238 while (node) {
239 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
240 parent = node;
241 rc = compare_sids(sidptr, &((lsidid)->sid));
242 if (rc > 0) {
243 linkto = &(node->rb_left);
244 node = node->rb_left;
245 } else if (rc < 0) {
246 linkto = &(node->rb_right);
247 node = node->rb_right;
248 }
249 }
250
251 memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid));
252 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
253 (*psidid)->refcount = 0;
254 148
255 sprintf((*psidid)->sidstr, "%s", typestr); 149 if ((!ctsid) || (!cwsid))
256 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); 150 return 1;
257 sid_to_str(&(*psidid)->sid, strptr);
258 151
259 clear_bit(SID_ID_PENDING, &(*psidid)->state); 152 /* compare the revision */
260 clear_bit(SID_ID_MAPPED, &(*psidid)->state); 153 if (ctsid->revision != cwsid->revision) {
154 if (ctsid->revision > cwsid->revision)
155 return 1;
156 else
157 return -1;
158 }
261 159
262 rb_link_node(&(*psidid)->rbnode, parent, linkto); 160 /* compare all of the six auth values */
263 rb_insert_color(&(*psidid)->rbnode, root); 161 for (i = 0; i < NUM_AUTHS; ++i) {
264} 162 if (ctsid->authority[i] != cwsid->authority[i]) {
163 if (ctsid->authority[i] > cwsid->authority[i])
164 return 1;
165 else
166 return -1;
167 }
168 }
265 169
266static struct cifs_sid_id * 170 /* compare all of the subauth values if any */
267id_rb_search(struct rb_root *root, struct cifs_sid *sidptr) 171 num_sat = ctsid->num_subauth;
268{ 172 num_saw = cwsid->num_subauth;
269 int rc; 173 num_subauth = num_sat < num_saw ? num_sat : num_saw;
270 struct rb_node *node = root->rb_node; 174 if (num_subauth) {
271 struct cifs_sid_id *lsidid; 175 for (i = 0; i < num_subauth; ++i) {
272 176 if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
273 while (node) { 177 if (le32_to_cpu(ctsid->sub_auth[i]) >
274 lsidid = rb_entry(node, struct cifs_sid_id, rbnode); 178 le32_to_cpu(cwsid->sub_auth[i]))
275 rc = compare_sids(sidptr, &((lsidid)->sid)); 179 return 1;
276 if (rc > 0) { 180 else
277 node = node->rb_left; 181 return -1;
278 } else if (rc < 0) { 182 }
279 node = node->rb_right; 183 }
280 } else /* node found */
281 return lsidid;
282 } 184 }
283 185
284 return NULL; 186 return 0; /* sids compare/match */
285} 187}
286 188
287static int 189static void
288sidid_pending_wait(void *unused) 190cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
289{ 191{
290 schedule(); 192 int i;
291 return signal_pending(current) ? -ERESTARTSYS : 0; 193
194 dst->revision = src->revision;
195 dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES);
196 for (i = 0; i < NUM_AUTHS; ++i)
197 dst->authority[i] = src->authority[i];
198 for (i = 0; i < dst->num_subauth; ++i)
199 dst->sub_auth[i] = src->sub_auth[i];
292} 200}
293 201
294static int 202static int
295id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) 203id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
296{ 204{
297 int rc = 0; 205 int rc;
298 struct key *sidkey; 206 struct key *sidkey;
207 struct cifs_sid *ksid;
208 unsigned int ksid_size;
209 char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */
299 const struct cred *saved_cred; 210 const struct cred *saved_cred;
300 struct cifs_sid *lsid;
301 struct cifs_sid_id *psidid, *npsidid;
302 struct rb_root *cidtree;
303 spinlock_t *cidlock;
304
305 if (sidtype == SIDOWNER) {
306 cidlock = &siduidlock;
307 cidtree = &uidtree;
308 } else if (sidtype == SIDGROUP) {
309 cidlock = &sidgidlock;
310 cidtree = &gidtree;
311 } else
312 return -EINVAL;
313
314 spin_lock(cidlock);
315 psidid = sid_rb_search(cidtree, cid);
316
317 if (!psidid) { /* node does not exist, allocate one & attempt adding */
318 spin_unlock(cidlock);
319 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
320 if (!npsidid)
321 return -ENOMEM;
322 211
323 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); 212 rc = snprintf(desc, sizeof(desc), "%ci:%u",
324 if (!npsidid->sidstr) { 213 sidtype == SIDOWNER ? 'o' : 'g', cid);
325 kfree(npsidid); 214 if (rc >= sizeof(desc))
326 return -ENOMEM; 215 return -EINVAL;
327 }
328 216
329 spin_lock(cidlock); 217 rc = 0;
330 psidid = sid_rb_search(cidtree, cid); 218 saved_cred = override_creds(root_cred);
331 if (psidid) { /* node happened to get inserted meanwhile */ 219 sidkey = request_key(&cifs_idmap_key_type, desc, "");
332 ++psidid->refcount; 220 if (IS_ERR(sidkey)) {
333 spin_unlock(cidlock); 221 rc = -EINVAL;
334 kfree(npsidid->sidstr); 222 cFYI(1, "%s: Can't map %cid %u to a SID", __func__,
335 kfree(npsidid); 223 sidtype == SIDOWNER ? 'u' : 'g', cid);
336 } else { 224 goto out_revert_creds;
337 psidid = npsidid; 225 } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) {
338 sid_rb_insert(cidtree, cid, &psidid, 226 rc = -EIO;
339 sidtype == SIDOWNER ? "oi:" : "gi:"); 227 cFYI(1, "%s: Downcall contained malformed key "
340 ++psidid->refcount; 228 "(datalen=%hu)", __func__, sidkey->datalen);
341 spin_unlock(cidlock); 229 goto invalidate_key;
342 }
343 } else {
344 ++psidid->refcount;
345 spin_unlock(cidlock);
346 } 230 }
347 231
348 /* 232 /*
349 * If we are here, it is safe to access psidid and its fields 233 * A sid is usually too large to be embedded in payload.value, but if
350 * since a reference was taken earlier while holding the spinlock. 234 * there are no subauthorities and the host has 8-byte pointers, then
351 * A reference on the node is put without holding the spinlock 235 * it could be.
352 * and it is OK to do so in this case, shrinker will not erase
353 * this node until all references are put and we do not access
354 * any fields of the node after a reference is put .
355 */ 236 */
356 if (test_bit(SID_ID_MAPPED, &psidid->state)) { 237 ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
357 memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); 238 (struct cifs_sid *)&sidkey->payload.value :
358 psidid->time = jiffies; /* update ts for accessing */ 239 (struct cifs_sid *)sidkey->payload.data;
359 goto id_sid_out; 240
360 } 241 ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
361 242 if (ksid_size > sidkey->datalen) {
362 if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { 243 rc = -EIO;
363 rc = -EINVAL; 244 cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, "
364 goto id_sid_out; 245 "ksid_size=%u)", __func__, sidkey->datalen, ksid_size);
246 goto invalidate_key;
365 } 247 }
366 248
367 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { 249 cifs_copy_sid(ssid, ksid);
368 saved_cred = override_creds(root_cred); 250out_key_put:
369 sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); 251 key_put(sidkey);
370 if (IS_ERR(sidkey)) { 252out_revert_creds:
371 rc = -EINVAL; 253 revert_creds(saved_cred);
372 cFYI(1, "%s: Can't map and id to a SID", __func__);
373 } else {
374 lsid = (struct cifs_sid *)sidkey->payload.data;
375 memcpy(&psidid->sid, lsid,
376 sidkey->datalen < sizeof(struct cifs_sid) ?
377 sidkey->datalen : sizeof(struct cifs_sid));
378 memcpy(ssid, &psidid->sid,
379 sidkey->datalen < sizeof(struct cifs_sid) ?
380 sidkey->datalen : sizeof(struct cifs_sid));
381 set_bit(SID_ID_MAPPED, &psidid->state);
382 key_put(sidkey);
383 kfree(psidid->sidstr);
384 }
385 psidid->time = jiffies; /* update ts for accessing */
386 revert_creds(saved_cred);
387 clear_bit(SID_ID_PENDING, &psidid->state);
388 wake_up_bit(&psidid->state, SID_ID_PENDING);
389 } else {
390 rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
391 sidid_pending_wait, TASK_INTERRUPTIBLE);
392 if (rc) {
393 cFYI(1, "%s: sidid_pending_wait interrupted %d",
394 __func__, rc);
395 --psidid->refcount;
396 return rc;
397 }
398 if (test_bit(SID_ID_MAPPED, &psidid->state))
399 memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
400 else
401 rc = -EINVAL;
402 }
403id_sid_out:
404 --psidid->refcount;
405 return rc; 254 return rc;
255
256invalidate_key:
257 key_invalidate(sidkey);
258 goto out_key_put;
406} 259}
407 260
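
id_to_sid() above now performs a single request_key() against a fixed-size description such as "oi:1000", leaving caching and concurrency to the keys subsystem instead of the removed rbtree cache. A sketch of building that description the way the function does (demo helper, uid 1000 assumed in the comment):

/* Building the upcall description as id_to_sid() does (demo helper). */
static int demo_idmap_desc(char *desc, size_t len, int owner, unsigned int cid)
{
	int n = snprintf(desc, len, "%ci:%u", owner ? 'o' : 'g', cid);

	/* e.g. owner, cid 1000 -> "oi:1000" */
	return (n < 0 || (size_t)n >= len) ? -EINVAL : 0;
}
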
408static int 261static int
@@ -410,111 +263,67 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
410 struct cifs_fattr *fattr, uint sidtype) 263 struct cifs_fattr *fattr, uint sidtype)
411{ 264{
412 int rc; 265 int rc;
413 unsigned long cid; 266 struct key *sidkey;
414 struct key *idkey; 267 char *sidstr;
415 const struct cred *saved_cred; 268 const struct cred *saved_cred;
416 struct cifs_sid_id *psidid, *npsidid; 269 uid_t fuid = cifs_sb->mnt_uid;
417 struct rb_root *cidtree; 270 gid_t fgid = cifs_sb->mnt_gid;
418 spinlock_t *cidlock;
419
420 if (sidtype == SIDOWNER) {
421 cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */
422 cidlock = &siduidlock;
423 cidtree = &uidtree;
424 } else if (sidtype == SIDGROUP) {
425 cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */
426 cidlock = &sidgidlock;
427 cidtree = &gidtree;
428 } else
429 return -ENOENT;
430
431 spin_lock(cidlock);
432 psidid = id_rb_search(cidtree, psid);
433
434 if (!psidid) { /* node does not exist, allocate one & attempt adding */
435 spin_unlock(cidlock);
436 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
437 if (!npsidid)
438 return -ENOMEM;
439
440 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
441 if (!npsidid->sidstr) {
442 kfree(npsidid);
443 return -ENOMEM;
444 }
445
446 spin_lock(cidlock);
447 psidid = id_rb_search(cidtree, psid);
448 if (psidid) { /* node happened to get inserted meanwhile */
449 ++psidid->refcount;
450 spin_unlock(cidlock);
451 kfree(npsidid->sidstr);
452 kfree(npsidid);
453 } else {
454 psidid = npsidid;
455 id_rb_insert(cidtree, psid, &psidid,
456 sidtype == SIDOWNER ? "os:" : "gs:");
457 ++psidid->refcount;
458 spin_unlock(cidlock);
459 }
460 } else {
461 ++psidid->refcount;
462 spin_unlock(cidlock);
463 }
464 271
465 /* 272 /*
466 * If we are here, it is safe to access psidid and its fields 273 * If we have too many subauthorities, then something is really wrong.
467 * since a reference was taken earlier while holding the spinlock. 274 * Just return an error.
468 * A reference on the node is put without holding the spinlock
469 * and it is OK to do so in this case, shrinker will not erase
470 * this node until all references are put and we do not access
471 * any fields of the node after a reference is put .
472 */ 275 */
473 if (test_bit(SID_ID_MAPPED, &psidid->state)) { 276 if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) {
474 cid = psidid->id; 277 cFYI(1, "%s: %u subauthorities is too many!", __func__,
475 psidid->time = jiffies; /* update ts for accessing */ 278 psid->num_subauth);
476 goto sid_to_id_out; 279 return -EIO;
477 } 280 }
478 281
479 if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) 282 sidstr = sid_to_key_str(psid, sidtype);
480 goto sid_to_id_out; 283 if (!sidstr)
481 284 return -ENOMEM;
482 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { 285
483 saved_cred = override_creds(root_cred); 286 saved_cred = override_creds(root_cred);
484 idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); 287 sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
485 if (IS_ERR(idkey)) 288 if (IS_ERR(sidkey)) {
486 cFYI(1, "%s: Can't map SID to an id", __func__); 289 rc = -EINVAL;
487 else { 290 cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr,
488 cid = *(unsigned long *)idkey->payload.value; 291 sidtype == SIDOWNER ? 'u' : 'g');
489 psidid->id = cid; 292 goto out_revert_creds;
490 set_bit(SID_ID_MAPPED, &psidid->state); 293 }
491 key_put(idkey); 294
492 kfree(psidid->sidstr); 295 /*
493 } 296 * FIXME: Here we assume that uid_t and gid_t are same size. It's
494 revert_creds(saved_cred); 297 * probably a safe assumption but might be better to check based on
495 psidid->time = jiffies; /* update ts for accessing */ 298 * sidtype.
496 clear_bit(SID_ID_PENDING, &psidid->state); 299 */
497 wake_up_bit(&psidid->state, SID_ID_PENDING); 300 if (sidkey->datalen != sizeof(uid_t)) {
498 } else { 301 rc = -EIO;
499 rc = wait_on_bit(&psidid->state, SID_ID_PENDING, 302 cFYI(1, "%s: Downcall contained malformed key "
500 sidid_pending_wait, TASK_INTERRUPTIBLE); 303 "(datalen=%hu)", __func__, sidkey->datalen);
501 if (rc) { 304 key_invalidate(sidkey);
502 cFYI(1, "%s: sidid_pending_wait interrupted %d", 305 goto out_key_put;
503 __func__, rc);
504 --psidid->refcount; /* decremented without spinlock */
505 return rc;
506 }
507 if (test_bit(SID_ID_MAPPED, &psidid->state))
508 cid = psidid->id;
509 } 306 }
510 307
511sid_to_id_out:
512 --psidid->refcount; /* decremented without spinlock */
513 if (sidtype == SIDOWNER) 308 if (sidtype == SIDOWNER)
514 fattr->cf_uid = cid; 309 memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t));
515 else 310 else
516 fattr->cf_gid = cid; 311 memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t));
312
313out_key_put:
314 key_put(sidkey);
315out_revert_creds:
316 revert_creds(saved_cred);
317 kfree(sidstr);
517 318
319 /*
320 * Note that we return 0 here unconditionally. If the mapping
321 * fails then we just fall back to using the mnt_uid/mnt_gid.
322 */
323 if (sidtype == SIDOWNER)
324 fattr->cf_uid = fuid;
325 else
326 fattr->cf_gid = fgid;
518 return 0; 327 return 0;
519} 328}
520 329
@@ -537,19 +346,15 @@ init_cifs_idmap(void)
537 if (!cred) 346 if (!cred)
538 return -ENOMEM; 347 return -ENOMEM;
539 348
540 keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, 349 keyring = keyring_alloc(".cifs_idmap", 0, 0, cred,
541 (KEY_POS_ALL & ~KEY_POS_SETATTR) | 350 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
542 KEY_USR_VIEW | KEY_USR_READ, 351 KEY_USR_VIEW | KEY_USR_READ,
543 KEY_ALLOC_NOT_IN_QUOTA); 352 KEY_ALLOC_NOT_IN_QUOTA, NULL);
544 if (IS_ERR(keyring)) { 353 if (IS_ERR(keyring)) {
545 ret = PTR_ERR(keyring); 354 ret = PTR_ERR(keyring);
546 goto failed_put_cred; 355 goto failed_put_cred;
547 } 356 }
548 357
549 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
550 if (ret < 0)
551 goto failed_put_key;
552
553 ret = register_key_type(&cifs_idmap_key_type); 358 ret = register_key_type(&cifs_idmap_key_type);
554 if (ret < 0) 359 if (ret < 0)
555 goto failed_put_key; 360 goto failed_put_key;
@@ -561,17 +366,6 @@ init_cifs_idmap(void)
561 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 366 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
562 root_cred = cred; 367 root_cred = cred;
563 368
564 spin_lock_init(&siduidlock);
565 uidtree = RB_ROOT;
566 spin_lock_init(&sidgidlock);
567 gidtree = RB_ROOT;
568
569 spin_lock_init(&uidsidlock);
570 siduidtree = RB_ROOT;
571 spin_lock_init(&gidsidlock);
572 sidgidtree = RB_ROOT;
573 register_shrinker(&cifs_shrinker);
574
575 cFYI(1, "cifs idmap keyring: %d", key_serial(keyring)); 369 cFYI(1, "cifs idmap keyring: %d", key_serial(keyring));
576 return 0; 370 return 0;
577 371
@@ -588,95 +382,13 @@ exit_cifs_idmap(void)
588 key_revoke(root_cred->thread_keyring); 382 key_revoke(root_cred->thread_keyring);
589 unregister_key_type(&cifs_idmap_key_type); 383 unregister_key_type(&cifs_idmap_key_type);
590 put_cred(root_cred); 384 put_cred(root_cred);
591 unregister_shrinker(&cifs_shrinker);
592 cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name); 385 cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name);
593} 386}
594 387
595void
596cifs_destroy_idmaptrees(void)
597{
598 struct rb_root *root;
599 struct rb_node *node;
600
601 root = &uidtree;
602 spin_lock(&siduidlock);
603 while ((node = rb_first(root)))
604 rb_erase(node, root);
605 spin_unlock(&siduidlock);
606
607 root = &gidtree;
608 spin_lock(&sidgidlock);
609 while ((node = rb_first(root)))
610 rb_erase(node, root);
611 spin_unlock(&sidgidlock);
612
613 root = &siduidtree;
614 spin_lock(&uidsidlock);
615 while ((node = rb_first(root)))
616 rb_erase(node, root);
617 spin_unlock(&uidsidlock);
618
619 root = &sidgidtree;
620 spin_lock(&gidsidlock);
621 while ((node = rb_first(root)))
622 rb_erase(node, root);
623 spin_unlock(&gidsidlock);
624}
625
626/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
627 the same returns 1, if they do not match returns 0 */
628int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
629{
630 int i;
631 int num_subauth, num_sat, num_saw;
632
633 if ((!ctsid) || (!cwsid))
634 return 1;
635
636 /* compare the revision */
637 if (ctsid->revision != cwsid->revision) {
638 if (ctsid->revision > cwsid->revision)
639 return 1;
640 else
641 return -1;
642 }
643
644 /* compare all of the six auth values */
645 for (i = 0; i < 6; ++i) {
646 if (ctsid->authority[i] != cwsid->authority[i]) {
647 if (ctsid->authority[i] > cwsid->authority[i])
648 return 1;
649 else
650 return -1;
651 }
652 }
653
654 /* compare all of the subauth values if any */
655 num_sat = ctsid->num_subauth;
656 num_saw = cwsid->num_subauth;
657 num_subauth = num_sat < num_saw ? num_sat : num_saw;
658 if (num_subauth) {
659 for (i = 0; i < num_subauth; ++i) {
660 if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
661 if (le32_to_cpu(ctsid->sub_auth[i]) >
662 le32_to_cpu(cwsid->sub_auth[i]))
663 return 1;
664 else
665 return -1;
666 }
667 }
668 }
669
670 return 0; /* sids compare/match */
671}
672
673
674/* copy ntsd, owner sid, and group sid from a security descriptor to another */ 388/* copy ntsd, owner sid, and group sid from a security descriptor to another */
675static void copy_sec_desc(const struct cifs_ntsd *pntsd, 389static void copy_sec_desc(const struct cifs_ntsd *pntsd,
676 struct cifs_ntsd *pnntsd, __u32 sidsoffset) 390 struct cifs_ntsd *pnntsd, __u32 sidsoffset)
677{ 391{
678 int i;
679
680 struct cifs_sid *owner_sid_ptr, *group_sid_ptr; 392 struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
681 struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; 393 struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
682 394
@@ -692,26 +404,14 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd,
692 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + 404 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
693 le32_to_cpu(pntsd->osidoffset)); 405 le32_to_cpu(pntsd->osidoffset));
694 nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); 406 nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
695 407 cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr);
696 nowner_sid_ptr->revision = owner_sid_ptr->revision;
697 nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth;
698 for (i = 0; i < 6; i++)
699 nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i];
700 for (i = 0; i < 5; i++)
701 nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i];
702 408
703 /* copy group sid */ 409 /* copy group sid */
704 group_sid_ptr = (struct cifs_sid *)((char *)pntsd + 410 group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
705 le32_to_cpu(pntsd->gsidoffset)); 411 le32_to_cpu(pntsd->gsidoffset));
706 ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + 412 ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
707 sizeof(struct cifs_sid)); 413 sizeof(struct cifs_sid));
708 414 cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr);
709 ngroup_sid_ptr->revision = group_sid_ptr->revision;
710 ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth;
711 for (i = 0; i < 6; i++)
712 ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
713 for (i = 0; i < 5; i++)
714 ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i];
715 415
716 return; 416 return;
717} 417}
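Editor's note: the two cifs_copy_sid() calls above collapse the old field-by-field loops. A minimal sketch of such a helper, assuming it simply copies the fixed-size structure; the real helper lives elsewhere in cifsacl.c:

static void
cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
{
	memcpy(dst, src, sizeof(*dst));	/* header plus full sub_auth array */
}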
@@ -818,7 +518,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
818 518
819 pntace->sid.revision = psid->revision; 519 pntace->sid.revision = psid->revision;
820 pntace->sid.num_subauth = psid->num_subauth; 520 pntace->sid.num_subauth = psid->num_subauth;
821 for (i = 0; i < 6; i++) 521 for (i = 0; i < NUM_AUTHS; i++)
822 pntace->sid.authority[i] = psid->authority[i]; 522 pntace->sid.authority[i] = psid->authority[i];
823 for (i = 0; i < psid->num_subauth; i++) 523 for (i = 0; i < psid->num_subauth; i++)
824 pntace->sid.sub_auth[i] = psid->sub_auth[i]; 524 pntace->sid.sub_auth[i] = psid->sub_auth[i];
@@ -994,8 +694,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
994 return -EINVAL; 694 return -EINVAL;
995 } 695 }
996 696
997 if (psid->num_subauth) {
998#ifdef CONFIG_CIFS_DEBUG2 697#ifdef CONFIG_CIFS_DEBUG2
698 if (psid->num_subauth) {
999 int i; 699 int i;
1000 cFYI(1, "SID revision %d num_auth %d", 700 cFYI(1, "SID revision %d num_auth %d",
1001 psid->revision, psid->num_subauth); 701 psid->revision, psid->num_subauth);
@@ -1009,8 +709,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
1009 num auths and therefore go off the end */ 709 num auths and therefore go off the end */
1010 cFYI(1, "RID 0x%x", 710 cFYI(1, "RID 0x%x",
1011 le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); 711 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
1012#endif
1013 } 712 }
713#endif
1014 714
1015 return 0; 715 return 0;
1016} 716}
@@ -1120,8 +820,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
1120 kfree(nowner_sid_ptr); 820 kfree(nowner_sid_ptr);
1121 return rc; 821 return rc;
1122 } 822 }
1123 memcpy(owner_sid_ptr, nowner_sid_ptr, 823 cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr);
1124 sizeof(struct cifs_sid));
1125 kfree(nowner_sid_ptr); 824 kfree(nowner_sid_ptr);
1126 *aclflag = CIFS_ACL_OWNER; 825 *aclflag = CIFS_ACL_OWNER;
1127 } 826 }
@@ -1139,8 +838,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
1139 kfree(ngroup_sid_ptr); 838 kfree(ngroup_sid_ptr);
1140 return rc; 839 return rc;
1141 } 840 }
1142 memcpy(group_sid_ptr, ngroup_sid_ptr, 841 cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr);
1143 sizeof(struct cifs_sid));
1144 kfree(ngroup_sid_ptr); 842 kfree(ngroup_sid_ptr);
1145 *aclflag = CIFS_ACL_GROUP; 843 *aclflag = CIFS_ACL_GROUP;
1146 } 844 }
@@ -1316,42 +1014,39 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
1316 1014
1317 /* Get the security descriptor */ 1015 /* Get the security descriptor */
1318 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); 1016 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
1319
1320 /* Add three ACEs for owner, group, everyone getting rid of
1321 other ACEs as chmod disables ACEs and set the security descriptor */
1322
1323 if (IS_ERR(pntsd)) { 1017 if (IS_ERR(pntsd)) {
1324 rc = PTR_ERR(pntsd); 1018 rc = PTR_ERR(pntsd);
1325 cERROR(1, "%s: error %d getting sec desc", __func__, rc); 1019 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
1326 } else { 1020 goto out;
1327 /* allocate memory for the smb header, 1021 }
1328 set security descriptor request security descriptor
1329 parameters, and secuirty descriptor itself */
1330
1331 secdesclen = secdesclen < DEFSECDESCLEN ?
1332 DEFSECDESCLEN : secdesclen;
1333 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
1334 if (!pnntsd) {
1335 cERROR(1, "Unable to allocate security descriptor");
1336 kfree(pntsd);
1337 return -ENOMEM;
1338 }
1339 1022
1340 rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, 1023 /*
1341 &aclflag); 1024 * Add three ACEs for owner, group, everyone getting rid of other ACEs
1025 * as chmod disables ACEs and set the security descriptor. Allocate
1026 * memory for the smb header, set security descriptor request security
1027 * descriptor parameters, and secuirty descriptor itself
1028 * descriptor parameters, and security descriptor itself
1029 secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN);
1030 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
1031 if (!pnntsd) {
1032 cERROR(1, "Unable to allocate security descriptor");
1033 kfree(pntsd);
1034 return -ENOMEM;
1035 }
1342 1036
1343 cFYI(DBG2, "build_sec_desc rc: %d", rc); 1037 rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
1038 &aclflag);
1344 1039
1345 if (!rc) { 1040 cFYI(DBG2, "build_sec_desc rc: %d", rc);
1346 /* Set the security descriptor */
1347 rc = set_cifs_acl(pnntsd, secdesclen, inode,
1348 path, aclflag);
1349 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
1350 }
1351 1041
1352 kfree(pnntsd); 1042 if (!rc) {
1353 kfree(pntsd); 1043 /* Set the security descriptor */
1044 rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
1045 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
1354 } 1046 }
1355 1047
1048 kfree(pnntsd);
1049 kfree(pntsd);
1050out:
1356 return rc; 1051 return rc;
1357} 1052}
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 5c902c7ce524..4f3884835267 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -23,11 +23,8 @@
23#define _CIFSACL_H 23#define _CIFSACL_H
24 24
25 25
26#define NUM_AUTHS 6 /* number of authority fields */ 26#define NUM_AUTHS (6) /* number of authority fields */
27#define NUM_SUBAUTHS 5 /* number of sub authority fields */ 27#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
28#define NUM_WK_SIDS 7 /* number of well known sids */
29#define SIDNAMELENGTH 20 /* long enough for the ones we care about */
30#define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */
31 28
32#define READ_BIT 0x4 29#define READ_BIT 0x4
33#define WRITE_BIT 0x2 30#define WRITE_BIT 0x2
@@ -41,12 +38,32 @@
41 38
42#define SIDOWNER 1 39#define SIDOWNER 1
43#define SIDGROUP 2 40#define SIDGROUP 2
44#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */
45 41
46#define SID_ID_MAPPED 0 42/*
47#define SID_ID_PENDING 1 43 * Security Descriptor length containing DACL with 3 ACEs (one each for
48#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */ 44 * owner, group and world).
49#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */ 45 */
46#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \
47 sizeof(struct cifs_acl) + \
48 (sizeof(struct cifs_ace) * 3))
49
50/*
51 * Maximum size of a string representation of a SID:
52 *
53 * The fields are unsigned values in decimal. So:
54 *
55 * u8: max 3 bytes in decimal
56 * u32: max 10 bytes in decimal
57 *
58 * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
59 *
60 * For authority field, max is when all 6 values are non-zero and it must be
61 * represented in hex. So "-0x" + 12 hex digits.
62 *
63 * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
64 */
65#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
66#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
50 67
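Editor's note: the two string-size constants above combine into a worst-case buffer bound for a SID's string form. A hedged sizing sketch:

	/* largest possible "S-..." representation of a SID */
	char sidstr[SID_STRING_BASE_SIZE +
		    SID_MAX_SUB_AUTHORITIES * SID_STRING_SUBAUTH_SIZE];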
51struct cifs_ntsd { 68struct cifs_ntsd {
52 __le16 revision; /* revision level */ 69 __le16 revision; /* revision level */
@@ -60,10 +77,13 @@ struct cifs_ntsd {
60struct cifs_sid { 77struct cifs_sid {
61 __u8 revision; /* revision level */ 78 __u8 revision; /* revision level */
62 __u8 num_subauth; 79 __u8 num_subauth;
63 __u8 authority[6]; 80 __u8 authority[NUM_AUTHS];
64 __le32 sub_auth[5]; /* sub_auth[num_subauth] */ 81 __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
65} __attribute__((packed)); 82} __attribute__((packed));
66 83
84/* size of a struct cifs_sid, sans sub_auth array */
85#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
86
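Editor's note: CIFS_SID_BASE_SIZE gives the on-the-wire size of a SID before its variable-length sub_auth array. A hedged helper sketch (hypothetical name, not part of the patch):

static inline size_t cifs_sid_size(const struct cifs_sid *sid)
{
	/* fixed header plus one __le32 per used sub-authority */
	return CIFS_SID_BASE_SIZE + sid->num_subauth * sizeof(__le32);
}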
67struct cifs_acl { 87struct cifs_acl {
68 __le16 revision; /* revision level */ 88 __le16 revision; /* revision level */
69 __le16 size; 89 __le16 size;
@@ -78,26 +98,4 @@ struct cifs_ace {
78 struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ 98 struct cifs_sid sid; /* ie UUID of user or group who gets these perms */
79} __attribute__((packed)); 99} __attribute__((packed));
80 100
81struct cifs_wksid {
82 struct cifs_sid cifssid;
83 char sidname[SIDNAMELENGTH];
84} __attribute__((packed));
85
86struct cifs_sid_id {
87 unsigned int refcount; /* increment with spinlock, decrement without */
88 unsigned long id;
89 unsigned long time;
90 unsigned long state;
91 char *sidstr;
92 struct rb_node rbnode;
93 struct cifs_sid sid;
94};
95
96#ifdef __KERNEL__
97extern struct key_type cifs_idmap_key_type;
98extern const struct cred *root_cred;
99#endif /* KERNEL */
100
101extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
102
103#endif /* _CIFSACL_H */ 101#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e7931cc55d0c..ce9f3c5421bf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -64,24 +64,23 @@ unsigned int global_secflags = CIFSSEC_DEF;
64unsigned int sign_CIFS_PDUs = 1; 64unsigned int sign_CIFS_PDUs = 1;
65static const struct super_operations cifs_super_ops; 65static const struct super_operations cifs_super_ops;
66unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 66unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
67module_param(CIFSMaxBufSize, int, 0); 67module_param(CIFSMaxBufSize, uint, 0);
68MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " 68MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
69 "Default: 16384 Range: 8192 to 130048"); 69 "Default: 16384 Range: 8192 to 130048");
70unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; 70unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
71module_param(cifs_min_rcv, int, 0); 71module_param(cifs_min_rcv, uint, 0);
72MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " 72MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
73 "1 to 64"); 73 "1 to 64");
74unsigned int cifs_min_small = 30; 74unsigned int cifs_min_small = 30;
75module_param(cifs_min_small, int, 0); 75module_param(cifs_min_small, uint, 0);
76MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " 76MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
77 "Range: 2 to 256"); 77 "Range: 2 to 256");
78unsigned int cifs_max_pending = CIFS_MAX_REQ; 78unsigned int cifs_max_pending = CIFS_MAX_REQ;
79module_param(cifs_max_pending, int, 0444); 79module_param(cifs_max_pending, uint, 0444);
80MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 80MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
81 "Default: 32767 Range: 2 to 32767."); 81 "Default: 32767 Range: 2 to 32767.");
82module_param(enable_oplocks, bool, 0644); 82module_param(enable_oplocks, bool, 0644);
83MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" 83MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
84 "y/Y/1");
85 84
86extern mempool_t *cifs_sm_req_poolp; 85extern mempool_t *cifs_sm_req_poolp;
87extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
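Editor's note: switching these module parameters from int to uint makes negative values fail at module load rather than being silently accepted; usage is unchanged, e.g. "modprobe cifs cifs_max_pending=256" (a hypothetical value within the documented 2 to 32767 range).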
@@ -230,6 +229,7 @@ cifs_alloc_inode(struct super_block *sb)
230 cifs_set_oplock_level(cifs_inode, 0); 229 cifs_set_oplock_level(cifs_inode, 0);
231 cifs_inode->delete_pending = false; 230 cifs_inode->delete_pending = false;
232 cifs_inode->invalid_mapping = false; 231 cifs_inode->invalid_mapping = false;
232 cifs_inode->leave_pages_clean = false;
233 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 233 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
234 cifs_inode->server_eof = 0; 234 cifs_inode->server_eof = 0;
235 cifs_inode->uniqueid = 0; 235 cifs_inode->uniqueid = 0;
@@ -540,8 +540,8 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
540 char *s, *p; 540 char *s, *p;
541 char sep; 541 char sep;
542 542
543 full_path = build_path_to_root(vol, cifs_sb, 543 full_path = cifs_build_path_to_root(vol, cifs_sb,
544 cifs_sb_master_tcon(cifs_sb)); 544 cifs_sb_master_tcon(cifs_sb));
545 if (full_path == NULL) 545 if (full_path == NULL)
546 return ERR_PTR(-ENOMEM); 546 return ERR_PTR(-ENOMEM);
547 547
@@ -695,13 +695,13 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
695 return written; 695 return written;
696} 696}
697 697
698static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) 698static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
699{ 699{
700 /* 700 /*
701 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate 701 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
702 * the cached file length 702 * the cached file length
703 */ 703 */
704 if (origin != SEEK_SET && origin != SEEK_CUR) { 704 if (whence != SEEK_SET && whence != SEEK_CUR) {
705 int rc; 705 int rc;
706 struct inode *inode = file->f_path.dentry->d_inode; 706 struct inode *inode = file->f_path.dentry->d_inode;
707 707
@@ -728,7 +728,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
728 if (rc < 0) 728 if (rc < 0)
729 return (loff_t)rc; 729 return (loff_t)rc;
730 } 730 }
731 return generic_file_llseek(file, offset, origin); 731 return generic_file_llseek(file, offset, whence);
732} 732}
733 733
734static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 734static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
@@ -1205,7 +1205,6 @@ exit_cifs(void)
1205 unregister_filesystem(&cifs_fs_type); 1205 unregister_filesystem(&cifs_fs_type);
1206 cifs_dfs_release_automount_timer(); 1206 cifs_dfs_release_automount_timer();
1207#ifdef CONFIG_CIFS_ACL 1207#ifdef CONFIG_CIFS_ACL
1208 cifs_destroy_idmaptrees();
1209 exit_cifs_idmap(); 1208 exit_cifs_idmap();
1210#endif 1209#endif
1211#ifdef CONFIG_CIFS_UPCALL 1210#ifdef CONFIG_CIFS_UPCALL
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f5af2527fc69..aea1eec64911 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -178,6 +178,7 @@ struct smb_rqst {
178 178
179enum smb_version { 179enum smb_version {
180 Smb_1 = 1, 180 Smb_1 = 1,
181 Smb_20,
181 Smb_21, 182 Smb_21,
182 Smb_30, 183 Smb_30,
183}; 184};
@@ -280,9 +281,6 @@ struct smb_version_operations {
280 /* set attributes */ 281 /* set attributes */
281 int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, 282 int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
282 const unsigned int); 283 const unsigned int);
283 /* build a full path to the root of the mount */
284 char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *,
285 struct cifs_tcon *);
286 /* check if we can send an echo or not */ 284 /* check if we can send an echo or not */
287 bool (*can_echo)(struct TCP_Server_Info *); 285 bool (*can_echo)(struct TCP_Server_Info *);
288 /* send echo request */ 286 /* send echo request */
@@ -369,6 +367,8 @@ struct smb_version_operations {
369 void (*set_lease_key)(struct inode *, struct cifs_fid *fid); 367 void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
370 /* generate new lease key */ 368 /* generate new lease key */
371 void (*new_lease_key)(struct cifs_fid *fid); 369 void (*new_lease_key)(struct cifs_fid *fid);
370 int (*calc_signature)(struct smb_rqst *rqst,
371 struct TCP_Server_Info *server);
372}; 372};
373 373
374struct smb_version_values { 374struct smb_version_values {
@@ -396,7 +396,6 @@ struct smb_vol {
396 char *password; 396 char *password;
397 char *domainname; 397 char *domainname;
398 char *UNC; 398 char *UNC;
399 char *UNCip;
400 char *iocharset; /* local code page for mapping to and from Unicode */ 399 char *iocharset; /* local code page for mapping to and from Unicode */
401 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ 400 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
402 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ 401 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
@@ -444,11 +443,11 @@ struct smb_vol {
444 unsigned int rsize; 443 unsigned int rsize;
445 unsigned int wsize; 444 unsigned int wsize;
446 bool sockopt_tcp_nodelay:1; 445 bool sockopt_tcp_nodelay:1;
447 unsigned short int port;
448 unsigned long actimeo; /* attribute cache timeout (jiffies) */ 446 unsigned long actimeo; /* attribute cache timeout (jiffies) */
449 struct smb_version_operations *ops; 447 struct smb_version_operations *ops;
450 struct smb_version_values *vals; 448 struct smb_version_values *vals;
451 char *prepath; 449 char *prepath;
450 struct sockaddr_storage dstaddr; /* destination address */
452 struct sockaddr_storage srcaddr; /* allow binding to a local IP */ 451 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
453 struct nls_table *local_nls; 452 struct nls_table *local_nls;
454}; 453};
@@ -1031,6 +1030,7 @@ struct cifsInodeInfo {
1031 bool clientCanCacheAll; /* read and writebehind oplock */ 1030 bool clientCanCacheAll; /* read and writebehind oplock */
1032 bool delete_pending; /* DELETE_ON_CLOSE is set */ 1031 bool delete_pending; /* DELETE_ON_CLOSE is set */
1033 bool invalid_mapping; /* pagecache is invalid */ 1032 bool invalid_mapping; /* pagecache is invalid */
1033 bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */
1034 unsigned long time; /* jiffies of last update of inode */ 1034 unsigned long time; /* jiffies of last update of inode */
1035 u64 server_eof; /* current file size on server -- protected by i_lock */ 1035 u64 server_eof; /* current file size on server -- protected by i_lock */
1036 u64 uniqueid; /* server inode number */ 1036 u64 uniqueid; /* server inode number */
@@ -1067,30 +1067,16 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
1067static inline void 1067static inline void
1068convert_delimiter(char *path, char delim) 1068convert_delimiter(char *path, char delim)
1069{ 1069{
1070 int i; 1070 char old_delim, *pos;
1071 char old_delim;
1072
1073 if (path == NULL)
1074 return;
1075 1071
1076 if (delim == '/') 1072 if (delim == '/')
1077 old_delim = '\\'; 1073 old_delim = '\\';
1078 else 1074 else
1079 old_delim = '/'; 1075 old_delim = '/';
1080 1076
1081 for (i = 0; path[i] != '\0'; i++) { 1077 pos = path;
1082 if (path[i] == old_delim) 1078 while ((pos = strchr(pos, old_delim)))
1083 path[i] = delim; 1079 *pos = delim;
1084 }
1085}
1086
1087static inline char *
1088build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
1089 struct cifs_tcon *tcon)
1090{
1091 if (!vol->ops->build_path_to_root)
1092 return NULL;
1093 return vol->ops->build_path_to_root(vol, cifs_sb, tcon);
1094} 1080}
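Editor's note: the strchr() rewrite above also drops the NULL check, so convert_delimiter() now assumes a non-NULL path. A hedged usage sketch:

	/* flip UNC separators in place */
	char path[] = "//srv/share/dir";
	convert_delimiter(path, '\\');	/* path becomes "\\srv\share\dir" */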
1095 1081
1096#ifdef CONFIG_CIFS_STATS 1082#ifdef CONFIG_CIFS_STATS
@@ -1362,7 +1348,7 @@ require use of the stronger protocol */
1362#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ 1348#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
1363#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ 1349#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
1364 1350
1365#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) 1351#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)
1366#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) 1352#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
1367#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) 1353#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
1368/* 1354/*
@@ -1506,6 +1492,6 @@ extern struct smb_version_values smb20_values;
1506extern struct smb_version_operations smb21_operations; 1492extern struct smb_version_operations smb21_operations;
1507extern struct smb_version_values smb21_values; 1493extern struct smb_version_values smb21_values;
1508#define SMB30_VERSION_STRING "3.0" 1494#define SMB30_VERSION_STRING "3.0"
1509/*extern struct smb_version_operations smb30_operations; */ /* not needed yet */ 1495extern struct smb_version_operations smb30_operations;
1510extern struct smb_version_values smb30_values; 1496extern struct smb_version_values smb30_values;
1511#endif /* _CIFS_GLOB_H */ 1497#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5144e9fbeb8c..1988c1baa224 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -58,8 +58,10 @@ do { \
58} while (0) 58} while (0)
59extern int init_cifs_idmap(void); 59extern int init_cifs_idmap(void);
60extern void exit_cifs_idmap(void); 60extern void exit_cifs_idmap(void);
61extern void cifs_destroy_idmaptrees(void);
62extern char *build_path_from_dentry(struct dentry *); 61extern char *build_path_from_dentry(struct dentry *);
62extern char *cifs_build_path_to_root(struct smb_vol *vol,
63 struct cifs_sb_info *cifs_sb,
64 struct cifs_tcon *tcon);
63extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 65extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
64extern char *cifs_compose_mount_options(const char *sb_mountdata, 66extern char *cifs_compose_mount_options(const char *sb_mountdata,
65 const char *fullpath, const struct dfs_info3_param *ref, 67 const char *fullpath, const struct dfs_info3_param *ref,
@@ -107,9 +109,7 @@ extern unsigned int smbCalcSize(void *buf);
107extern int decode_negTokenInit(unsigned char *security_blob, int length, 109extern int decode_negTokenInit(unsigned char *security_blob, int length,
108 struct TCP_Server_Info *server); 110 struct TCP_Server_Info *server);
109extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); 111extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
110extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); 112extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port);
111extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
112 const unsigned short int port);
113extern int map_smb_to_linux_error(char *buf, bool logErr); 113extern int map_smb_to_linux_error(char *buf, bool logErr);
114extern void header_assemble(struct smb_hdr *, char /* command */ , 114extern void header_assemble(struct smb_hdr *, char /* command */ ,
115 const struct cifs_tcon *, int /* length of 115 const struct cifs_tcon *, int /* length of
@@ -185,7 +185,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
185extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, 185extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
186 __u64 length, __u8 type, 186 __u64 length, __u8 type,
187 struct cifsLockInfo **conf_lock, 187 struct cifsLockInfo **conf_lock,
188 bool rw_check); 188 int rw_check);
189extern void cifs_add_pending_open(struct cifs_fid *fid, 189extern void cifs_add_pending_open(struct cifs_fid *fid,
190 struct tcon_link *tlink, 190 struct tcon_link *tlink,
191 struct cifs_pending_open *open); 191 struct cifs_pending_open *open);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5c670b998ffb..7635b5db26a7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -186,6 +186,7 @@ static const match_table_t cifs_mount_option_tokens = {
186 { Opt_user, "user=%s" }, 186 { Opt_user, "user=%s" },
187 { Opt_user, "username=%s" }, 187 { Opt_user, "username=%s" },
188 { Opt_blank_pass, "pass=" }, 188 { Opt_blank_pass, "pass=" },
189 { Opt_blank_pass, "password=" },
189 { Opt_pass, "pass=%s" }, 190 { Opt_pass, "pass=%s" },
190 { Opt_pass, "password=%s" }, 191 { Opt_pass, "password=%s" },
191 { Opt_blank_ip, "ip=" }, 192 { Opt_blank_ip, "ip=" },
@@ -274,6 +275,7 @@ static const match_table_t cifs_cacheflavor_tokens = {
274 275
275static const match_table_t cifs_smb_version_tokens = { 276static const match_table_t cifs_smb_version_tokens = {
276 { Smb_1, SMB1_VERSION_STRING }, 277 { Smb_1, SMB1_VERSION_STRING },
278 { Smb_20, SMB20_VERSION_STRING},
277 { Smb_21, SMB21_VERSION_STRING }, 279 { Smb_21, SMB21_VERSION_STRING },
278 { Smb_30, SMB30_VERSION_STRING }, 280 { Smb_30, SMB30_VERSION_STRING },
279}; 281};
@@ -1074,12 +1076,16 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
1074 vol->vals = &smb1_values; 1076 vol->vals = &smb1_values;
1075 break; 1077 break;
1076#ifdef CONFIG_CIFS_SMB2 1078#ifdef CONFIG_CIFS_SMB2
1079 case Smb_20:
1080 vol->ops = &smb21_operations; /* currently identical with 2.1 */
1081 vol->vals = &smb20_values;
1082 break;
1077 case Smb_21: 1083 case Smb_21:
1078 vol->ops = &smb21_operations; 1084 vol->ops = &smb21_operations;
1079 vol->vals = &smb21_values; 1085 vol->vals = &smb21_values;
1080 break; 1086 break;
1081 case Smb_30: 1087 case Smb_30:
1082 vol->ops = &smb21_operations; /* currently identical with 2.1 */ 1088 vol->ops = &smb30_operations;
1083 vol->vals = &smb30_values; 1089 vol->vals = &smb30_values;
1084 break; 1090 break;
1085#endif 1091#endif
@@ -1090,6 +1096,52 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
1090 return 0; 1096 return 0;
1091} 1097}
1092 1098
1099/*
1100 * Parse a devname into substrings and populate the vol->UNC and vol->prepath
1101 * fields with the result. Returns 0 on success and an error otherwise.
1102 */
1103static int
1104cifs_parse_devname(const char *devname, struct smb_vol *vol)
1105{
1106 char *pos;
1107 const char *delims = "/\\";
1108 size_t len;
1109
1110 /* make sure we have a valid UNC double delimiter prefix */
1111 len = strspn(devname, delims);
1112 if (len != 2)
1113 return -EINVAL;
1114
1115 /* find delimiter between host and sharename */
1116 pos = strpbrk(devname + 2, delims);
1117 if (!pos)
1118 return -EINVAL;
1119
1120 /* skip past delimiter */
1121 ++pos;
1122
1123 /* now go until next delimiter or end of string */
1124 len = strcspn(pos, delims);
1125
1126 /* move "pos" up to delimiter or NULL */
1127 pos += len;
1128 vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL);
1129 if (!vol->UNC)
1130 return -ENOMEM;
1131
1132 convert_delimiter(vol->UNC, '\\');
1133
1134 /* If pos is NULL, or is a bogus trailing delimiter then no prepath */
1135 if (!*pos++ || !*pos)
1136 return 0;
1137
1138 vol->prepath = kstrdup(pos, GFP_KERNEL);
1139 if (!vol->prepath)
1140 return -ENOMEM;
1141
1142 return 0;
1143}
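Editor's note: tracing cifs_parse_devname() above with a typical device string (hypothetical host and share names):

	/* worked example, not from the patch itself:
	 *   devname      = "//host/share/sub/dir"
	 *   vol->UNC     = "\\host\share"   (delimiters converted)
	 *   vol->prepath = "sub/dir"
	 * "//host/share" and "//host/share/" both leave prepath unset.
	 */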
1144
1093static int 1145static int
1094cifs_parse_mount_options(const char *mountdata, const char *devname, 1146cifs_parse_mount_options(const char *mountdata, const char *devname,
1095 struct smb_vol *vol) 1147 struct smb_vol *vol)
@@ -1108,11 +1160,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1108 char *string = NULL; 1160 char *string = NULL;
1109 char *tmp_end, *value; 1161 char *tmp_end, *value;
1110 char delim; 1162 char delim;
1163 bool got_ip = false;
1164 unsigned short port = 0;
1165 struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr;
1111 1166
1112 separator[0] = ','; 1167 separator[0] = ',';
1113 separator[1] = 0; 1168 separator[1] = 0;
1114 delim = separator[0]; 1169 delim = separator[0];
1115 1170
1171 /* ensure we always start with zeroed-out smb_vol */
1172 memset(vol, 0, sizeof(*vol));
1173
1116 /* 1174 /*
1117 * does not have to be perfect mapping since field is 1175 * does not have to be perfect mapping since field is
1118 * informational, only used for servers that do not support 1176 * informational, only used for servers that do not support
@@ -1169,6 +1227,16 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1169 vol->backupuid_specified = false; /* no backup intent for a user */ 1227 vol->backupuid_specified = false; /* no backup intent for a user */
1170 vol->backupgid_specified = false; /* no backup intent for a group */ 1228 vol->backupgid_specified = false; /* no backup intent for a group */
1171 1229
1230 /*
1231 * For now, we ignore -EINVAL errors under the assumption that the
1232 * unc= and prefixpath= options will be usable.
1233 */
1234 if (cifs_parse_devname(devname, vol) == -ENOMEM) {
1235 printk(KERN_ERR "CIFS: Unable to allocate memory to parse "
1236 "device string.\n");
1237 goto out_nomem;
1238 }
1239
1172 while ((data = strsep(&options, separator)) != NULL) { 1240 while ((data = strsep(&options, separator)) != NULL) {
1173 substring_t args[MAX_OPT_ARGS]; 1241 substring_t args[MAX_OPT_ARGS];
1174 unsigned long option; 1242 unsigned long option;
@@ -1416,12 +1484,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1416 vol->dir_mode = option; 1484 vol->dir_mode = option;
1417 break; 1485 break;
1418 case Opt_port: 1486 case Opt_port:
1419 if (get_option_ul(args, &option)) { 1487 if (get_option_ul(args, &option) ||
1420 cERROR(1, "%s: Invalid port value", 1488 option > USHRT_MAX) {
1421 __func__); 1489 cERROR(1, "%s: Invalid port value", __func__);
1422 goto cifs_parse_mount_err; 1490 goto cifs_parse_mount_err;
1423 } 1491 }
1424 vol->port = option; 1492 port = (unsigned short)option;
1425 break; 1493 break;
1426 case Opt_rsize: 1494 case Opt_rsize:
1427 if (get_option_ul(args, &option)) { 1495 if (get_option_ul(args, &option)) {
@@ -1537,53 +1605,48 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1537 vol->password[j] = '\0'; 1605 vol->password[j] = '\0';
1538 break; 1606 break;
1539 case Opt_blank_ip: 1607 case Opt_blank_ip:
1540 vol->UNCip = NULL; 1608 /* FIXME: should this be an error instead? */
1609 got_ip = false;
1541 break; 1610 break;
1542 case Opt_ip: 1611 case Opt_ip:
1543 string = match_strdup(args); 1612 string = match_strdup(args);
1544 if (string == NULL) 1613 if (string == NULL)
1545 goto out_nomem; 1614 goto out_nomem;
1546 1615
1547 if (strnlen(string, INET6_ADDRSTRLEN) > 1616 if (!cifs_convert_address(dstaddr, string,
1548 INET6_ADDRSTRLEN) { 1617 strlen(string))) {
1549 printk(KERN_WARNING "CIFS: ip address " 1618 printk(KERN_ERR "CIFS: bad ip= option (%s).\n",
1550 "too long\n"); 1619 string);
1551 goto cifs_parse_mount_err;
1552 }
1553 vol->UNCip = kstrdup(string, GFP_KERNEL);
1554 if (!vol->UNCip) {
1555 printk(KERN_WARNING "CIFS: no memory "
1556 "for UNC IP\n");
1557 goto cifs_parse_mount_err; 1620 goto cifs_parse_mount_err;
1558 } 1621 }
1622 got_ip = true;
1559 break; 1623 break;
1560 case Opt_unc: 1624 case Opt_unc:
1561 string = match_strdup(args); 1625 string = vol->UNC;
1562 if (string == NULL) 1626 vol->UNC = match_strdup(args);
1627 if (vol->UNC == NULL) {
1628 kfree(string);
1563 goto out_nomem; 1629 goto out_nomem;
1564
1565 temp_len = strnlen(string, 300);
1566 if (temp_len == 300) {
1567 printk(KERN_WARNING "CIFS: UNC name too long\n");
1568 goto cifs_parse_mount_err;
1569 } 1630 }
1570 1631
1571 vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); 1632 convert_delimiter(vol->UNC, '\\');
1572 if (vol->UNC == NULL) { 1633 if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') {
1573 printk(KERN_WARNING "CIFS: no memory for UNC\n"); 1634 kfree(string);
1574 goto cifs_parse_mount_err; 1635 printk(KERN_ERR "CIFS: UNC Path does not "
1575 } 1636 "begin with // or \\\\\n");
1576 strcpy(vol->UNC, string);
1577
1578 if (strncmp(string, "//", 2) == 0) {
1579 vol->UNC[0] = '\\';
1580 vol->UNC[1] = '\\';
1581 } else if (strncmp(string, "\\\\", 2) != 0) {
1582 printk(KERN_WARNING "CIFS: UNC Path does not "
1583 "begin with // or \\\\\n");
1584 goto cifs_parse_mount_err; 1637 goto cifs_parse_mount_err;
1585 } 1638 }
1586 1639
1640 /* Compare old unc= option to new one */
1641 if (!string || strcmp(string, vol->UNC))
1642 printk(KERN_WARNING "CIFS: the value of the "
1643 "unc= mount option does not match the "
1644 "device string. Using the unc= option "
1645 "for now. In 3.10, that option will "
1646 "be ignored and the contents of the "
1647 "device string will be used "
1648 "instead. (%s != %s)\n", string,
1649 vol->UNC);
1587 break; 1650 break;
1588 case Opt_domain: 1651 case Opt_domain:
1589 string = match_strdup(args); 1652 string = match_strdup(args);
@@ -1618,31 +1681,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1618 } 1681 }
1619 break; 1682 break;
1620 case Opt_prefixpath: 1683 case Opt_prefixpath:
1621 string = match_strdup(args); 1684 /* skip over any leading delimiter */
1622 if (string == NULL) 1685 if (*args[0].from == '/' || *args[0].from == '\\')
1623 goto out_nomem; 1686 args[0].from++;
1624
1625 temp_len = strnlen(string, 1024);
1626 if (string[0] != '/')
1627 temp_len++; /* missing leading slash */
1628 if (temp_len > 1024) {
1629 printk(KERN_WARNING "CIFS: prefix too long\n");
1630 goto cifs_parse_mount_err;
1631 }
1632 1687
1633 vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); 1688 string = vol->prepath;
1689 vol->prepath = match_strdup(args);
1634 if (vol->prepath == NULL) { 1690 if (vol->prepath == NULL) {
1635 printk(KERN_WARNING "CIFS: no memory " 1691 kfree(string);
1636 "for path prefix\n"); 1692 goto out_nomem;
1637 goto cifs_parse_mount_err;
1638 } 1693 }
1639 1694 /* Compare old prefixpath= option to new one */
1640 if (string[0] != '/') { 1695 if (!string || strcmp(string, vol->prepath))
1641 vol->prepath[0] = '/'; 1696 printk(KERN_WARNING "CIFS: the value of the "
1642 strcpy(vol->prepath+1, string); 1697 "prefixpath= mount option does not "
1643 } else 1698 "match the device string. Using the "
1644 strcpy(vol->prepath, string); 1699 "prefixpath= option for now. In 3.10, "
1645 1700 "that option will be ignored and the "
1701 "contents of the device string will be "
1702 "used instead.(%s != %s)\n", string,
1703 vol->prepath);
1646 break; 1704 break;
1647 case Opt_iocharset: 1705 case Opt_iocharset:
1648 string = match_strdup(args); 1706 string = match_strdup(args);
@@ -1799,9 +1857,30 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1799 goto cifs_parse_mount_err; 1857 goto cifs_parse_mount_err;
1800 } 1858 }
1801#endif 1859#endif
1860 if (!vol->UNC) {
1861 cERROR(1, "CIFS mount error: No usable UNC path provided in "
1862 "device string or in unc= option!");
1863 goto cifs_parse_mount_err;
1864 }
1802 1865
1803 if (vol->UNCip == NULL) 1866 /* make sure UNC has a share name */
1804 vol->UNCip = &vol->UNC[2]; 1867 if (!strchr(vol->UNC + 3, '\\')) {
1868 cERROR(1, "Malformed UNC. Unable to find share name.");
1869 goto cifs_parse_mount_err;
1870 }
1871
1872 if (!got_ip) {
1873 /* No ip= option specified? Try to get it from UNC */
1874 if (!cifs_convert_address(dstaddr, &vol->UNC[2],
1875 strlen(&vol->UNC[2]))) {
1876 printk(KERN_ERR "Unable to determine destination "
1877 "address.\n");
1878 goto cifs_parse_mount_err;
1879 }
1880 }
1881
1882 /* set the port that we got earlier */
1883 cifs_set_port(dstaddr, port);
1805 1884
1806 if (uid_specified) 1885 if (uid_specified)
1807 vol->override_uid = override_uid; 1886 vol->override_uid = override_uid;
@@ -1972,9 +2051,10 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
1972 return true; 2051 return true;
1973} 2052}
1974 2053
1975static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, 2054static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
1976 struct smb_vol *vol)
1977{ 2055{
2056 struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
2057
1978 if ((server->vals != vol->vals) || (server->ops != vol->ops)) 2058 if ((server->vals != vol->vals) || (server->ops != vol->ops))
1979 return 0; 2059 return 0;
1980 2060
@@ -1995,13 +2075,13 @@ static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
1995} 2075}
1996 2076
1997static struct TCP_Server_Info * 2077static struct TCP_Server_Info *
1998cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) 2078cifs_find_tcp_session(struct smb_vol *vol)
1999{ 2079{
2000 struct TCP_Server_Info *server; 2080 struct TCP_Server_Info *server;
2001 2081
2002 spin_lock(&cifs_tcp_ses_lock); 2082 spin_lock(&cifs_tcp_ses_lock);
2003 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { 2083 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
2004 if (!match_server(server, addr, vol)) 2084 if (!match_server(server, vol))
2005 continue; 2085 continue;
2006 2086
2007 ++server->srv_count; 2087 ++server->srv_count;
@@ -2051,40 +2131,12 @@ static struct TCP_Server_Info *
2051cifs_get_tcp_session(struct smb_vol *volume_info) 2131cifs_get_tcp_session(struct smb_vol *volume_info)
2052{ 2132{
2053 struct TCP_Server_Info *tcp_ses = NULL; 2133 struct TCP_Server_Info *tcp_ses = NULL;
2054 struct sockaddr_storage addr;
2055 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
2056 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
2057 int rc; 2134 int rc;
2058 2135
2059 memset(&addr, 0, sizeof(struct sockaddr_storage)); 2136 cFYI(1, "UNC: %s", volume_info->UNC);
2060
2061 cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
2062
2063 if (volume_info->UNCip && volume_info->UNC) {
2064 rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
2065 volume_info->UNCip,
2066 strlen(volume_info->UNCip),
2067 volume_info->port);
2068 if (!rc) {
2069 /* we failed translating address */
2070 rc = -EINVAL;
2071 goto out_err;
2072 }
2073 } else if (volume_info->UNCip) {
2074 /* BB using ip addr as tcp_ses name to connect to the
2075 DFS root below */
2076 cERROR(1, "Connecting to DFS root not implemented yet");
2077 rc = -EINVAL;
2078 goto out_err;
2079 } else /* which tcp_sess DFS root would we conect to */ {
2080 cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
2081 "unc=//192.168.1.100/public) specified");
2082 rc = -EINVAL;
2083 goto out_err;
2084 }
2085 2137
2086 /* see if we already have a matching tcp_ses */ 2138 /* see if we already have a matching tcp_ses */
2087 tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); 2139 tcp_ses = cifs_find_tcp_session(volume_info);
2088 if (tcp_ses) 2140 if (tcp_ses)
2089 return tcp_ses; 2141 return tcp_ses;
2090 2142
@@ -2129,27 +2181,18 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
2129 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); 2181 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
2130 INIT_LIST_HEAD(&tcp_ses->smb_ses_list); 2182 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
2131 INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); 2183 INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
2132 2184 memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
2185 sizeof(tcp_ses->srcaddr));
2186 memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
2187 sizeof(tcp_ses->dstaddr));
2133 /* 2188 /*
2134 * at this point we are the only ones with the pointer 2189 * at this point we are the only ones with the pointer
2135 * to the struct since the kernel thread not created yet 2190 * to the struct since the kernel thread not created yet
2136 * no need to spinlock this init of tcpStatus or srv_count 2191 * no need to spinlock this init of tcpStatus or srv_count
2137 */ 2192 */
2138 tcp_ses->tcpStatus = CifsNew; 2193 tcp_ses->tcpStatus = CifsNew;
2139 memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
2140 sizeof(tcp_ses->srcaddr));
2141 ++tcp_ses->srv_count; 2194 ++tcp_ses->srv_count;
2142 2195
2143 if (addr.ss_family == AF_INET6) {
2144 cFYI(1, "attempting ipv6 connect");
2145 /* BB should we allow ipv6 on port 139? */
2146 /* other OS never observed in Wild doing 139 with v6 */
2147 memcpy(&tcp_ses->dstaddr, sin_server6,
2148 sizeof(struct sockaddr_in6));
2149 } else
2150 memcpy(&tcp_ses->dstaddr, sin_server,
2151 sizeof(struct sockaddr_in));
2152
2153 rc = ip_connect(tcp_ses); 2196 rc = ip_connect(tcp_ses);
2154 if (rc < 0) { 2197 if (rc < 0) {
2155 cERROR(1, "Error connecting to socket. Aborting operation"); 2198 cERROR(1, "Error connecting to socket. Aborting operation");
@@ -2397,8 +2440,6 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
2397} 2440}
2398#endif /* CONFIG_KEYS */ 2441#endif /* CONFIG_KEYS */
2399 2442
2400static bool warned_on_ntlm; /* globals init to false automatically */
2401
2402static struct cifs_ses * 2443static struct cifs_ses *
2403cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) 2444cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2404{ 2445{
@@ -2475,14 +2516,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2475 ses->cred_uid = volume_info->cred_uid; 2516 ses->cred_uid = volume_info->cred_uid;
2476 ses->linux_uid = volume_info->linux_uid; 2517 ses->linux_uid = volume_info->linux_uid;
2477 2518
2478 /* ntlmv2 is much stronger than ntlm security, and has been broadly
2479 supported for many years, time to update default security mechanism */
2480 if ((volume_info->secFlg == 0) && warned_on_ntlm == false) {
2481 warned_on_ntlm = true;
2482 cERROR(1, "default security mechanism requested. The default "
2483 "security mechanism will be upgraded from ntlm to "
2484 "ntlmv2 in kernel release 3.3");
2485 }
2486 ses->overrideSecFlg = volume_info->secFlg; 2519 ses->overrideSecFlg = volume_info->secFlg;
2487 2520
2488 mutex_lock(&ses->session_mutex); 2521 mutex_lock(&ses->session_mutex);
@@ -2598,13 +2631,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
2598 } 2631 }
2599 } 2632 }
2600 2633
2601 if (strchr(volume_info->UNC + 3, '\\') == NULL
2602 && strchr(volume_info->UNC + 3, '/') == NULL) {
2603 cERROR(1, "Missing share name");
2604 rc = -ENODEV;
2605 goto out_fail;
2606 }
2607
2608 /* 2634 /*
2609 * BB Do we need to wrap session_mutex around this TCon call and Unix 2635 * BB Do we need to wrap session_mutex around this TCon call and Unix
2610 * SetFS as we do on SessSetup and reconnect? 2636 * SetFS as we do on SessSetup and reconnect?
@@ -2718,11 +2744,8 @@ cifs_match_super(struct super_block *sb, void *data)
2718 struct cifs_ses *ses; 2744 struct cifs_ses *ses;
2719 struct cifs_tcon *tcon; 2745 struct cifs_tcon *tcon;
2720 struct tcon_link *tlink; 2746 struct tcon_link *tlink;
2721 struct sockaddr_storage addr;
2722 int rc = 0; 2747 int rc = 0;
2723 2748
2724 memset(&addr, 0, sizeof(struct sockaddr_storage));
2725
2726 spin_lock(&cifs_tcp_ses_lock); 2749 spin_lock(&cifs_tcp_ses_lock);
2727 cifs_sb = CIFS_SB(sb); 2750 cifs_sb = CIFS_SB(sb);
2728 tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); 2751 tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
@@ -2736,17 +2759,7 @@ cifs_match_super(struct super_block *sb, void *data)
2736 2759
2737 volume_info = mnt_data->vol; 2760 volume_info = mnt_data->vol;
2738 2761
2739 if (!volume_info->UNCip || !volume_info->UNC) 2762 if (!match_server(tcp_srv, volume_info) ||
2740 goto out;
2741
2742 rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
2743 volume_info->UNCip,
2744 strlen(volume_info->UNCip),
2745 volume_info->port);
2746 if (!rc)
2747 goto out;
2748
2749 if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) ||
2750 !match_session(ses, volume_info) || 2763 !match_session(ses, volume_info) ||
2751 !match_tcon(tcon, volume_info->UNC)) { 2764 !match_tcon(tcon, volume_info->UNC)) {
2752 rc = 0; 2765 rc = 0;
@@ -3261,8 +3274,6 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
3261{ 3274{
3262 kfree(volume_info->username); 3275 kfree(volume_info->username);
3263 kzfree(volume_info->password); 3276 kzfree(volume_info->password);
3264 if (volume_info->UNCip != volume_info->UNC + 2)
3265 kfree(volume_info->UNCip);
3266 kfree(volume_info->UNC); 3277 kfree(volume_info->UNC);
3267 kfree(volume_info->domainname); 3278 kfree(volume_info->domainname);
3268 kfree(volume_info->iocharset); 3279 kfree(volume_info->iocharset);
@@ -3280,14 +3291,16 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info)
3280 3291
3281 3292
3282#ifdef CONFIG_CIFS_DFS_UPCALL 3293#ifdef CONFIG_CIFS_DFS_UPCALL
3283/* build_path_to_root returns full path to root when 3294/*
3284 * we do not have an exiting connection (tcon) */ 3295 * cifs_build_path_to_root returns full path to root when we do not have an
3296 * existing connection (tcon)
3297 */
3285static char * 3298static char *
3286build_unc_path_to_root(const struct smb_vol *vol, 3299build_unc_path_to_root(const struct smb_vol *vol,
3287 const struct cifs_sb_info *cifs_sb) 3300 const struct cifs_sb_info *cifs_sb)
3288{ 3301{
3289 char *full_path, *pos; 3302 char *full_path, *pos;
3290 unsigned int pplen = vol->prepath ? strlen(vol->prepath) : 0; 3303 unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
3291 unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); 3304 unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1);
3292 3305
3293 full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); 3306 full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
@@ -3298,6 +3311,7 @@ build_unc_path_to_root(const struct smb_vol *vol,
3298 pos = full_path + unc_len; 3311 pos = full_path + unc_len;
3299 3312
3300 if (pplen) { 3313 if (pplen) {
3314 *pos++ = CIFS_DIR_SEP(cifs_sb);
3301 strncpy(pos, vol->prepath, pplen); 3315 strncpy(pos, vol->prepath, pplen);
3302 pos += pplen; 3316 pos += pplen;
3303 } 3317 }
@@ -3353,7 +3367,6 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
3353 mdata = NULL; 3367 mdata = NULL;
3354 } else { 3368 } else {
3355 cleanup_volume_info_contents(volume_info); 3369 cleanup_volume_info_contents(volume_info);
3356 memset(volume_info, '\0', sizeof(*volume_info));
3357 rc = cifs_setup_volume_info(volume_info, mdata, 3370 rc = cifs_setup_volume_info(volume_info, mdata,
3358 fake_devname); 3371 fake_devname);
3359 } 3372 }
@@ -3375,7 +3388,6 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
3375 if (cifs_parse_mount_options(mount_data, devname, volume_info)) 3388 if (cifs_parse_mount_options(mount_data, devname, volume_info))
3376 return -EINVAL; 3389 return -EINVAL;
3377 3390
3378
3379 if (volume_info->nullauth) { 3391 if (volume_info->nullauth) {
3380 cFYI(1, "Anonymous login"); 3392 cFYI(1, "Anonymous login");
3381 kfree(volume_info->username); 3393 kfree(volume_info->username);
@@ -3412,7 +3424,7 @@ cifs_get_volume_info(char *mount_data, const char *devname)
3412 int rc; 3424 int rc;
3413 struct smb_vol *volume_info; 3425 struct smb_vol *volume_info;
3414 3426
3415 volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); 3427 volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL);
3416 if (!volume_info) 3428 if (!volume_info)
3417 return ERR_PTR(-ENOMEM); 3429 return ERR_PTR(-ENOMEM);
3418 3430
@@ -3537,8 +3549,10 @@ remote_path_check:
3537 rc = -ENOSYS; 3549 rc = -ENOSYS;
3538 goto mount_fail_check; 3550 goto mount_fail_check;
3539 } 3551 }
3540 /* build_path_to_root works only when we have a valid tcon */ 3552 /*
3541 full_path = build_path_to_root(volume_info, cifs_sb, tcon); 3553 * cifs_build_path_to_root works only when we have a valid tcon
3554 */
3555 full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
3542 if (full_path == NULL) { 3556 if (full_path == NULL) {
3543 rc = -ENOMEM; 3557 rc = -ENOMEM;
3544 goto mount_fail_check; 3558 goto mount_fail_check;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 7c0a81283645..8719bbe0dcc3 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -44,6 +44,38 @@ renew_parental_timestamps(struct dentry *direntry)
44 } while (!IS_ROOT(direntry)); 44 } while (!IS_ROOT(direntry));
45} 45}
46 46
47char *
48cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
49 struct cifs_tcon *tcon)
50{
51 int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
52 int dfsplen;
53 char *full_path = NULL;
54
55 /* if no prefix path, simply set path to the root of share to "" */
56 if (pplen == 0) {
57 full_path = kzalloc(1, GFP_KERNEL);
58 return full_path;
59 }
60
61 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
62 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
63 else
64 dfsplen = 0;
65
66 full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
67 if (full_path == NULL)
68 return full_path;
69
70 if (dfsplen)
71 strncpy(full_path, tcon->treeName, dfsplen);
72 full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb);
73 strncpy(full_path + dfsplen + 1, vol->prepath, pplen);
74 convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
75 full_path[dfsplen + pplen] = 0; /* add trailing null */
76 return full_path;
77}
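Editor's note: a hedged worked example of the new cifs_build_path_to_root() above (hypothetical names, backslash as the directory separator):

	/* vol->prepath = "sub/dir", non-DFS mount:
	 *   full_path = "\sub\dir"
	 * DFS mount with tcon->treeName = "\\host\share":
	 *   full_path = "\\host\share\sub\dir"
	 */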
78
47/* Note: caller must free return buffer */ 79/* Note: caller must free return buffer */
48char * 80char *
49build_path_from_dentry(struct dentry *direntry) 81build_path_from_dentry(struct dentry *direntry)
@@ -398,7 +430,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
398 * in network traffic in the other paths. 430 * in network traffic in the other paths.
399 */ 431 */
400 if (!(oflags & O_CREAT)) { 432 if (!(oflags & O_CREAT)) {
401 struct dentry *res = cifs_lookup(inode, direntry, 0); 433 struct dentry *res;
434
435 /*
436 * Check for hashed negative dentry. We have already revalidated
437 * the dentry and it is fine. No need to perform another lookup.
438 */
439 if (!d_unhashed(direntry))
440 return -ENOENT;
441
442 res = cifs_lookup(inode, direntry, 0);
402 if (IS_ERR(res)) 443 if (IS_ERR(res))
403 return PTR_ERR(res); 444 return PTR_ERR(res);
404 445
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index edb25b4bbb95..0a6677ba212b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -505,16 +505,36 @@ out:
505 return rc; 505 return rc;
506} 506}
507 507
508static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
509
508/* 510/*
509 * Try to reacquire byte range locks that were released when session 511 * Try to reacquire byte range locks that were released when session
510 * to server was lost 512 * to server was lost.
511 */ 513 */
512static int cifs_relock_file(struct cifsFileInfo *cifsFile) 514static int
515cifs_relock_file(struct cifsFileInfo *cfile)
513{ 516{
517 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
518 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
519 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
514 int rc = 0; 520 int rc = 0;
515 521
516 /* BB list all locks open on this file and relock */ 522 /* we are going to update can_cache_brlcks here - need a write access */
523 down_write(&cinode->lock_sem);
524 if (cinode->can_cache_brlcks) {
525 /* can cache locks - no need to push them */
526 up_write(&cinode->lock_sem);
527 return rc;
528 }
517 529
530 if (cap_unix(tcon->ses) &&
531 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
532 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
533 rc = cifs_push_posix_locks(cfile);
534 else
535 rc = tcon->ses->server->ops->push_mand_locks(cfile);
536
537 up_write(&cinode->lock_sem);
518 return rc; 538 return rc;
519} 539}
520 540
@@ -739,10 +759,15 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
739 } 759 }
740} 760}
741 761
762#define CIFS_LOCK_OP 0
763#define CIFS_READ_OP 1
764#define CIFS_WRITE_OP 2
765
766/* @rw_check : 0 - lock op, 1 - read op, 2 - write op */
742static bool 767static bool
743cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, 768cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
744 __u64 length, __u8 type, struct cifsFileInfo *cfile, 769 __u64 length, __u8 type, struct cifsFileInfo *cfile,
745 struct cifsLockInfo **conf_lock, bool rw_check) 770 struct cifsLockInfo **conf_lock, int rw_check)
746{ 771{
747 struct cifsLockInfo *li; 772 struct cifsLockInfo *li;
748 struct cifsFileInfo *cur_cfile = fdlocks->cfile; 773 struct cifsFileInfo *cur_cfile = fdlocks->cfile;
@@ -752,9 +777,13 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
752 if (offset + length <= li->offset || 777 if (offset + length <= li->offset ||
753 offset >= li->offset + li->length) 778 offset >= li->offset + li->length)
754 continue; 779 continue;
755 if (rw_check && server->ops->compare_fids(cfile, cur_cfile) && 780 if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid &&
756 current->tgid == li->pid) 781 server->ops->compare_fids(cfile, cur_cfile)) {
757 continue; 782 /* shared lock prevents write op through the same fid */
783 if (!(li->type & server->vals->shared_lock_type) ||
784 rw_check != CIFS_WRITE_OP)
785 continue;
786 }
758 if ((type & server->vals->shared_lock_type) && 787 if ((type & server->vals->shared_lock_type) &&
759 ((server->ops->compare_fids(cfile, cur_cfile) && 788 ((server->ops->compare_fids(cfile, cur_cfile) &&
760 current->tgid == li->pid) || type == li->type)) 789 current->tgid == li->pid) || type == li->type))
@@ -769,7 +798,7 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
769bool 798bool
770cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, 799cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
771 __u8 type, struct cifsLockInfo **conf_lock, 800 __u8 type, struct cifsLockInfo **conf_lock,
772 bool rw_check) 801 int rw_check)
773{ 802{
774 bool rc = false; 803 bool rc = false;
775 struct cifs_fid_locks *cur; 804 struct cifs_fid_locks *cur;
@@ -805,7 +834,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
805 down_read(&cinode->lock_sem); 834 down_read(&cinode->lock_sem);
806 835
807 exist = cifs_find_lock_conflict(cfile, offset, length, type, 836 exist = cifs_find_lock_conflict(cfile, offset, length, type,
808 &conf_lock, false); 837 &conf_lock, CIFS_LOCK_OP);
809 if (exist) { 838 if (exist) {
810 flock->fl_start = conf_lock->offset; 839 flock->fl_start = conf_lock->offset;
811 flock->fl_end = conf_lock->offset + conf_lock->length - 1; 840 flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -852,7 +881,7 @@ try_again:
852 down_write(&cinode->lock_sem); 881 down_write(&cinode->lock_sem);
853 882
854 exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, 883 exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
855 lock->type, &conf_lock, false); 884 lock->type, &conf_lock, CIFS_LOCK_OP);
856 if (!exist && cinode->can_cache_brlcks) { 885 if (!exist && cinode->can_cache_brlcks) {
857 list_add_tail(&lock->llist, &cfile->llist->locks); 886 list_add_tail(&lock->llist, &cfile->llist->locks);
858 up_write(&cinode->lock_sem); 887 up_write(&cinode->lock_sem);
@@ -948,7 +977,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
948 int rc = 0, stored_rc; 977 int rc = 0, stored_rc;
949 struct cifsLockInfo *li, *tmp; 978 struct cifsLockInfo *li, *tmp;
950 struct cifs_tcon *tcon; 979 struct cifs_tcon *tcon;
951 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
952 unsigned int num, max_num, max_buf; 980 unsigned int num, max_num, max_buf;
953 LOCKING_ANDX_RANGE *buf, *cur; 981 LOCKING_ANDX_RANGE *buf, *cur;
954 int types[] = {LOCKING_ANDX_LARGE_FILES, 982 int types[] = {LOCKING_ANDX_LARGE_FILES,
@@ -958,21 +986,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
958 xid = get_xid(); 986 xid = get_xid();
959 tcon = tlink_tcon(cfile->tlink); 987 tcon = tlink_tcon(cfile->tlink);
960 988
961 /* we are going to update can_cache_brlcks here - need a write access */
962 down_write(&cinode->lock_sem);
963 if (!cinode->can_cache_brlcks) {
964 up_write(&cinode->lock_sem);
965 free_xid(xid);
966 return rc;
967 }
968
969 /* 989 /*
970 * Accessing maxBuf is racy with cifs_reconnect - need to store value 990 * Accessing maxBuf is racy with cifs_reconnect - need to store value
971 * and check it for zero before using. 991 * and check it for zero before using.
972 */ 992 */
973 max_buf = tcon->ses->server->maxBuf; 993 max_buf = tcon->ses->server->maxBuf;
974 if (!max_buf) { 994 if (!max_buf) {
975 up_write(&cinode->lock_sem);
976 free_xid(xid); 995 free_xid(xid);
977 return -EINVAL; 996 return -EINVAL;
978 } 997 }
@@ -981,7 +1000,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
981 sizeof(LOCKING_ANDX_RANGE); 1000 sizeof(LOCKING_ANDX_RANGE);
982 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); 1001 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
983 if (!buf) { 1002 if (!buf) {
984 up_write(&cinode->lock_sem);
985 free_xid(xid); 1003 free_xid(xid);
986 return -ENOMEM; 1004 return -ENOMEM;
987 } 1005 }
@@ -1018,9 +1036,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
1018 } 1036 }
1019 } 1037 }
1020 1038
1021 cinode->can_cache_brlcks = false;
1022 up_write(&cinode->lock_sem);
1023
1024 kfree(buf); 1039 kfree(buf);
1025 free_xid(xid); 1040 free_xid(xid);
1026 return rc; 1041 return rc;
@@ -1043,7 +1058,6 @@ struct lock_to_push {
1043static int 1058static int
1044cifs_push_posix_locks(struct cifsFileInfo *cfile) 1059cifs_push_posix_locks(struct cifsFileInfo *cfile)
1045{ 1060{
1046 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
1047 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1061 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1048 struct file_lock *flock, **before; 1062 struct file_lock *flock, **before;
1049 unsigned int count = 0, i = 0; 1063 unsigned int count = 0, i = 0;
@@ -1054,14 +1068,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1054 1068
1055 xid = get_xid(); 1069 xid = get_xid();
1056 1070
1057 /* we are going to update can_cache_brlcks here - need a write access */
1058 down_write(&cinode->lock_sem);
1059 if (!cinode->can_cache_brlcks) {
1060 up_write(&cinode->lock_sem);
1061 free_xid(xid);
1062 return rc;
1063 }
1064
1065 lock_flocks(); 1071 lock_flocks();
1066 cifs_for_each_lock(cfile->dentry->d_inode, before) { 1072 cifs_for_each_lock(cfile->dentry->d_inode, before) {
1067 if ((*before)->fl_flags & FL_POSIX) 1073 if ((*before)->fl_flags & FL_POSIX)
@@ -1127,9 +1133,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1127 } 1133 }
1128 1134
1129out: 1135out:
1130 cinode->can_cache_brlcks = false;
1131 up_write(&cinode->lock_sem);
1132
1133 free_xid(xid); 1136 free_xid(xid);
1134 return rc; 1137 return rc;
1135err_out: 1138err_out:
@@ -1144,14 +1147,27 @@ static int
1144cifs_push_locks(struct cifsFileInfo *cfile) 1147cifs_push_locks(struct cifsFileInfo *cfile)
1145{ 1148{
1146 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); 1149 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
1150 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
1147 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1151 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1152 int rc = 0;
1153
1154 /* we are going to update can_cache_brlcks here - need a write access */
1155 down_write(&cinode->lock_sem);
1156 if (!cinode->can_cache_brlcks) {
1157 up_write(&cinode->lock_sem);
1158 return rc;
1159 }
1148 1160
1149 if (cap_unix(tcon->ses) && 1161 if (cap_unix(tcon->ses) &&
1150 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && 1162 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
1151 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) 1163 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
1152 return cifs_push_posix_locks(cfile); 1164 rc = cifs_push_posix_locks(cfile);
1165 else
1166 rc = tcon->ses->server->ops->push_mand_locks(cfile);
1153 1167
1154 return tcon->ses->server->ops->push_mand_locks(cfile); 1168 cinode->can_cache_brlcks = false;
1169 up_write(&cinode->lock_sem);
1170 return rc;
1155} 1171}
1156 1172
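
This hunk is the other half of the refactor above: the can_cache_brlcks test, the lock_sem write section, and the flag clearing all move into the single caller, so each push helper runs entirely inside one critical section and the flag flips exactly once. A sketch of the pattern with POSIX threads (names are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t lock_sem = PTHREAD_RWLOCK_INITIALIZER;
static bool can_cache_brlcks = true;

static int push_posix_locks(void) { puts("pushing posix locks"); return 0; }
static int push_mand_locks(void)  { puts("pushing mandatory locks"); return 0; }

static int push_locks(bool use_posix)
{
	int rc = 0;

	/* we are going to update can_cache_brlcks - need write access */
	pthread_rwlock_wrlock(&lock_sem);
	if (!can_cache_brlcks) {
		pthread_rwlock_unlock(&lock_sem);
		return rc;			/* already pushed */
	}

	rc = use_posix ? push_posix_locks() : push_mand_locks();

	can_cache_brlcks = false;		/* flips exactly once */
	pthread_rwlock_unlock(&lock_sem);
	return rc;
}

int main(void)
{
	push_locks(true);
	push_locks(true);			/* second call is a no-op */
	return 0;
}

(Build with cc -pthread.)
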
1157static void 1173static void
@@ -1436,16 +1452,18 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
1436 return -ENOMEM; 1452 return -ENOMEM;
1437 1453
1438 rc = cifs_lock_add_if(cfile, lock, wait_flag); 1454 rc = cifs_lock_add_if(cfile, lock, wait_flag);
1439 if (rc < 0) 1455 if (rc < 0) {
1440 kfree(lock); 1456 kfree(lock);
1441 if (rc <= 0) 1457 return rc;
1458 }
1459 if (!rc)
1442 goto out; 1460 goto out;
1443 1461
1444 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, 1462 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
1445 type, 1, 0, wait_flag); 1463 type, 1, 0, wait_flag);
1446 if (rc) { 1464 if (rc) {
1447 kfree(lock); 1465 kfree(lock);
1448 goto out; 1466 return rc;
1449 } 1467 }
1450 1468
1451 cifs_lock_add(cfile, lock); 1469 cifs_lock_add(cfile, lock);
@@ -1794,7 +1812,6 @@ static int cifs_writepages(struct address_space *mapping,
1794 struct TCP_Server_Info *server; 1812 struct TCP_Server_Info *server;
1795 struct page *page; 1813 struct page *page;
1796 int rc = 0; 1814 int rc = 0;
1797 loff_t isize = i_size_read(mapping->host);
1798 1815
1799 /* 1816 /*
1800 * If wsize is smaller than the page cache size, default to writing 1817 * If wsize is smaller than the page cache size, default to writing
@@ -1899,7 +1916,7 @@ retry:
1899 */ 1916 */
1900 set_page_writeback(page); 1917 set_page_writeback(page);
1901 1918
1902 if (page_offset(page) >= isize) { 1919 if (page_offset(page) >= i_size_read(mapping->host)) {
1903 done = true; 1920 done = true;
1904 unlock_page(page); 1921 unlock_page(page);
1905 end_page_writeback(page); 1922 end_page_writeback(page);
@@ -1932,7 +1949,8 @@ retry:
1932 wdata->offset = page_offset(wdata->pages[0]); 1949 wdata->offset = page_offset(wdata->pages[0]);
1933 wdata->pagesz = PAGE_CACHE_SIZE; 1950 wdata->pagesz = PAGE_CACHE_SIZE;
1934 wdata->tailsz = 1951 wdata->tailsz =
1935 min(isize - page_offset(wdata->pages[nr_pages - 1]), 1952 min(i_size_read(mapping->host) -
1953 page_offset(wdata->pages[nr_pages - 1]),
1936 (loff_t)PAGE_CACHE_SIZE); 1954 (loff_t)PAGE_CACHE_SIZE);
1937 wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + 1955 wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
1938 wdata->tailsz; 1956 wdata->tailsz;
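
Both writepages changes replace the isize snapshot taken at entry with fresh i_size_read() calls, since the file can grow or shrink while the loop runs and a stale value would mis-size the tail page. The idea, in userspace terms with illustrative names:

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for i_size_read(): sample the racy size at each use
   instead of caching one copy for the whole loop. */
static _Atomic long long isize = 4096;

static long long i_size_read(void) { return atomic_load(&isize); }

static long long tail_bytes(long long page_off, long long page_size)
{
	long long rem = i_size_read() - page_off;	/* fresh value */

	return rem < page_size ? rem : page_size;
}

int main(void)
{
	printf("tail = %lld\n", tail_bytes(3840, 512));	/* 256 */
	atomic_store(&isize, 8192);			/* file grew */
	printf("tail = %lld\n", tail_bytes(3840, 512));	/* 512 */
	return 0;
}
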
@@ -2085,7 +2103,15 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
2085 } else { 2103 } else {
2086 rc = copied; 2104 rc = copied;
2087 pos += copied; 2105 pos += copied;
2088 set_page_dirty(page); 2106 /*
2107 * When we use strict cache mode and cifs_strict_writev was run
2108 * with level II oplock (indicated by leave_pages_clean field of
2109 * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev
2110 * sent the data to the server itself.
2111 */
2112 if (!CIFS_I(inode)->leave_pages_clean ||
2113 !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO))
2114 set_page_dirty(page);
2089 } 2115 }
2090 2116
2091 if (rc > 0) { 2117 if (rc > 0) {
@@ -2436,8 +2462,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
2436} 2462}
2437 2463
2438static ssize_t 2464static ssize_t
2439cifs_writev(struct kiocb *iocb, const struct iovec *iov, 2465cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov,
2440 unsigned long nr_segs, loff_t pos) 2466 unsigned long nr_segs, loff_t pos, bool cache_ex)
2441{ 2467{
2442 struct file *file = iocb->ki_filp; 2468 struct file *file = iocb->ki_filp;
2443 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; 2469 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
@@ -2457,10 +2483,14 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
2457 down_read(&cinode->lock_sem); 2483 down_read(&cinode->lock_sem);
2458 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), 2484 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
2459 server->vals->exclusive_lock_type, NULL, 2485 server->vals->exclusive_lock_type, NULL,
2460 true)) { 2486 CIFS_WRITE_OP)) {
2461 mutex_lock(&inode->i_mutex); 2487 mutex_lock(&inode->i_mutex);
2488 if (!cache_ex)
2489 cinode->leave_pages_clean = true;
2462 rc = __generic_file_aio_write(iocb, iov, nr_segs, 2490 rc = __generic_file_aio_write(iocb, iov, nr_segs,
2463 &iocb->ki_pos); 2491 &iocb->ki_pos);
2492 if (!cache_ex)
2493 cinode->leave_pages_clean = false;
2464 mutex_unlock(&inode->i_mutex); 2494 mutex_unlock(&inode->i_mutex);
2465 } 2495 }
2466 2496
@@ -2487,42 +2517,62 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
2487 struct cifsFileInfo *cfile = (struct cifsFileInfo *) 2517 struct cifsFileInfo *cfile = (struct cifsFileInfo *)
2488 iocb->ki_filp->private_data; 2518 iocb->ki_filp->private_data;
2489 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 2519 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
2490 2520 ssize_t written, written2;
2491#ifdef CONFIG_CIFS_SMB2
2492 /* 2521 /*
2493 * If we have an oplock for read and want to write data to the file 2522 * We need to store clientCanCacheAll here to prevent race
2494 * we need to store it in the page cache and then push it to the server 2523 * conditions - this value can be changed during execution
2495 * to be sure the next read will get valid data. 2524 * of generic_file_aio_write. For CIFS it can be changed from
2525 * true to false only, but for SMB2 it can change in either
2526 * direction. So we can end up with data
2527 * stored in the cache, not marked dirty and not sent to the
2528 * server if this value changes its state from false to true
2529 * after cifs_write_end.
2496 */ 2530 */
2497 if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) { 2531 bool cache_ex = cinode->clientCanCacheAll;
2498 ssize_t written; 2532 bool cache_read = cinode->clientCanCacheRead;
2499 int rc; 2533 int rc;
2500 2534 loff_t saved_pos;
2501 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
2502 rc = filemap_fdatawrite(inode->i_mapping);
2503 if (rc)
2504 return (ssize_t)rc;
2505 2535
2506 return written; 2536 if (cache_ex) {
2537 if (cap_unix(tcon->ses) &&
2538 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) &&
2539 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(
2540 tcon->fsUnixInfo.Capability)))
2541 return generic_file_aio_write(iocb, iov, nr_segs, pos);
2542 return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex);
2507 } 2543 }
2508#endif
2509 2544
2510 /* 2545 /*
2511 * For non-oplocked files in strict cache mode we need to write the data 2546 * For files without exclusive oplock in strict cache mode we need to
2512 * to the server exactly from pos to pos+len-1 rather than flushing all 2547 * write the data to the server exactly from pos to pos+len-1 rather
2513 * affected pages because it may cause an error with mandatory locks on 2548 * than flushing all affected pages because it may cause an error with
2514 * these pages but not on the region from pos to pos+len-1. 2549 * mandatory locks on these pages but not on the region from pos to
2550 * pos+len-1.
2515 */ 2551 */
2552 written = cifs_user_writev(iocb, iov, nr_segs, pos);
2553 if (!cache_read || written <= 0)
2554 return written;
2516 2555
2517 if (!cinode->clientCanCacheAll) 2556 saved_pos = iocb->ki_pos;
2518 return cifs_user_writev(iocb, iov, nr_segs, pos); 2557 iocb->ki_pos = pos;
2519 2558 /* we have a read oplock - need to store the data in the page cache */
2520 if (cap_unix(tcon->ses) && 2559 if (cap_unix(tcon->ses) &&
2521 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && 2560 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) &&
2522 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) 2561 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(
2523 return generic_file_aio_write(iocb, iov, nr_segs, pos); 2562 tcon->fsUnixInfo.Capability)))
2524 2563 written2 = generic_file_aio_write(iocb, iov, nr_segs, pos);
2525 return cifs_writev(iocb, iov, nr_segs, pos); 2564 else
2565 written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos,
2566 cache_ex);
2567 /* errors occurred during writing - invalidate the page cache */
2568 if (written2 < 0) {
2569 rc = cifs_invalidate_mapping(inode);
2570 if (rc)
2571 written = (ssize_t)rc;
2572 else
2573 iocb->ki_pos = saved_pos;
2574 }
2575 return written;
2526} 2576}
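
The rewritten path writes through to the server first (cifs_user_writev), and only when a read oplock is held replays the bytes into the page cache, with leave_pages_clean keeping cifs_write_end from re-dirtying pages that were already sent; if the replay fails, the mapping is invalidated rather than left stale. A toy model of that control flow (every name here is illustrative):

#include <stdio.h>
#include <string.h>

static char server[64], cache[64];
static int cache_valid = 1;

static int server_write(const char *buf, size_t len, size_t pos)
{
	memcpy(server + pos, buf, len);
	return 0;
}

static int cache_write(const char *buf, size_t len, size_t pos)
{
	memcpy(cache + pos, buf, len);	/* pages stay "clean": already sent */
	return 0;			/* return -1 to simulate failure */
}

static int strict_write(const char *buf, size_t len, size_t pos,
			int have_read_oplock)
{
	if (server_write(buf, len, pos))
		return -1;
	if (!have_read_oplock)
		return 0;		/* nothing cached, nothing to fix */
	if (cache_write(buf, len, pos))
		cache_valid = 0;	/* invalidate instead of diverging */
	return 0;
}

int main(void)
{
	strict_write("hello", 5, 0, 1);
	printf("server=%.5s cache=%.5s valid=%d\n", server, cache, cache_valid);
	return 0;
}
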
2527 2577
2528static struct cifs_readdata * 2578static struct cifs_readdata *
@@ -2892,7 +2942,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
2892 down_read(&cinode->lock_sem); 2942 down_read(&cinode->lock_sem);
2893 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), 2943 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
2894 tcon->ses->server->vals->shared_lock_type, 2944 tcon->ses->server->vals->shared_lock_type,
2895 NULL, true)) 2945 NULL, CIFS_READ_OP))
2896 rc = generic_file_aio_read(iocb, iov, nr_segs, pos); 2946 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
2897 up_read(&cinode->lock_sem); 2947 up_read(&cinode->lock_sem);
2898 return rc; 2948 return rc;
@@ -3536,7 +3586,7 @@ void cifs_oplock_break(struct work_struct *work)
3536 if (cinode->clientCanCacheRead == 0) { 3586 if (cinode->clientCanCacheRead == 0) {
3537 rc = filemap_fdatawait(inode->i_mapping); 3587 rc = filemap_fdatawait(inode->i_mapping);
3538 mapping_set_error(inode->i_mapping, rc); 3588 mapping_set_error(inode->i_mapping, rc);
3539 invalidate_remote_inode(inode); 3589 cifs_invalidate_mapping(inode);
3540 } 3590 }
3541 cFYI(1, "Oplock flush inode %p rc %d", inode, rc); 3591 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
3542 } 3592 }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index afdff79651f1..ed6208ff85a7 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1791,11 +1791,12 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1791 stat->ino = CIFS_I(inode)->uniqueid; 1791 stat->ino = CIFS_I(inode)->uniqueid;
1792 1792
1793 /* 1793 /*
1794 * If on a multiuser mount without unix extensions, and the admin hasn't 1794 * If on a multiuser mount without unix extensions or cifsacl being
1795 * overridden them, set the ownership to the fsuid/fsgid of the current 1795 * enabled, and the admin hasn't overridden them, set the ownership
1796 * process. 1796 * to the fsuid/fsgid of the current process.
1797 */ 1797 */
1798 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && 1798 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
1799 !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
1799 !tcon->unix_ext) { 1800 !tcon->unix_ext) {
1800 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) 1801 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
1801 stat->uid = current_fsuid(); 1802 stat->uid = current_fsuid();
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d5ce9e26696c..a82bc51fdc82 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -204,7 +204,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
204 return rc; 204 return rc;
205} 205}
206 206
207int 207void
208cifs_set_port(struct sockaddr *addr, const unsigned short int port) 208cifs_set_port(struct sockaddr *addr, const unsigned short int port)
209{ 209{
210 switch (addr->sa_family) { 210 switch (addr->sa_family) {
@@ -214,19 +214,7 @@ cifs_set_port(struct sockaddr *addr, const unsigned short int port)
214 case AF_INET6: 214 case AF_INET6:
215 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); 215 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
216 break; 216 break;
217 default:
218 return 0;
219 } 217 }
220 return 1;
221}
222
223int
224cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
225 const unsigned short int port)
226{
227 if (!cifs_convert_address(dst, src, len))
228 return 0;
229 return cifs_set_port(dst, port);
230} 218}
231 219
232/***************************************************************************** 220/*****************************************************************************
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f9b5d3d6cf33..6002fdc920ae 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -66,18 +66,20 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
66#endif /* DEBUG2 */ 66#endif /* DEBUG2 */
67 67
68/* 68/*
69 * Attempt to preload the dcache with the FIND_FIRST/NEXT results
70 *
69 * Find the dentry that matches "name". If there isn't one, create one. If it's 71 * Find the dentry that matches "name". If there isn't one, create one. If it's
70 * a negative dentry or the uniqueid changed, then drop it and recreate it. 72 * a negative dentry or the uniqueid changed, then drop it and recreate it.
71 */ 73 */
72static struct dentry * 74static void
73cifs_readdir_lookup(struct dentry *parent, struct qstr *name, 75cifs_prime_dcache(struct dentry *parent, struct qstr *name,
74 struct cifs_fattr *fattr) 76 struct cifs_fattr *fattr)
75{ 77{
76 struct dentry *dentry, *alias; 78 struct dentry *dentry, *alias;
77 struct inode *inode; 79 struct inode *inode;
78 struct super_block *sb = parent->d_inode->i_sb; 80 struct super_block *sb = parent->d_inode->i_sb;
79 81
80 cFYI(1, "For %s", name->name); 82 cFYI(1, "%s: for %s", __func__, name->name);
81 83
82 if (parent->d_op && parent->d_op->d_hash) 84 if (parent->d_op && parent->d_op->d_hash)
83 parent->d_op->d_hash(parent, parent->d_inode, name); 85 parent->d_op->d_hash(parent, parent->d_inode, name);
@@ -86,35 +88,33 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
86 88
87 dentry = d_lookup(parent, name); 89 dentry = d_lookup(parent, name);
88 if (dentry) { 90 if (dentry) {
91 int err;
92
89 inode = dentry->d_inode; 93 inode = dentry->d_inode;
90 /* update inode in place if i_ino didn't change */ 94 /* update inode in place if i_ino didn't change */
91 if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { 95 if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
92 cifs_fattr_to_inode(inode, fattr); 96 cifs_fattr_to_inode(inode, fattr);
93 return dentry; 97 goto out;
94 } 98 }
95 d_drop(dentry); 99 err = d_invalidate(dentry);
96 dput(dentry); 100 dput(dentry);
101 if (err)
102 return;
97 } 103 }
98 104
99 dentry = d_alloc(parent, name); 105 dentry = d_alloc(parent, name);
100 if (dentry == NULL) 106 if (!dentry)
101 return NULL; 107 return;
102 108
103 inode = cifs_iget(sb, fattr); 109 inode = cifs_iget(sb, fattr);
104 if (!inode) { 110 if (!inode)
105 dput(dentry); 111 goto out;
106 return NULL;
107 }
108 112
109 alias = d_materialise_unique(dentry, inode); 113 alias = d_materialise_unique(dentry, inode);
110 if (alias != NULL) { 114 if (alias && !IS_ERR(alias))
111 dput(dentry); 115 dput(alias);
112 if (IS_ERR(alias)) 116out:
113 return NULL; 117 dput(dentry);
114 dentry = alias;
115 }
116
117 return dentry;
118} 118}
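
cifs_prime_dcache keeps the same lookup-reuse-or-rebuild shape as the old helper but no longer returns the dentry, since the caller only wanted the side effect. The cache discipline it implements, as a toy single-slot model (illustrative names):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry { char name[32]; unsigned long long uniqueid; };

static struct entry *slot;		/* one-slot "dcache" */

static void prime_cache(const char *name, unsigned long long id)
{
	if (slot && strcmp(slot->name, name) == 0) {
		if (slot->uniqueid == id)
			return;		/* still valid: reuse in place */
		free(slot);		/* id changed: drop and recreate */
		slot = NULL;
	}
	slot = malloc(sizeof(*slot));
	if (!slot)
		return;			/* best effort, like the fs code */
	snprintf(slot->name, sizeof(slot->name), "%s", name);
	slot->uniqueid = id;
}

int main(void)
{
	prime_cache("file.txt", 1001);
	prime_cache("file.txt", 1001);	/* hit, reused */
	prime_cache("file.txt", 2002);	/* id changed, recreated */
	printf("cached %s id %llu\n", slot->name, slot->uniqueid);
	return 0;
}
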
119 119
120static void 120static void
@@ -134,6 +134,16 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
134 if (fattr->cf_cifsattrs & ATTR_READONLY) 134 if (fattr->cf_cifsattrs & ATTR_READONLY)
135 fattr->cf_mode &= ~S_IWUGO; 135 fattr->cf_mode &= ~S_IWUGO;
136 136
137 /*
138 * We of course don't get ACL info in FIND_FIRST/NEXT results, so
139 * mark it for revalidation so that "ls -l" will look right. It might
140 * be super-slow, but if we don't do this then the ownership of files
141 * may look wrong since the inodes may not have timed out by the time
142 * "ls" does a stat() call on them.
143 */
144 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
145 fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
146
137 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && 147 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL &&
138 fattr->cf_cifsattrs & ATTR_SYSTEM) { 148 fattr->cf_cifsattrs & ATTR_SYSTEM) {
139 if (fattr->cf_eof == 0) { 149 if (fattr->cf_eof == 0) {
@@ -649,7 +659,6 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
649 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 659 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
650 struct cifs_dirent de = { NULL, }; 660 struct cifs_dirent de = { NULL, };
651 struct cifs_fattr fattr; 661 struct cifs_fattr fattr;
652 struct dentry *dentry;
653 struct qstr name; 662 struct qstr name;
654 int rc = 0; 663 int rc = 0;
655 ino_t ino; 664 ino_t ino;
@@ -720,13 +729,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
720 */ 729 */
721 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; 730 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
722 731
723 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); 732 cifs_prime_dcache(file->f_dentry, &name, &fattr);
724 dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr);
725 733
734 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
726 rc = filldir(dirent, name.name, name.len, file->f_pos, ino, 735 rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
727 fattr.cf_dtype); 736 fattr.cf_dtype);
728
729 dput(dentry);
730 return rc; 737 return rc;
731} 738}
732 739
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 56cc4be87807..a5d234c8d5d9 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -575,37 +575,6 @@ cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
575 return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); 575 return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data);
576} 576}
577 577
578static char *
579cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
580 struct cifs_tcon *tcon)
581{
582 int pplen = vol->prepath ? strlen(vol->prepath) : 0;
583 int dfsplen;
584 char *full_path = NULL;
585
586 /* if no prefix path, simply set path to the root of share to "" */
587 if (pplen == 0) {
588 full_path = kzalloc(1, GFP_KERNEL);
589 return full_path;
590 }
591
592 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
593 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
594 else
595 dfsplen = 0;
596
597 full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
598 if (full_path == NULL)
599 return full_path;
600
601 if (dfsplen)
602 strncpy(full_path, tcon->treeName, dfsplen);
603 strncpy(full_path + dfsplen, vol->prepath, pplen);
604 convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
605 full_path[dfsplen + pplen] = 0; /* add trailing null */
606 return full_path;
607}
608
609static void 578static void
610cifs_clear_stats(struct cifs_tcon *tcon) 579cifs_clear_stats(struct cifs_tcon *tcon)
611{ 580{
@@ -766,7 +735,6 @@ smb_set_file_info(struct inode *inode, const char *full_path,
766 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 735 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
767 struct tcon_link *tlink = NULL; 736 struct tcon_link *tlink = NULL;
768 struct cifs_tcon *tcon; 737 struct cifs_tcon *tcon;
769 FILE_BASIC_INFO info_buf;
770 738
771 /* if the file is already open for write, just use that fileid */ 739 /* if the file is already open for write, just use that fileid */
772 open_file = find_writable_file(cinode, true); 740 open_file = find_writable_file(cinode, true);
@@ -817,7 +785,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
817 netpid = current->tgid; 785 netpid = current->tgid;
818 786
819set_via_filehandle: 787set_via_filehandle:
820 rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid); 788 rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid);
821 if (!rc) 789 if (!rc)
822 cinode->cifsAttrs = le32_to_cpu(buf->Attributes); 790 cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
823 791
@@ -944,7 +912,6 @@ struct smb_version_operations smb1_operations = {
944 .set_path_size = CIFSSMBSetEOF, 912 .set_path_size = CIFSSMBSetEOF,
945 .set_file_size = CIFSSMBSetFileSize, 913 .set_file_size = CIFSSMBSetFileSize,
946 .set_file_info = smb_set_file_info, 914 .set_file_info = smb_set_file_info,
947 .build_path_to_root = cifs_build_path_to_root,
948 .echo = CIFSSMBEcho, 915 .echo = CIFSSMBEcho,
949 .mkdir = CIFSSMBMkDir, 916 .mkdir = CIFSSMBMkDir,
950 .mkdir_setinfo = cifs_mkdir_setinfo, 917 .mkdir_setinfo = cifs_mkdir_setinfo,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index a93eec30a50d..71e6aed4b382 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -260,13 +260,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
260 struct cifs_fid_locks *fdlocks; 260 struct cifs_fid_locks *fdlocks;
261 261
262 xid = get_xid(); 262 xid = get_xid();
263 /* we are going to update can_cache_brlcks here - need a write access */
264 down_write(&cinode->lock_sem);
265 if (!cinode->can_cache_brlcks) {
266 up_write(&cinode->lock_sem);
267 free_xid(xid);
268 return rc;
269 }
270 263
271 /* 264 /*
272 * Accessing maxBuf is racy with cifs_reconnect - need to store value 265 * Accessing maxBuf is racy with cifs_reconnect - need to store value
@@ -274,7 +267,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
274 */ 267 */
275 max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; 268 max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf;
276 if (!max_buf) { 269 if (!max_buf) {
277 up_write(&cinode->lock_sem);
278 free_xid(xid); 270 free_xid(xid);
279 return -EINVAL; 271 return -EINVAL;
280 } 272 }
@@ -282,7 +274,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
282 max_num = max_buf / sizeof(struct smb2_lock_element); 274 max_num = max_buf / sizeof(struct smb2_lock_element);
283 buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); 275 buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
284 if (!buf) { 276 if (!buf) {
285 up_write(&cinode->lock_sem);
286 free_xid(xid); 277 free_xid(xid);
287 return -ENOMEM; 278 return -ENOMEM;
288 } 279 }
@@ -293,10 +284,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
293 rc = stored_rc; 284 rc = stored_rc;
294 } 285 }
295 286
296 cinode->can_cache_brlcks = false;
297 kfree(buf); 287 kfree(buf);
298
299 up_write(&cinode->lock_sem);
300 free_xid(xid); 288 free_xid(xid);
301 return rc; 289 return rc;
302} 290}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 4d9dbe0b7385..d79de7bc4435 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -262,23 +262,6 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
262 return rc; 262 return rc;
263} 263}
264 264
265static char *
266smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
267 struct cifs_tcon *tcon)
268{
269 int pplen = vol->prepath ? strlen(vol->prepath) : 0;
270 char *full_path = NULL;
271
272 /* if no prefix path, simply set path to the root of share to "" */
273 if (pplen == 0) {
274 full_path = kzalloc(2, GFP_KERNEL);
275 return full_path;
276 }
277
278 cERROR(1, "prefixpath is not supported for SMB2 now");
279 return NULL;
280}
281
282static bool 265static bool
283smb2_can_echo(struct TCP_Server_Info *server) 266smb2_can_echo(struct TCP_Server_Info *server)
284{ 267{
@@ -613,7 +596,6 @@ struct smb_version_operations smb21_operations = {
613 .set_path_size = smb2_set_path_size, 596 .set_path_size = smb2_set_path_size,
614 .set_file_size = smb2_set_file_size, 597 .set_file_size = smb2_set_file_size,
615 .set_file_info = smb2_set_file_info, 598 .set_file_info = smb2_set_file_info,
616 .build_path_to_root = smb2_build_path_to_root,
617 .mkdir = smb2_mkdir, 599 .mkdir = smb2_mkdir,
618 .mkdir_setinfo = smb2_mkdir_setinfo, 600 .mkdir_setinfo = smb2_mkdir_setinfo,
619 .rmdir = smb2_rmdir, 601 .rmdir = smb2_rmdir,
@@ -641,6 +623,91 @@ struct smb_version_operations smb21_operations = {
641 .get_lease_key = smb2_get_lease_key, 623 .get_lease_key = smb2_get_lease_key,
642 .set_lease_key = smb2_set_lease_key, 624 .set_lease_key = smb2_set_lease_key,
643 .new_lease_key = smb2_new_lease_key, 625 .new_lease_key = smb2_new_lease_key,
626 .calc_signature = smb2_calc_signature,
627};
628
629
630struct smb_version_operations smb30_operations = {
631 .compare_fids = smb2_compare_fids,
632 .setup_request = smb2_setup_request,
633 .setup_async_request = smb2_setup_async_request,
634 .check_receive = smb2_check_receive,
635 .add_credits = smb2_add_credits,
636 .set_credits = smb2_set_credits,
637 .get_credits_field = smb2_get_credits_field,
638 .get_credits = smb2_get_credits,
639 .get_next_mid = smb2_get_next_mid,
640 .read_data_offset = smb2_read_data_offset,
641 .read_data_length = smb2_read_data_length,
642 .map_error = map_smb2_to_linux_error,
643 .find_mid = smb2_find_mid,
644 .check_message = smb2_check_message,
645 .dump_detail = smb2_dump_detail,
646 .clear_stats = smb2_clear_stats,
647 .print_stats = smb2_print_stats,
648 .is_oplock_break = smb2_is_valid_oplock_break,
649 .need_neg = smb2_need_neg,
650 .negotiate = smb2_negotiate,
651 .negotiate_wsize = smb2_negotiate_wsize,
652 .negotiate_rsize = smb2_negotiate_rsize,
653 .sess_setup = SMB2_sess_setup,
654 .logoff = SMB2_logoff,
655 .tree_connect = SMB2_tcon,
656 .tree_disconnect = SMB2_tdis,
657 .is_path_accessible = smb2_is_path_accessible,
658 .can_echo = smb2_can_echo,
659 .echo = SMB2_echo,
660 .query_path_info = smb2_query_path_info,
661 .get_srv_inum = smb2_get_srv_inum,
662 .query_file_info = smb2_query_file_info,
663 .set_path_size = smb2_set_path_size,
664 .set_file_size = smb2_set_file_size,
665 .set_file_info = smb2_set_file_info,
666 .mkdir = smb2_mkdir,
667 .mkdir_setinfo = smb2_mkdir_setinfo,
668 .rmdir = smb2_rmdir,
669 .unlink = smb2_unlink,
670 .rename = smb2_rename_path,
671 .create_hardlink = smb2_create_hardlink,
672 .open = smb2_open_file,
673 .set_fid = smb2_set_fid,
674 .close = smb2_close_file,
675 .flush = smb2_flush_file,
676 .async_readv = smb2_async_readv,
677 .async_writev = smb2_async_writev,
678 .sync_read = smb2_sync_read,
679 .sync_write = smb2_sync_write,
680 .query_dir_first = smb2_query_dir_first,
681 .query_dir_next = smb2_query_dir_next,
682 .close_dir = smb2_close_dir,
683 .calc_smb_size = smb2_calc_size,
684 .is_status_pending = smb2_is_status_pending,
685 .oplock_response = smb2_oplock_response,
686 .queryfs = smb2_queryfs,
687 .mand_lock = smb2_mand_lock,
688 .mand_unlock_range = smb2_unlock_range,
689 .push_mand_locks = smb2_push_mandatory_locks,
690 .get_lease_key = smb2_get_lease_key,
691 .set_lease_key = smb2_set_lease_key,
692 .new_lease_key = smb2_new_lease_key,
693 .calc_signature = smb3_calc_signature,
694};
695
696struct smb_version_values smb20_values = {
697 .version_string = SMB20_VERSION_STRING,
698 .protocol_id = SMB20_PROT_ID,
699 .req_capabilities = 0, /* MBZ */
700 .large_lock_type = 0,
701 .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
702 .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
703 .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
704 .header_size = sizeof(struct smb2_hdr),
705 .max_header_size = MAX_SMB2_HDR_SIZE,
706 .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
707 .lock_cmd = SMB2_LOCK,
708 .cap_unix = 0,
709 .cap_nt_find = SMB2_NT_FIND,
710 .cap_large_files = SMB2_LARGE_FILES,
644}; 711};
645 712
646struct smb_version_values smb21_values = { 713struct smb_version_values smb21_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index cf33622cdac8..41d9d0725f0f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -425,7 +425,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
425 } 425 }
426 426
427 cFYI(1, "sec_flags 0x%x", sec_flags); 427 cFYI(1, "sec_flags 0x%x", sec_flags);
428 if (sec_flags & CIFSSEC_MUST_SIGN) { 428 if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
429 cFYI(1, "Signing required"); 429 cFYI(1, "Signing required");
430 if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | 430 if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
431 SMB2_NEGOTIATE_SIGNING_ENABLED))) { 431 SMB2_NEGOTIATE_SIGNING_ENABLED))) {
@@ -612,7 +612,8 @@ ssetup_ntlmssp_authenticate:
612 612
613 /* BB add code to build os and lm fields */ 613 /* BB add code to build os and lm fields */
614 614
615 rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, CIFS_LOG_ERROR); 615 rc = SendReceive2(xid, ses, iov, 2, &resp_buftype,
616 CIFS_LOG_ERROR | CIFS_NEG_OP);
616 617
617 kfree(security_blob); 618 kfree(security_blob);
618 rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; 619 rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 7d25f8b14f93..2aa3535e38ce 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -47,6 +47,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
47 struct smb_rqst *rqst); 47 struct smb_rqst *rqst);
48extern struct mid_q_entry *smb2_setup_async_request( 48extern struct mid_q_entry *smb2_setup_async_request(
49 struct TCP_Server_Info *server, struct smb_rqst *rqst); 49 struct TCP_Server_Info *server, struct smb_rqst *rqst);
50extern int smb2_calc_signature(struct smb_rqst *rqst,
51 struct TCP_Server_Info *server);
52extern int smb3_calc_signature(struct smb_rqst *rqst,
53 struct TCP_Server_Info *server);
50extern void smb2_echo_request(struct work_struct *work); 54extern void smb2_echo_request(struct work_struct *work);
51extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); 55extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
52extern __u8 smb2_map_lease_to_oplock(__le32 lease_state); 56extern __u8 smb2_map_lease_to_oplock(__le32 lease_state);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 2a5fdf26f79f..8dd73e61d762 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -39,7 +39,7 @@
39#include "smb2status.h" 39#include "smb2status.h"
40#include "smb2glob.h" 40#include "smb2glob.h"
41 41
42static int 42int
43smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) 43smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
44{ 44{
45 int i, rc; 45 int i, rc;
@@ -116,6 +116,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
116 return rc; 116 return rc;
117} 117}
118 118
119int
120smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
121{
122 cFYI(1, "smb3 signatures not supported yet");
123 return -EOPNOTSUPP;
124}
125
119/* must be called with server->srv_mutex held */ 126/* must be called with server->srv_mutex held */
120static int 127static int
121smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) 128smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
@@ -132,7 +139,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
132 return rc; 139 return rc;
133 } 140 }
134 141
135 rc = smb2_calc_signature(rqst, server); 142 rc = server->ops->calc_signature(rqst, server);
136 143
137 return rc; 144 return rc;
138} 145}
@@ -168,7 +175,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
168 memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE); 175 memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE);
169 176
170 mutex_lock(&server->srv_mutex); 177 mutex_lock(&server->srv_mutex);
171 rc = smb2_calc_signature(rqst, server); 178 rc = server->ops->calc_signature(rqst, server);
172 mutex_unlock(&server->srv_mutex); 179 mutex_unlock(&server->srv_mutex);
173 180
174 if (rc) 181 if (rc)
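
Routing signing through server->ops->calc_signature lets each dialect plug in its own routine: smb2_calc_signature for 2.x and, for now, an smb3_calc_signature stub returning -EOPNOTSUPP. The dispatch shape, reduced to a self-contained sketch (the structs and names are mine):

#include <stdio.h>

struct rqst { const char *data; };

struct version_ops {
	const char *name;
	int (*calc_signature)(struct rqst *r);
};

static int smb2_calc_signature(struct rqst *r)
{
	printf("smb2-style signing of %s\n", r->data);
	return 0;
}

static int smb3_calc_signature(struct rqst *r)
{
	(void)r;
	puts("smb3 signatures not supported yet");
	return -95;			/* -EOPNOTSUPP on Linux */
}

static const struct version_ops smb21_ops = { "2.1", smb2_calc_signature };
static const struct version_ops smb30_ops = { "3.0", smb3_calc_signature };

static int sign_rqst(const struct version_ops *ops, struct rqst *r)
{
	return ops->calc_signature(r);	/* dialect-specific hook */
}

int main(void)
{
	struct rqst r = { "payload" };

	sign_rqst(&smb21_ops, &r);
	sign_rqst(&smb30_ops, &r);
	return 0;
}
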
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 4c6285fff598..e2f57a007029 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -844,6 +844,9 @@ COMPATIBLE_IOCTL(TIOCGDEV)
844COMPATIBLE_IOCTL(TIOCCBRK) 844COMPATIBLE_IOCTL(TIOCCBRK)
845COMPATIBLE_IOCTL(TIOCGSID) 845COMPATIBLE_IOCTL(TIOCGSID)
846COMPATIBLE_IOCTL(TIOCGICOUNT) 846COMPATIBLE_IOCTL(TIOCGICOUNT)
847COMPATIBLE_IOCTL(TIOCGPKT)
848COMPATIBLE_IOCTL(TIOCGPTLCK)
849COMPATIBLE_IOCTL(TIOCGEXCL)
847/* Little t */ 850/* Little t */
848COMPATIBLE_IOCTL(TIOCGETD) 851COMPATIBLE_IOCTL(TIOCGETD)
849COMPATIBLE_IOCTL(TIOCSETD) 852COMPATIBLE_IOCTL(TIOCSETD)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7414ae24a79b..712b10f64c70 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1613,12 +1613,12 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
1613 return 0; 1613 return 0;
1614} 1614}
1615 1615
1616static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) 1616static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
1617{ 1617{
1618 struct dentry * dentry = file->f_path.dentry; 1618 struct dentry * dentry = file->f_path.dentry;
1619 1619
1620 mutex_lock(&dentry->d_inode->i_mutex); 1620 mutex_lock(&dentry->d_inode->i_mutex);
1621 switch (origin) { 1621 switch (whence) {
1622 case 1: 1622 case 1:
1623 offset += file->f_pos; 1623 offset += file->f_pos;
1624 case 0: 1624 case 0:
diff --git a/fs/coredump.c b/fs/coredump.c
index ce47379bfa61..177493272a61 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -458,7 +458,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
458 return err; 458 return err;
459} 459}
460 460
461void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) 461void do_coredump(siginfo_t *siginfo)
462{ 462{
463 struct core_state core_state; 463 struct core_state core_state;
464 struct core_name cn; 464 struct core_name cn;
@@ -474,7 +474,7 @@ void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
474 static atomic_t core_dump_count = ATOMIC_INIT(0); 474 static atomic_t core_dump_count = ATOMIC_INIT(0);
475 struct coredump_params cprm = { 475 struct coredump_params cprm = {
476 .siginfo = siginfo, 476 .siginfo = siginfo,
477 .regs = regs, 477 .regs = signal_pt_regs(),
478 .limit = rlimit(RLIMIT_CORE), 478 .limit = rlimit(RLIMIT_CORE),
479 /* 479 /*
480 * We must use the same mm->flags while dumping core to avoid 480 * We must use the same mm->flags while dumping core to avoid
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b607d92cdf24..153bb1e42e63 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -59,7 +59,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev
59 case S_IFDIR: 59 case S_IFDIR:
60 inode->i_op = &simple_dir_inode_operations; 60 inode->i_op = &simple_dir_inode_operations;
61 inode->i_fop = &simple_dir_operations; 61 inode->i_fop = &simple_dir_operations;
62 inode->i_private = NULL;
63 62
64 /* directory inodes start off with i_nlink == 2 63 /* directory inodes start off with i_nlink == 2
65 * (for "." entry) */ 64 * (for "." entry) */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14afbabe6546..472e6befc54d 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -545,37 +545,38 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
545 mutex_unlock(&allocated_ptys_lock); 545 mutex_unlock(&allocated_ptys_lock);
546} 546}
547 547
548int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) 548/**
549 * devpts_pty_new -- create a new inode in /dev/pts/
550 * @ptmx_inode: inode of the master
551 * @device: major+minor of the node to be created
552 * @index: used as a name of the node
553 * @priv: what's given back by devpts_get_priv
554 *
555 * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
556 */
557struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
558 void *priv)
549{ 559{
550 /* tty layer puts index from devpts_new_index() in here */
551 int number = tty->index;
552 struct tty_driver *driver = tty->driver;
553 dev_t device = MKDEV(driver->major, driver->minor_start+number);
554 struct dentry *dentry; 560 struct dentry *dentry;
555 struct super_block *sb = pts_sb_from_inode(ptmx_inode); 561 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
556 struct inode *inode = new_inode(sb); 562 struct inode *inode;
557 struct dentry *root = sb->s_root; 563 struct dentry *root = sb->s_root;
558 struct pts_fs_info *fsi = DEVPTS_SB(sb); 564 struct pts_fs_info *fsi = DEVPTS_SB(sb);
559 struct pts_mount_opts *opts = &fsi->mount_opts; 565 struct pts_mount_opts *opts = &fsi->mount_opts;
560 int ret = 0;
561 char s[12]; 566 char s[12];
562 567
563 /* We're supposed to be given the slave end of a pty */ 568 inode = new_inode(sb);
564 BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY);
565 BUG_ON(driver->subtype != PTY_TYPE_SLAVE);
566
567 if (!inode) 569 if (!inode)
568 return -ENOMEM; 570 return ERR_PTR(-ENOMEM);
569 571
570 inode->i_ino = number + 3; 572 inode->i_ino = index + 3;
571 inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); 573 inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
572 inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); 574 inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
573 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 575 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
574 init_special_inode(inode, S_IFCHR|opts->mode, device); 576 init_special_inode(inode, S_IFCHR|opts->mode, device);
575 inode->i_private = tty; 577 inode->i_private = priv;
576 tty->driver_data = inode;
577 578
578 sprintf(s, "%d", number); 579 sprintf(s, "%d", index);
579 580
580 mutex_lock(&root->d_inode->i_mutex); 581 mutex_lock(&root->d_inode->i_mutex);
581 582
@@ -585,18 +586,24 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
585 fsnotify_create(root->d_inode, dentry); 586 fsnotify_create(root->d_inode, dentry);
586 } else { 587 } else {
587 iput(inode); 588 iput(inode);
588 ret = -ENOMEM; 589 inode = ERR_PTR(-ENOMEM);
589 } 590 }
590 591
591 mutex_unlock(&root->d_inode->i_mutex); 592 mutex_unlock(&root->d_inode->i_mutex);
592 593
593 return ret; 594 return inode;
594} 595}
595 596
596struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) 597/**
598 * devpts_get_priv -- get private data for a slave
599 * @pts_inode: inode of the slave
600 *
601 * Returns whatever was passed as priv in devpts_pty_new for a given inode.
602 */
603void *devpts_get_priv(struct inode *pts_inode)
597{ 604{
598 struct dentry *dentry; 605 struct dentry *dentry;
599 struct tty_struct *tty; 606 void *priv = NULL;
600 607
601 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); 608 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
602 609
@@ -605,18 +612,22 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
605 if (!dentry) 612 if (!dentry)
606 return NULL; 613 return NULL;
607 614
608 tty = NULL;
609 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) 615 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
610 tty = (struct tty_struct *)pts_inode->i_private; 616 priv = pts_inode->i_private;
611 617
612 dput(dentry); 618 dput(dentry);
613 619
614 return tty; 620 return priv;
615} 621}
616 622
617void devpts_pty_kill(struct tty_struct *tty) 623/**
624 * devpts_pty_kill -- remove inode from /dev/pts/
625 * @inode: inode of the slave to be removed
626 *
627 * This is the inverse operation of devpts_pty_new.
628 */
629void devpts_pty_kill(struct inode *inode)
618{ 630{
619 struct inode *inode = tty->driver_data;
620 struct super_block *sb = pts_sb_from_inode(inode); 631 struct super_block *sb = pts_sb_from_inode(inode);
621 struct dentry *root = sb->s_root; 632 struct dentry *root = sb->s_root;
622 struct dentry *dentry; 633 struct dentry *dentry;
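
The devpts interface now takes dev_t/index/priv from the caller and hands back the created inode (or an ERR_PTR), with devpts_get_priv returning the opaque pointer later; the tty layer, not devpts, owns the tty specifics. A userspace model of that constructor/getter/destructor shape (all names illustrative):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int index; void *priv; };

static struct node *node_new(int index, void *priv)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return NULL;		/* stands in for ERR_PTR(-ENOMEM) */
	n->index = index;
	n->priv = priv;			/* opaque caller data */
	return n;
}

static void *node_get_priv(struct node *n) { return n->priv; }
static void node_kill(struct node *n)      { free(n); }

int main(void)
{
	int tty_data = 42;
	struct node *slave = node_new(3, &tty_data);

	if (!slave)
		return ENOMEM;
	printf("index %d priv %d\n", slave->index,
	       *(int *)node_get_priv(slave));
	node_kill(slave);		/* inverse of node_new */
	return 0;
}
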
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f86c720dba0e..cf5b44b10c67 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -540,6 +540,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
540 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ 540 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
541 unsigned long fs_count; /* Number of filesystem-sized blocks */ 541 unsigned long fs_count; /* Number of filesystem-sized blocks */
542 int create; 542 int create;
543 unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
543 544
544 /* 545 /*
545 * If there was a memory error and we've overwritten all the 546 * If there was a memory error and we've overwritten all the
@@ -554,7 +555,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
554 fs_count = fs_endblk - fs_startblk + 1; 555 fs_count = fs_endblk - fs_startblk + 1;
555 556
556 map_bh->b_state = 0; 557 map_bh->b_state = 0;
557 map_bh->b_size = fs_count << dio->inode->i_blkbits; 558 map_bh->b_size = fs_count << i_blkbits;
558 559
559 /* 560 /*
560 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we 561 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
@@ -1053,7 +1054,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1053 int seg; 1054 int seg;
1054 size_t size; 1055 size_t size;
1055 unsigned long addr; 1056 unsigned long addr;
1056 unsigned blkbits = inode->i_blkbits; 1057 unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
1058 unsigned blkbits = i_blkbits;
1057 unsigned blocksize_mask = (1 << blkbits) - 1; 1059 unsigned blocksize_mask = (1 << blkbits) - 1;
1058 ssize_t retval = -EINVAL; 1060 ssize_t retval = -EINVAL;
1059 loff_t end = offset; 1061 loff_t end = offset;
@@ -1149,7 +1151,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1149 dio->inode = inode; 1151 dio->inode = inode;
1150 dio->rw = rw; 1152 dio->rw = rw;
1151 sdio.blkbits = blkbits; 1153 sdio.blkbits = blkbits;
1152 sdio.blkfactor = inode->i_blkbits - blkbits; 1154 sdio.blkfactor = i_blkbits - blkbits;
1153 sdio.block_in_file = offset >> blkbits; 1155 sdio.block_in_file = offset >> blkbits;
1154 1156
1155 sdio.get_block = get_block; 1157 sdio.get_block = get_block;
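
Here inode->i_blkbits is read once with ACCESS_ONCE into a local, and blkfactor and the mapping size are derived from that single snapshot, so a concurrent change to the inode cannot leave the two values inconsistent mid-setup. The same discipline in a userspace sketch (illustrative names):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned inode_blkbits = 12;	/* 4K fs blocks */

static void setup_dio(unsigned dev_blkbits)
{
	unsigned i_blkbits = atomic_load(&inode_blkbits);  /* one read */
	unsigned blkfactor = i_blkbits - dev_blkbits;

	/* both values come from the same snapshot */
	printf("blkbits=%u blkfactor=%u\n", dev_blkbits, blkfactor);
}

int main(void)
{
	setup_dio(9);			/* 512-byte device blocks */
	return 0;
}
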
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 1897eb1b4b6a..e4242c3f8486 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,6 +1,6 @@
1menuconfig DLM 1menuconfig DLM
2 tristate "Distributed Lock Manager (DLM)" 2 tristate "Distributed Lock Manager (DLM)"
3 depends on EXPERIMENTAL && INET 3 depends on INET
4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) 4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
5 select IP_SCTP 5 select IP_SCTP
6 help 6 help
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 871c1abf6029..77c0f70f8fe8 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -337,6 +337,7 @@ enum rsb_flags {
337 RSB_NEW_MASTER2, 337 RSB_NEW_MASTER2,
338 RSB_RECOVER_CONVERT, 338 RSB_RECOVER_CONVERT,
339 RSB_RECOVER_GRANT, 339 RSB_RECOVER_GRANT,
340 RSB_RECOVER_LVB_INVAL,
340}; 341};
341 342
342static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) 343static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index b56950758188..a579f30f237d 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5393,6 +5393,13 @@ static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r,
5393 if ((lkb->lkb_nodeid == nodeid_gone) || 5393 if ((lkb->lkb_nodeid == nodeid_gone) ||
5394 dlm_is_removed(ls, lkb->lkb_nodeid)) { 5394 dlm_is_removed(ls, lkb->lkb_nodeid)) {
5395 5395
5396 /* tell recover_lvb to invalidate the lvb
5397 because a node holding EX/PW failed */
5398 if ((lkb->lkb_exflags & DLM_LKF_VALBLK) &&
5399 (lkb->lkb_grmode >= DLM_LOCK_PW)) {
5400 rsb_set_flag(r, RSB_RECOVER_LVB_INVAL);
5401 }
5402
5396 del_lkb(r, lkb); 5403 del_lkb(r, lkb);
5397 5404
5398 /* this put should free the lkb */ 5405 /* this put should free the lkb */
@@ -6025,15 +6032,18 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6025 return error; 6032 return error;
6026} 6033}
6027 6034
6028/* The force flag allows the unlock to go ahead even if the lkb isn't granted. 6035/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't
6029 Regardless of what rsb queue the lock is on, it's removed and freed. */ 6036 granted. Regardless of what rsb queue the lock is on, it's removed and
6037 freed. The IVVALBLK flag causes the lvb on the resource to be invalidated
6038 if our lock is PW/EX (it's ignored if our granted mode is smaller.) */
6030 6039
6031static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) 6040static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6032{ 6041{
6033 struct dlm_args args; 6042 struct dlm_args args;
6034 int error; 6043 int error;
6035 6044
6036 set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args); 6045 set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK,
6046 lkb->lkb_ua, &args);
6037 6047
6038 error = unlock_lock(ls, lkb, &args); 6048 error = unlock_lock(ls, lkb, &args);
6039 if (error == -DLM_EUNLOCK) 6049 if (error == -DLM_EUNLOCK)
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 331ea4f94efd..dd87a31bcc21 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1385,7 +1385,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1385 struct connection *con; 1385 struct connection *con;
1386 struct writequeue_entry *e; 1386 struct writequeue_entry *e;
1387 int offset = 0; 1387 int offset = 0;
1388 int users = 0;
1389 1388
1390 con = nodeid2con(nodeid, allocation); 1389 con = nodeid2con(nodeid, allocation);
1391 if (!con) 1390 if (!con)
@@ -1399,7 +1398,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1399 } else { 1398 } else {
1400 offset = e->end; 1399 offset = e->end;
1401 e->end += len; 1400 e->end += len;
1402 users = e->users++; 1401 e->users++;
1403 } 1402 }
1404 spin_unlock(&con->writequeue_lock); 1403 spin_unlock(&con->writequeue_lock);
1405 1404
@@ -1414,7 +1413,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1414 spin_lock(&con->writequeue_lock); 1413 spin_lock(&con->writequeue_lock);
1415 offset = e->end; 1414 offset = e->end;
1416 e->end += len; 1415 e->end += len;
1417 users = e->users++; 1416 e->users++;
1418 list_add_tail(&e->list, &con->writequeue); 1417 list_add_tail(&e->list, &con->writequeue);
1419 spin_unlock(&con->writequeue_lock); 1418 spin_unlock(&con->writequeue_lock);
1420 goto got_one; 1419 goto got_one;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 4a7a76e42fc3..aedea28a86a1 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -717,8 +717,14 @@ void dlm_recovered_lock(struct dlm_rsb *r)
717 * the VALNOTVALID flag if necessary, and determining the correct lvb contents 717 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
718 * based on the lvb's of the locks held on the rsb. 718 * based on the lvb's of the locks held on the rsb.
719 * 719 *
720 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it 720 * RSB_VALNOTVALID is set in two cases:
721 * was already set prior to recovery, it's not cleared, regardless of locks. 721 *
722 * 1. we are master, but not new, and we purged an EX/PW lock held by a
723 * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL)
724 *
725 * 2. we are a new master, and there are only NL/CR locks left.
726 * (We could probably improve this by only invalidating in this way when
727 * the previous master left uncleanly. VMS docs mention that.)
722 * 728 *
723 * The LVB contents are only considered for changing when this is a new master 729 * The LVB contents are only considered for changing when this is a new master
724 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with 730 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
@@ -734,6 +740,19 @@ static void recover_lvb(struct dlm_rsb *r)
734 int big_lock_exists = 0; 740 int big_lock_exists = 0;
735 int lvblen = r->res_ls->ls_lvblen; 741 int lvblen = r->res_ls->ls_lvblen;
736 742
743 if (!rsb_flag(r, RSB_NEW_MASTER2) &&
744 rsb_flag(r, RSB_RECOVER_LVB_INVAL)) {
745 /* case 1 above */
746 rsb_set_flag(r, RSB_VALNOTVALID);
747 return;
748 }
749
750 if (!rsb_flag(r, RSB_NEW_MASTER2))
751 return;
752
753 /* we are the new master, so figure out if VALNOTVALID should
754 be set, and set the rsb lvb from the best lkb available. */
755
737 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { 756 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
738 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) 757 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
739 continue; 758 continue;
@@ -772,13 +791,10 @@ static void recover_lvb(struct dlm_rsb *r)
772 if (!lock_lvb_exists) 791 if (!lock_lvb_exists)
773 goto out; 792 goto out;
774 793
794 /* lvb is invalidated if only NL/CR locks remain */
775 if (!big_lock_exists) 795 if (!big_lock_exists)
776 rsb_set_flag(r, RSB_VALNOTVALID); 796 rsb_set_flag(r, RSB_VALNOTVALID);
777 797
778 /* don't mess with the lvb unless we're the new master */
779 if (!rsb_flag(r, RSB_NEW_MASTER2))
780 goto out;
781
782 if (!r->res_lvbptr) { 798 if (!r->res_lvbptr) {
783 r->res_lvbptr = dlm_allocate_lvb(r->res_ls); 799 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
784 if (!r->res_lvbptr) 800 if (!r->res_lvbptr)
@@ -852,12 +868,19 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
852 if (is_master(r)) { 868 if (is_master(r)) {
853 if (rsb_flag(r, RSB_RECOVER_CONVERT)) 869 if (rsb_flag(r, RSB_RECOVER_CONVERT))
854 recover_conversion(r); 870 recover_conversion(r);
871
872 /* recover lvb before granting locks so the updated
873 lvb/VALNOTVALID is presented in the completion */
874 recover_lvb(r);
875
855 if (rsb_flag(r, RSB_NEW_MASTER2)) 876 if (rsb_flag(r, RSB_NEW_MASTER2))
856 recover_grant(r); 877 recover_grant(r);
857 recover_lvb(r);
858 count++; 878 count++;
879 } else {
880 rsb_clear_flag(r, RSB_VALNOTVALID);
859 } 881 }
860 rsb_clear_flag(r, RSB_RECOVER_CONVERT); 882 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
883 rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL);
861 rsb_clear_flag(r, RSB_NEW_MASTER2); 884 rsb_clear_flag(r, RSB_NEW_MASTER2);
862 unlock_rsb(r); 885 unlock_rsb(r);
863 } 886 }
diff --git a/fs/eventfd.c b/fs/eventfd.c
index d81b9f654086..35470d9b96e6 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -19,6 +19,8 @@
19#include <linux/export.h> 19#include <linux/export.h>
20#include <linux/kref.h> 20#include <linux/kref.h>
21#include <linux/eventfd.h> 21#include <linux/eventfd.h>
22#include <linux/proc_fs.h>
23#include <linux/seq_file.h>
22 24
23struct eventfd_ctx { 25struct eventfd_ctx {
24 struct kref kref; 26 struct kref kref;
@@ -284,7 +286,25 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
284 return res; 286 return res;
285} 287}
286 288
289#ifdef CONFIG_PROC_FS
290static int eventfd_show_fdinfo(struct seq_file *m, struct file *f)
291{
292 struct eventfd_ctx *ctx = f->private_data;
293 int ret;
294
295 spin_lock_irq(&ctx->wqh.lock);
296 ret = seq_printf(m, "eventfd-count: %16llx\n",
297 (unsigned long long)ctx->count);
298 spin_unlock_irq(&ctx->wqh.lock);
299
300 return ret;
301}
302#endif
303
287static const struct file_operations eventfd_fops = { 304static const struct file_operations eventfd_fops = {
305#ifdef CONFIG_PROC_FS
306 .show_fdinfo = eventfd_show_fdinfo,
307#endif
288 .release = eventfd_release, 308 .release = eventfd_release,
289 .poll = eventfd_poll, 309 .poll = eventfd_poll,
290 .read = eventfd_read, 310 .read = eventfd_read,
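
With the .show_fdinfo hook wired up, the eventfd counter becomes visible through procfs. On a kernel carrying this patch, something like the following should print an "eventfd-count:" line alongside the usual pos/flags fields (an untested sketch, assuming the patched fdinfo output):

#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	char path[64], line[128];
	int efd = eventfd(7, 0);	/* counter starts at 7 */
	FILE *f;

	if (efd < 0)
		return 1;
	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", efd);
	f = fopen(path, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* includes "eventfd-count: 7" */
	fclose(f);
	close(efd);
	return 0;
}
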
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index da72250ddc1c..be56b21435f8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,8 @@
 #include <asm/io.h>
 #include <asm/mman.h>
 #include <linux/atomic.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 
 /*
  * LOCKING:
@@ -346,7 +348,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
 static inline int ep_op_has_event(int op)
 {
-	return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
+	return op != EPOLL_CTL_DEL;
 }
 
 /* Initialize the poll safe wake up structure */
@@ -676,34 +678,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	return 0;
 }
 
-/*
- * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
- * had no event flags set, indicating that another thread may be currently
- * handling that item's events (in the case that EPOLLONESHOT was being
- * used). Otherwise a zero result indicates that the item has been disabled
- * from receiving events. A disabled item may be re-enabled via
- * EPOLL_CTL_MOD. Must be called with "mtx" held.
- */
-static int ep_disable(struct eventpoll *ep, struct epitem *epi)
-{
-	int result = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ep->lock, flags);
-	if (epi->event.events & ~EP_PRIVATE_BITS) {
-		if (ep_is_linked(&epi->rdllink))
-			list_del_init(&epi->rdllink);
-		/* Ensure ep_poll_callback will not add epi back onto ready
-		   list: */
-		epi->event.events &= EP_PRIVATE_BITS;
-	}
-	else
-		result = -EBUSY;
-	spin_unlock_irqrestore(&ep->lock, flags);
-
-	return result;
-}
-
 static void ep_free(struct eventpoll *ep)
 {
 	struct rb_node *rbp;
@@ -811,8 +785,34 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	return pollflags != -1 ? pollflags : 0;
 }
 
+#ifdef CONFIG_PROC_FS
+static int ep_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct eventpoll *ep = f->private_data;
+	struct rb_node *rbp;
+	int ret = 0;
+
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
+
+		ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
+				 epi->ffd.fd, epi->event.events,
+				 (long long)epi->event.data);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&ep->mtx);
+
+	return ret;
+}
+#endif
+
 /* File callbacks that implement the eventpoll file behaviour */
 static const struct file_operations eventpoll_fops = {
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= ep_show_fdinfo,
+#endif
 	.release	= ep_eventpoll_release,
 	.poll		= ep_eventpoll_poll,
 	.llseek		= noop_llseek,
@@ -1048,6 +1048,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 	rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
+
+
 #define PATH_ARR_SIZE 5
 /*
  * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1813,12 +1815,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		} else
 			error = -ENOENT;
 		break;
-	case EPOLL_CTL_DISABLE:
-		if (epi)
-			error = ep_disable(ep, epi);
-		else
-			error = -ENOENT;
-		break;
 	}
 	mutex_unlock(&ep->mtx);
 
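
Note: with the matching ep_show_fdinfo hook above, every watched descriptor appears as a "tfd:" line in the epoll fd's fdinfo. A minimal userspace sketch of reading it back (illustrative only, not part of the patch):

    #include <stdio.h>
    #include <sys/epoll.h>
    #include <sys/eventfd.h>

    int main(void)
    {
    	char path[64], line[256];
    	FILE *f;
    	int epfd = epoll_create1(0);
    	int tfd = eventfd(0, 0);
    	struct epoll_event ev = { .events = EPOLLIN, .data.fd = tfd };

    	if (epfd < 0 || tfd < 0 ||
    	    epoll_ctl(epfd, EPOLL_CTL_ADD, tfd, &ev))
    		return 1;
    	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", epfd);
    	f = fopen(path, "r");
    	while (f && fgets(line, sizeof(line), f))
    		fputs(line, stdout);	/* "tfd: ... events: ... data: ..." */
    	return 0;
    }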
diff --git a/fs/exec.c b/fs/exec.c
index 0039055b1fc6..d8e1191cb112 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1266,14 +1266,13 @@ int prepare_binprm(struct linux_binprm *bprm)
 	bprm->cred->egid = current_egid();
 
 	if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
-	    !current->no_new_privs) {
+	    !current->no_new_privs &&
+	    kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
+	    kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
 		/* Set-uid? */
 		if (mode & S_ISUID) {
-			if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
-				return -EPERM;
 			bprm->per_clear |= PER_CLEAR_ON_SETID;
 			bprm->cred->euid = inode->i_uid;
-
 		}
 
 		/* Set-gid? */
@@ -1283,8 +1282,6 @@ int prepare_binprm(struct linux_binprm *bprm)
 		 * executable.
 		 */
 		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
-			if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
-				return -EPERM;
 			bprm->per_clear |= PER_CLEAR_ON_SETID;
 			bprm->cred->egid = inode->i_gid;
 		}
@@ -1349,13 +1346,17 @@ EXPORT_SYMBOL(remove_arg_zero);
 /*
  * cycle the list of binary formats handler, until one recognizes the image
  */
-int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
+int search_binary_handler(struct linux_binprm *bprm)
 {
 	unsigned int depth = bprm->recursion_depth;
 	int try,retval;
 	struct linux_binfmt *fmt;
 	pid_t old_pid, old_vpid;
 
+	/* This allows 4 levels of binfmt rewrites before failing hard. */
+	if (depth > 5)
+		return -ELOOP;
+
 	retval = security_bprm_check(bprm);
 	if (retval)
 		return retval;
@@ -1374,18 +1375,14 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	for (try=0; try<2; try++) {
 		read_lock(&binfmt_lock);
 		list_for_each_entry(fmt, &formats, lh) {
-			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
+			int (*fn)(struct linux_binprm *) = fmt->load_binary;
 			if (!fn)
 				continue;
 			if (!try_module_get(fmt->module))
 				continue;
 			read_unlock(&binfmt_lock);
-			retval = fn(bprm, regs);
-			/*
-			 * Restore the depth counter to its starting value
-			 * in this call, so we don't have to rely on every
-			 * load_binary function to restore it on return.
-			 */
+			bprm->recursion_depth = depth + 1;
+			retval = fn(bprm);
 			bprm->recursion_depth = depth;
 			if (retval >= 0) {
 				if (depth == 0) {
@@ -1439,8 +1436,7 @@ EXPORT_SYMBOL(search_binary_handler);
  */
 static int do_execve_common(const char *filename,
 				struct user_arg_ptr argv,
-				struct user_arg_ptr envp,
-				struct pt_regs *regs)
+				struct user_arg_ptr envp)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1524,7 +1520,7 @@ static int do_execve_common(const char *filename,
 	if (retval < 0)
 		goto out;
 
-	retval = search_binary_handler(bprm,regs);
+	retval = search_binary_handler(bprm);
 	if (retval < 0)
 		goto out;
 
@@ -1566,19 +1562,17 @@ out_ret:
 
 int do_execve(const char *filename,
 	const char __user *const __user *__argv,
-	const char __user *const __user *__envp,
-	struct pt_regs *regs)
+	const char __user *const __user *__envp)
 {
 	struct user_arg_ptr argv = { .ptr.native = __argv };
 	struct user_arg_ptr envp = { .ptr.native = __envp };
-	return do_execve_common(filename, argv, envp, regs);
+	return do_execve_common(filename, argv, envp);
 }
 
 #ifdef CONFIG_COMPAT
-int compat_do_execve(const char *filename,
+static int compat_do_execve(const char *filename,
 	const compat_uptr_t __user *__argv,
-	const compat_uptr_t __user *__envp,
-	struct pt_regs *regs)
+	const compat_uptr_t __user *__envp)
 {
 	struct user_arg_ptr argv = {
 		.is_compat = true,
@@ -1588,7 +1582,7 @@ int compat_do_execve(const char *filename,
 		.is_compat = true,
 		.ptr.compat = __envp,
 	};
-	return do_execve_common(filename, argv, envp, regs);
+	return do_execve_common(filename, argv, envp);
 }
 #endif
 
@@ -1669,7 +1663,7 @@ SYSCALL_DEFINE3(execve,
 	struct filename *path = getname(filename);
 	int error = PTR_ERR(path);
 	if (!IS_ERR(path)) {
-		error = do_execve(path->name, argv, envp, current_pt_regs());
+		error = do_execve(path->name, argv, envp);
 		putname(path);
 	}
 	return error;
@@ -1682,8 +1676,7 @@ asmlinkage long compat_sys_execve(const char __user * filename,
 	struct filename *path = getname(filename);
 	int error = PTR_ERR(path);
 	if (!IS_ERR(path)) {
-		error = compat_do_execve(path->name, argv, envp,
-					 current_pt_regs());
+		error = compat_do_execve(path->name, argv, envp);
 		putname(path);
 	}
 	return error;
@@ -1696,12 +1689,9 @@ int kernel_execve(const char *filename,
 		  const char *const argv[],
 		  const char *const envp[])
 {
-	struct pt_regs *p = current_pt_regs();
-	int ret;
-
-	ret = do_execve(filename,
+	int ret = do_execve(filename,
 			(const char __user *const __user *)argv,
-			(const char __user *const __user *)envp, p);
+			(const char __user *const __user *)envp);
 	if (ret < 0)
 		return ret;
 
@@ -1709,6 +1699,6 @@ int kernel_execve(const char *filename,
 	 * We were successful.  We won't be returning to our caller, but
 	 * instead to user space by manipulating the kernel stack.
 	 */
-	ret_from_kernel_execve(p);
+	ret_from_kernel_execve(current_pt_regs());
 }
 #endif
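
Note: the prepare_binprm() hunk above changes unmapped set-id owners from a hard -EPERM into "quietly ignore the bit", matching the behaviour already used for no_new_privs. A userspace sketch of that existing no_new_privs behaviour (illustrative only; the path to a setuid binary is an assumption):

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <unistd.h>

    int main(void)
    {
    	/* One-way switch: no exec from here on may grant privileges. */
    	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
    		perror("prctl");
    	/* Assumed setuid-root binary; it runs with the caller's euid. */
    	execl("/usr/bin/passwd", "passwd", (char *)NULL);
    	perror("execl");
    	return 1;
    }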
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index b56181047751..d1f80abd8828 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -361,12 +361,12 @@ static int read_exec(struct page_collect *pcol)
 	return 0;
 
 err:
-	if (!pcol->read_4_write)
-		_unlock_pcol_pages(pcol, ret, READ);
-
-	pcol_free(pcol);
-
+	if (!pcol_copy) /* Failed before ownership transfer */
+		pcol_copy = pcol;
+	_unlock_pcol_pages(pcol_copy, ret, READ);
+	pcol_free(pcol_copy);
 	kfree(pcol_copy);
+
 	return ret;
 }
 
@@ -676,8 +676,10 @@ static int write_exec(struct page_collect *pcol)
 	return 0;
 
 err:
-	_unlock_pcol_pages(pcol, ret, WRITE);
-	pcol_free(pcol);
+	if (!pcol_copy) /* Failed before ownership transfer */
+		pcol_copy = pcol;
+	_unlock_pcol_pages(pcol_copy, ret, WRITE);
+	pcol_free(pcol_copy);
 	kfree(pcol_copy);
 
 	return ret;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 29ab099e3e08..606bb074c501 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -341,10 +341,21 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,
 	return type;
 }
 
+int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
+			     int *max_len, struct inode *parent)
+{
+	const struct export_operations *nop = inode->i_sb->s_export_op;
+
+	if (nop && nop->encode_fh)
+		return nop->encode_fh(inode, fid->raw, max_len, parent);
+
+	return export_encode_fh(inode, fid, max_len, parent);
+}
+EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);
+
 int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
 		int connectable)
 {
-	const struct export_operations *nop = dentry->d_sb->s_export_op;
 	int error;
 	struct dentry *p = NULL;
 	struct inode *inode = dentry->d_inode, *parent = NULL;
@@ -357,10 +368,8 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
 	 */
 		parent = p->d_inode;
 	}
-	if (nop->encode_fh)
-		error = nop->encode_fh(inode, fid->raw, max_len, parent);
-	else
-		error = export_encode_fh(inode, fid, max_len, parent);
+
+	error = exportfs_encode_inode_fh(inode, fid, max_len, parent);
 	dput(p);
 
 	return error;
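
Note: exportfs_encode_inode_fh() gives callers that hold only an inode (no dentry) a way to build a file handle; exportfs_encode_fh() is now a thin wrapper around it. A kernel-side sketch of a caller (hypothetical function, shown only to illustrate the calling convention):

    static int example_encode(struct inode *inode, struct fid *fid)
    {
    	int dwords = MAX_HANDLE_SZ / 4;	/* buffer size in 32-bit words */
    	int type;

    	/* NULL parent: no connectable handle is requested here. */
    	type = exportfs_encode_inode_fh(inode, fid, &dwords, NULL);
    	/* "type" is the fileid type the decode side must interpret;
    	   dwords now holds the number of words actually used. */
    	return type;
    }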
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 7320a66e958f..22548f56197b 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -2101,8 +2101,9 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	end = start + (range->len >> sb->s_blocksize_bits) - 1;
 	minlen = range->minlen >> sb->s_blocksize_bits;
 
-	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) ||
-	    unlikely(start >= max_blks))
+	if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
+	    start >= max_blks ||
+	    range->len < sb->s_blocksize)
 		return -EINVAL;
 	if (end >= max_blks)
 		end = max_blks - 1;
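
Note: the added check rejects FITRIM requests shorter than one filesystem block, which could never cover a whole block anyway. Seen from userspace (an illustrative sketch; the mount point is an assumption, and ext3's block size is always at least 1024 bytes, so a 512-byte request now fails):

    #include <fcntl.h>
    #include <linux/fs.h>		/* FITRIM, struct fstrim_range */
    #include <stdio.h>
    #include <sys/ioctl.h>

    int main(void)
    {
    	struct fstrim_range r = { .start = 0, .len = 512, .minlen = 0 };
    	int fd = open("/mnt/ext3", O_RDONLY);	/* any file on the fs */

    	if (fd < 0 || ioctl(fd, FITRIM, &r))
    		perror("FITRIM");	/* now EINVAL: len < block size */
    	return 0;
    }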
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index c8fff930790d..dd91264ba94f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -296,17 +296,17 @@ static inline loff_t ext3_get_htree_eof(struct file *filp)
  * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
  * will be invalid once the directory was converted into a dx directory
  */
-loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin)
+loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;
 	int dx_dir = is_dx_dir(inode);
 	loff_t htree_max = ext3_get_htree_eof(file);
 
 	if (likely(dx_dir))
-		return generic_file_llseek_size(file, offset, origin,
+		return generic_file_llseek_size(file, offset, whence,
 						htree_max, htree_max);
 	else
-		return generic_file_llseek(file, offset, origin);
+		return generic_file_llseek(file, offset, whence);
 }
 
 /*
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7e87e37a372a..b176d4253544 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1071,8 +1071,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
 	 * mapped. 0 in case of a HOLE.
 	 */
 	if (err > 0) {
-		if (err > 1)
-			WARN_ON(1);
+		WARN_ON(err > 1);
 		err = 0;
 	}
 	*errp = err;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5366393528df..6e50223b3299 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1661,9 +1661,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 	}
 	sb->s_fs_info = sbi;
-	sbi->s_mount_opt = 0;
-	sbi->s_resuid = make_kuid(&init_user_ns, EXT3_DEF_RESUID);
-	sbi->s_resgid = make_kgid(&init_user_ns, EXT3_DEF_RESGID);
 	sbi->s_sb_block = sb_block;
 
 	blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index c22f17021b6e..0a475c881852 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -39,22 +39,8 @@ config EXT4_USE_FOR_EXT23
 	  compiled kernel size by using one file system driver for
 	  ext2, ext3, and ext4 file systems.
 
-config EXT4_FS_XATTR
-	bool "Ext4 extended attributes"
-	depends on EXT4_FS
-	default y
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-	  You need this for POSIX ACL support on ext4.
-
 config EXT4_FS_POSIX_ACL
 	bool "Ext4 POSIX Access Control Lists"
-	depends on EXT4_FS_XATTR
 	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -67,7 +53,6 @@ config EXT4_FS_POSIX_ACL
 
 config EXT4_FS_SECURITY
 	bool "Ext4 Security Labels"
-	depends on EXT4_FS_XATTR
 	help
 	  Security labels support alternative access control models
 	  implemented by security modules like SELinux.  This option
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 56fd8f865930..0310fec2ee3d 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-		mmp.o indirect.o
+		mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+		xattr_trusted.o inline.o
 
-ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index d3c5b88fd89f..e6e0d988439b 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -423,8 +423,10 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 
 retry:
 	handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	if (IS_ERR(handle)) {
+		error = PTR_ERR(handle);
+		goto release_and_out;
+	}
 	error = ext4_set_acl(handle, inode, type, acl);
 	ext4_journal_stop(handle);
 	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 8e07d2a5a139..80a28b297279 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -27,23 +27,11 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include "ext4.h"
-
-static unsigned char ext4_filetype_table[] = {
-	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
+#include "xattr.h"
 
 static int ext4_dx_readdir(struct file *filp,
 			   void *dirent, filldir_t filldir);
 
-static unsigned char get_dtype(struct super_block *sb, int filetype)
-{
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
-	    (filetype >= EXT4_FT_MAX))
-		return DT_UNKNOWN;
-
-	return (ext4_filetype_table[filetype]);
-}
-
 /**
  * Check if the given dir-inode refers to an htree-indexed directory
  * (or a directory which chould potentially get coverted to use htree
@@ -68,11 +56,14 @@ static int is_dx_dir(struct inode *inode)
  * Return 0 if the directory entry is OK, and 1 if there is a problem
  *
  * Note: this is the opposite of what ext2 and ext3 historically returned...
+ *
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
  */
 int __ext4_check_dir_entry(const char *function, unsigned int line,
 			   struct inode *dir, struct file *filp,
 			   struct ext4_dir_entry_2 *de,
-			   struct buffer_head *bh,
+			   struct buffer_head *bh, char *buf, int size,
 			   unsigned int offset)
 {
 	const char *error_msg = NULL;
@@ -85,9 +76,8 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 		error_msg = "rec_len % 4 != 0";
 	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
 		error_msg = "rec_len is too small for name_len";
-	else if (unlikely(((char *) de - bh->b_data) + rlen >
-			dir->i_sb->s_blocksize))
-		error_msg = "directory entry across blocks";
+	else if (unlikely(((char *) de - buf) + rlen > size))
+		error_msg = "directory entry across range";
 	else if (unlikely(le32_to_cpu(de->inode) >
 			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
 		error_msg = "inode out of bounds";
@@ -98,14 +88,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 		ext4_error_file(filp, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset % bh->b_size),
+				error_msg, (unsigned) (offset % size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 	else
 		ext4_error_inode(dir, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset % bh->b_size),
+				error_msg, (unsigned) (offset % size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 
@@ -125,6 +115,14 @@ static int ext4_readdir(struct file *filp,
 	int ret = 0;
 	int dir_has_error = 0;
 
+	if (ext4_has_inline_data(inode)) {
+		int has_inline_data = 1;
+		ret = ext4_read_inline_dir(filp, dirent, filldir,
+					   &has_inline_data);
+		if (has_inline_data)
+			return ret;
+	}
+
 	if (is_dx_dir(inode)) {
 		err = ext4_dx_readdir(filp, dirent, filldir);
 		if (err != ERR_BAD_DX_DIR) {
@@ -221,8 +219,9 @@ revalidate:
 		while (!error && filp->f_pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (ext4_check_dir_entry(inode, filp, de,
-						 bh, offset)) {
+			if (ext4_check_dir_entry(inode, filp, de, bh,
+						 bh->b_data, bh->b_size,
+						 offset)) {
 				/*
 				 * On error, skip the f_pos to the next block
 				 */
@@ -334,17 +333,17 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
  *
  * For non-htree, ext4_llseek already chooses the proper max offset.
  */
-loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
+loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;
 	int dx_dir = is_dx_dir(inode);
 	loff_t htree_max = ext4_get_htree_eof(file);
 
 	if (likely(dx_dir))
-		return generic_file_llseek_size(file, offset, origin,
+		return generic_file_llseek_size(file, offset, whence,
 						htree_max, htree_max);
 	else
-		return ext4_llseek(file, offset, origin);
+		return ext4_llseek(file, offset, whence);
 }
 
 /*
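
Note: with the extra buf/size pair, __ext4_check_dir_entry() can validate entries in any buffer, not just a full directory block; inline-data directories pass the in-inode area instead. A sketch of the two call shapes (illustrative only; the inline_buf/inline_size/iloc_bh names are hypothetical):

    /* block-backed directory: the buffer is the whole disk block */
    if (ext4_check_dir_entry(dir, filp, de, bh,
    			 bh->b_data, bh->b_size, offset))
    	return -EIO;

    /* inline-data directory: the buffer is the in-inode area, so its
     * size is the inline data size rather than the block size */
    if (ext4_check_dir_entry(dir, filp, de, iloc_bh,
    			 inline_buf, inline_size, offset))
    	return -EIO;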
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c20de1d59d0..8462eb3c33aa 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -57,6 +57,16 @@
 #define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
+/*
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
+ */
+#define EXT_DEBUG__
+#ifdef EXT_DEBUG
+#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
+#else
+#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
+#endif
+
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
 	ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
 
@@ -392,6 +402,7 @@ struct flex_groups {
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
 #define EXT4_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
 #define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
+#define EXT4_INLINE_DATA_FL		0x10000000 /* Inode has inline data. */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
@@ -448,28 +459,26 @@ enum {
 	EXT4_INODE_EXTENTS	= 19,	/* Inode uses extents */
 	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
 	EXT4_INODE_EOFBLOCKS	= 22,	/* Blocks allocated beyond EOF */
+	EXT4_INODE_INLINE_DATA	= 28,	/* Data in inode. */
 	EXT4_INODE_RESERVED	= 31,	/* reserved for ext4 lib */
 };
 
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
-#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
-	printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
-		EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
-
-/*
- * Since it's pretty easy to mix up bit numbers and hex values, and we
- * can't do a compile-time test for ENUM values, we use a run-time
- * test to make sure that EXT4_XXX_FL is consistent with respect to
- * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
- * out so it won't cost any extra space in the compiled kernel image.
- * But it's important that these values are the same, since we are
- * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
- * must be consistent with the values of FS_XXX_FL defined in
- * include/linux/fs.h and the on-disk values found in ext2, ext3, and
- * ext4 filesystems, and of course the values defined in e2fsprogs.
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, we use a
+ * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
+ * any extra space in the compiled kernel image, otherwise, the build will fail.
+ * It's important that these values are the same, since we are using
+ * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
+ * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
+ * values found in ext2, ext3 and ext4 filesystems, and of course the values
+ * defined in e2fsprogs.
  *
  * It's not paranoia if the Murphy's Law really *is* out to get you.  :-)
  */
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG)	BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
+
 static inline void ext4_check_flag_values(void)
 {
 	CHECK_FLAG_VALUE(SECRM);
@@ -494,6 +503,7 @@ static inline void ext4_check_flag_values(void)
 	CHECK_FLAG_VALUE(EXTENTS);
 	CHECK_FLAG_VALUE(EA_INODE);
 	CHECK_FLAG_VALUE(EOFBLOCKS);
+	CHECK_FLAG_VALUE(INLINE_DATA);
 	CHECK_FLAG_VALUE(RESERVED);
 }
 
@@ -811,6 +821,8 @@ struct ext4_ext_cache {
 	__u32	ec_len; /* must be 32bit to return holes */
 };
 
+#include "extents_status.h"
+
 /*
  * fourth extended file system inode data in memory
 */
@@ -833,7 +845,6 @@ struct ext4_inode_info {
 #endif
 	unsigned long	i_flags;
 
-#ifdef CONFIG_EXT4_FS_XATTR
 	/*
	 * Extended attributes can be read independently of the main file
	 * data. Taking i_mutex even when reading would cause contention
@@ -842,7 +853,6 @@ struct ext4_inode_info {
	 * EAs.
	 */
 	struct rw_semaphore xattr_sem;
-#endif
 
 	struct list_head i_orphan;	/* unlinked but open inodes */
 
@@ -888,6 +898,10 @@ struct ext4_inode_info {
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
 
+	/* extents status tree */
+	struct ext4_es_tree i_es_tree;
+	rwlock_t i_es_lock;
+
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
 
@@ -902,6 +916,10 @@ struct ext4_inode_info {
 	/* on-disk additional length */
 	__u16 i_extra_isize;
 
+	/* Indicate the inline data space. */
+	u16 i_inline_off;
+	u16 i_inline_size;
+
 #ifdef CONFIG_QUOTA
 	/* quota space reservation, managed internally by quota code */
 	qsize_t i_reserved_quota;
@@ -1360,6 +1378,7 @@ enum {
 	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
 	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
					   nolocking */
+	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1481,7 +1500,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x8000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_INLINE_DATA	0x8000 /* data in inode */
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1505,7 +1524,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
 					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
-					 EXT4_FEATURE_INCOMPAT_MMP)
+					 EXT4_FEATURE_INCOMPAT_MMP | \
+					 EXT4_FEATURE_INCOMPAT_INLINE_DATA)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1592,6 +1612,11 @@ struct ext4_dir_entry_tail {
 	__le32	det_checksum;	/* crc32c(uuid+inum+dirblock) */
 };
 
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+	((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+					((blocksize) - \
+					 sizeof(struct ext4_dir_entry_tail))))
+
 /*
  * Ext4 directory file types.  Only the low 3 bits are used.  The
  * other bits are reserved for now.
@@ -1936,14 +1961,42 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
 				  struct file *,
 				  struct ext4_dir_entry_2 *,
-				  struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
+				  struct buffer_head *, char *, int,
+				  unsigned int);
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
 	unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
-					(de), (bh), (offset)))
+					(de), (bh), (buf), (size), (offset)))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 				    __u32 minor_hash,
 				    struct ext4_dir_entry_2 *dirent);
 extern void ext4_htree_free_dir_info(struct dir_private_info *p);
+extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+			     struct buffer_head *bh,
+			     void *buf, int buf_size,
+			     const char *name, int namelen,
+			     struct ext4_dir_entry_2 **dest_de);
+void ext4_insert_dentry(struct inode *inode,
+			struct ext4_dir_entry_2 *de,
+			int buf_size,
+			const char *name, int namelen);
+static inline void ext4_update_dx_flag(struct inode *inode)
+{
+	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+				     EXT4_FEATURE_COMPAT_DIR_INDEX))
+		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+}
+static unsigned char ext4_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static inline unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+	    (filetype >= EXT4_FT_MAX))
+		return DT_UNKNOWN;
+
+	return ext4_filetype_table[filetype];
+}
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
@@ -1994,8 +2047,23 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 				ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
 				ext4_lblk_t, int, int *);
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+			 struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create);
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int create);
+int ext4_walk_page_buffers(handle_t *handle,
+			   struct buffer_head *head,
+			   unsigned from,
+			   unsigned to,
+			   int *partial,
+			   int (*fn)(handle_t *handle,
+				     struct buffer_head *bh));
+int do_journal_get_write_access(handle_t *handle,
+				struct buffer_head *bh);
+#define FALL_BACK_TO_NONDELALLOC 1
+#define CONVERT_INLINE_DATA	 2
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int ext4_write_inode(struct inode *, struct writeback_control *);
@@ -2050,6 +2118,20 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 				__u32 start_minor_hash, __u32 *next_hash);
+extern int search_dir(struct buffer_head *bh,
+		      char *search_buf,
+		      int buf_size,
+		      struct inode *dir,
+		      const struct qstr *d_name,
+		      unsigned int offset,
+		      struct ext4_dir_entry_2 **res_dir);
+extern int ext4_generic_delete_entry(handle_t *handle,
+				     struct inode *dir,
+				     struct ext4_dir_entry_2 *de_del,
+				     struct buffer_head *bh,
+				     void *entry_buf,
+				     int buf_size,
+				     int csum_size);
 
 /* resize.c */
 extern int ext4_group_add(struct super_block *sb,
@@ -2376,6 +2458,15 @@ extern void ext4_unwritten_wait(struct inode *inode);
 extern const struct inode_operations ext4_dir_inode_operations;
 extern const struct inode_operations ext4_special_inode_operations;
 extern struct dentry *ext4_get_parent(struct dentry *child);
+extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+				 struct ext4_dir_entry_2 *de,
+				 int blocksize, int csum_size,
+				 unsigned int parent_ino, int dotdot_real_len);
+extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+				   unsigned int blocksize);
+extern int ext4_handle_dirty_dirent_node(handle_t *handle,
+					 struct inode *inode,
+					 struct buffer_head *bh);
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2393,6 +2484,9 @@ extern int ext4_check_blockref(const char *, unsigned int,
 			       struct inode *, __le32 *, unsigned int);
 
 /* extents.c */
+struct ext4_ext_path;
+struct ext4_extent;
+
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
@@ -2410,8 +2504,27 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 					  ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+					 ext4_lblk_t lblocks);
+extern int ext4_extent_tree_init(handle_t *, struct inode *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+						   int num,
+						   struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+				      struct ext4_extent *ex1,
+				      struct ext4_extent *ex2);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *,
+				  struct ext4_ext_path *,
+				  struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+						  struct ext4_ext_path *);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
+
+
 /* move_extent.c */
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
@@ -2445,17 +2558,13 @@ enum ext4_state_bits {
 			 * never, ever appear in a buffer_head's state
 			 * flag. See EXT4_MAP_FROM_CLUSTER to see where
			 * this is used. */
-	BH_Da_Mapped,	/* Delayed allocated block that now has a mapping. This
-			 * flag is set when ext4_map_blocks is called on a
-			 * delayed allocated block to get its real mapping. */
 };
 
 BUFFER_FNS(Uninit, uninit)
 TAS_BUFFER_FNS(Uninit, uninit)
-BUFFER_FNS(Da_Mapped, da_mapped)
 
 /*
- * Add new method to test wether block and inode bitmaps are properly
+ * Add new method to test whether block and inode bitmaps are properly
 * initialized. With uninit_bg reading the block from disk is not enough
 * to mark the bitmap uptodate. We need to also zero-out the bitmap
 */
@@ -2503,6 +2612,4 @@ extern void ext4_resize_end(struct super_block *sb);
 
 #endif	/* __KERNEL__ */
 
-#include "ext4_extents.h"
-
 #endif	/* _EXT4_H */
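
Note: the CHECK_FLAG_VALUE() rework above trades the old run-time printk/BUG_ON for a compile-time assertion. The idiom, reduced to a standalone sketch (the names are hypothetical):

    #include <linux/bug.h>

    #define EXAMPLE_FL		0x00000008	/* hypothetical flag value */
    #define EXAMPLE_INODE_BIT	3		/* hypothetical bit number */

    /* Compiles to nothing when the values agree; otherwise the build
     * fails right here instead of oopsing at mount time. */
    static inline void example_check_flag_values(void)
    {
    	BUILD_BUG_ON(EXAMPLE_FL != (1 << EXAMPLE_INODE_BIT));
    }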
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index cb1b2c919963..487fda12bc00 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,16 +43,6 @@
 #define CHECK_BINSEARCH__
 
 /*
- * Turn on EXT_DEBUG to get lots of info about extents operations.
- */
-#define EXT_DEBUG__
-#ifdef EXT_DEBUG
-#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
-#else
-#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
-#endif
-
-/*
  * If EXT_STATS is defined then stats numbers are collected.
  * These number will be displayed at umount time.
 */
@@ -144,20 +134,6 @@ struct ext4_ext_path {
 */
 
 /*
- * to be called by ext4_ext_walk_space()
- * negative retcode - error
- * positive retcode - signal for ext4_ext_walk_space(), see below
- * callback must return valid extent (passed or newly created)
- */
-typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
-					struct ext4_ext_cache *,
-					struct ext4_extent *, void *);
-
-#define EXT_CONTINUE   0
-#define EXT_BREAK      1
-#define EXT_REPEAT     2
-
-/*
  * Maximum number of logical blocks in a file; ext4_extent's ee_block is
  * __le32.
 */
@@ -300,21 +276,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 						0xffff);
 }
 
-extern int ext4_ext_calc_metadata_amount(struct inode *inode,
-					 ext4_lblk_t lblocks);
-extern int ext4_extent_tree_init(handle_t *, struct inode *);
-extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
-						   int num,
-						   struct ext4_ext_path *path);
-extern int ext4_can_extents_be_merged(struct inode *inode,
-				      struct ext4_extent *ex1,
-				      struct ext4_extent *ex2);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
-						  struct ext4_ext_path *);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
-extern int ext4_ext_check_inode(struct inode *inode);
-extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
-				      int search_hint_reverse);
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 56d258c18303..7177f9b21cb2 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -254,13 +254,6 @@ static inline void ext4_handle_sync(handle_t *handle)
 		handle->h_sync = 1;
 }
 
-static inline void ext4_handle_release_buffer(handle_t *handle,
-						struct buffer_head *bh)
-{
-	if (ext4_handle_valid(handle))
-		jbd2_journal_release_buffer(handle, bh);
-}
-
 static inline int ext4_handle_is_aborted(handle_t *handle)
 {
 	if (ext4_handle_valid(handle))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7011ac967208..26af22832a84 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -41,6 +41,8 @@
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
 #include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "xattr.h"
 
 #include <trace/events/ext4.h>
 
@@ -109,6 +111,9 @@ static int ext4_split_extent_at(handle_t *handle,
 			     int split_flag,
 			     int flags);
 
+static int ext4_find_delayed_extent(struct inode *inode,
+				    struct ext4_ext_cache *newex);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -1959,27 +1964,33 @@ cleanup:
 	return err;
 }
 
-static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
-			       ext4_lblk_t num, ext_prepare_callback func,
-			       void *cbdata)
+static int ext4_fill_fiemap_extents(struct inode *inode,
+				    ext4_lblk_t block, ext4_lblk_t num,
+				    struct fiemap_extent_info *fieinfo)
 {
 	struct ext4_ext_path *path = NULL;
-	struct ext4_ext_cache cbex;
+	struct ext4_ext_cache newex;
 	struct ext4_extent *ex;
-	ext4_lblk_t next, start = 0, end = 0;
+	ext4_lblk_t next, next_del, start = 0, end = 0;
 	ext4_lblk_t last = block + num;
-	int depth, exists, err = 0;
-
-	BUG_ON(func == NULL);
-	BUG_ON(inode == NULL);
+	int exists, depth = 0, err = 0;
+	unsigned int flags = 0;
+	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
 
 	while (block < last && block != EXT_MAX_BLOCKS) {
 		num = last - block;
 		/* find extent for this block */
 		down_read(&EXT4_I(inode)->i_data_sem);
+
+		if (path && ext_depth(inode) != depth) {
+			/* depth was changed. we have to realloc path */
+			kfree(path);
+			path = NULL;
+		}
+
 		path = ext4_ext_find_extent(inode, block, path);
-		up_read(&EXT4_I(inode)->i_data_sem);
 		if (IS_ERR(path)) {
+			up_read(&EXT4_I(inode)->i_data_sem);
 			err = PTR_ERR(path);
 			path = NULL;
 			break;
@@ -1987,13 +1998,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 
 		depth = ext_depth(inode);
 		if (unlikely(path[depth].p_hdr == NULL)) {
+			up_read(&EXT4_I(inode)->i_data_sem);
 			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
 			err = -EIO;
 			break;
 		}
 		ex = path[depth].p_ext;
 		next = ext4_ext_next_allocated_block(path);
+		ext4_ext_drop_refs(path);
 
+		flags = 0;
 		exists = 0;
 		if (!ex) {
 			/* there is no extent yet, so try to allocate
@@ -2030,40 +2044,64 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 		BUG_ON(end <= start);
 
 		if (!exists) {
-			cbex.ec_block = start;
-			cbex.ec_len = end - start;
-			cbex.ec_start = 0;
+			newex.ec_block = start;
+			newex.ec_len = end - start;
+			newex.ec_start = 0;
 		} else {
-			cbex.ec_block = le32_to_cpu(ex->ee_block);
-			cbex.ec_len = ext4_ext_get_actual_len(ex);
-			cbex.ec_start = ext4_ext_pblock(ex);
+			newex.ec_block = le32_to_cpu(ex->ee_block);
+			newex.ec_len = ext4_ext_get_actual_len(ex);
+			newex.ec_start = ext4_ext_pblock(ex);
+			if (ext4_ext_is_uninitialized(ex))
+				flags |= FIEMAP_EXTENT_UNWRITTEN;
 		}
 
-		if (unlikely(cbex.ec_len == 0)) {
-			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
-			err = -EIO;
-			break;
+		/*
+		 * Find delayed extent and update newex accordingly. We call
+		 * it even in !exists case to find out whether newex is the
+		 * last existing extent or not.
+		 */
+		next_del = ext4_find_delayed_extent(inode, &newex);
+		if (!exists && next_del) {
+			exists = 1;
+			flags |= FIEMAP_EXTENT_DELALLOC;
 		}
-		err = func(inode, next, &cbex, ex, cbdata);
-		ext4_ext_drop_refs(path);
+		up_read(&EXT4_I(inode)->i_data_sem);
 
-		if (err < 0)
+		if (unlikely(newex.ec_len == 0)) {
+			EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
+			err = -EIO;
 			break;
+		}
 
-		if (err == EXT_REPEAT)
-			continue;
-		else if (err == EXT_BREAK) {
-			err = 0;
-			break;
+		/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
+		if (next == next_del) {
+			flags |= FIEMAP_EXTENT_LAST;
+			if (unlikely(next_del != EXT_MAX_BLOCKS ||
+				     next != EXT_MAX_BLOCKS)) {
+				EXT4_ERROR_INODE(inode,
+						 "next extent == %u, next "
+						 "delalloc extent = %u",
+						 next, next_del);
+				err = -EIO;
+				break;
+			}
 		}
 
-		if (ext_depth(inode) != depth) {
-			/* depth was changed. we have to realloc path */
-			kfree(path);
-			path = NULL;
+		if (exists) {
+			err = fiemap_fill_next_extent(fieinfo,
+				(__u64)newex.ec_block << blksize_bits,
+				(__u64)newex.ec_start << blksize_bits,
+				(__u64)newex.ec_len << blksize_bits,
+				flags);
+			if (err < 0)
+				break;
+			if (err == 1) {
+				err = 0;
+				break;
+			}
 		}
 
-		block = cbex.ec_block + cbex.ec_len;
+		block = newex.ec_block + newex.ec_len;
 	}
 
 	if (path) {
@@ -2156,7 +2194,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 			struct ext4_extent *ex)
 {
 	struct ext4_ext_cache *cex;
-	struct ext4_sb_info *sbi;
 	int ret = 0;
 
 	/*
@@ -2164,7 +2201,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 	 */
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
-	sbi = EXT4_SB(inode->i_sb);
 
 	/* has cache valid data? */
 	if (cex->ec_len == 0)
@@ -2273,7 +2309,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	int index;
-	int depth = ext_depth(inode);
+	int depth;
+
+	/* If we are converting the inline data, only one is needed here. */
+	if (ext4_has_inline_data(inode))
+		return 1;
+
+	depth = ext_depth(inode);
 
 	if (chunk)
 		index = depth * 2;
@@ -3461,115 +3503,34 @@ out:
 /**
  * ext4_find_delalloc_range: find delayed allocated block in the given range.
  *
- * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
- * whether there are any buffers marked for delayed allocation. It returns '1'
- * on the first delalloc'ed buffer head found. If no buffer head in the given
- * range is marked for delalloc, it returns 0.
- * lblk_start should always be <= lblk_end.
- * search_hint_reverse is to indicate that searching in reverse from lblk_end to
- * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
- * block sooner). This is useful when blocks are truncated sequentially from
- * lblk_start towards lblk_end.
+ * Return 1 if there is a delalloc block in the range, otherwise 0.
 */
 static int ext4_find_delalloc_range(struct inode *inode,
 				    ext4_lblk_t lblk_start,
-				    ext4_lblk_t lblk_end,
-				    int search_hint_reverse)
+				    ext4_lblk_t lblk_end)
 {
-	struct address_space *mapping = inode->i_mapping;
-	struct buffer_head *head, *bh = NULL;
-	struct page *page;
-	ext4_lblk_t i, pg_lblk;
-	pgoff_t index;
-
-	if (!test_opt(inode->i_sb, DELALLOC))
-		return 0;
-
-	/* reverse search wont work if fs block size is less than page size */
-	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
-		search_hint_reverse = 0;
+	struct extent_status es;
 
-	if (search_hint_reverse)
-		i = lblk_end;
+	es.start = lblk_start;
+	ext4_es_find_extent(inode, &es);
+	if (es.len == 0)
+		return 0; /* there is no delay extent in this tree */
+	else if (es.start <= lblk_start && lblk_start < es.start + es.len)
+		return 1;
+	else if (lblk_start <= es.start && es.start <= lblk_end)
+		return 1;
 	else
-		i = lblk_start;
+		return 0;
-
-	index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-	while ((i >= lblk_start) && (i <= lblk_end)) {
-		page = find_get_page(mapping, index);
-		if (!page)
-			goto nextpage;
-
-		if (!page_has_buffers(page))
-			goto nextpage;
-
-		head = page_buffers(page);
-		if (!head)
-			goto nextpage;
-
-		bh = head;
-		pg_lblk = index << (PAGE_CACHE_SHIFT -
-						inode->i_blkbits);
-		do {
-			if (unlikely(pg_lblk < lblk_start)) {
-				/*
-				 * This is possible when fs block size is less
-				 * than page size and our cluster starts/ends in
-				 * middle of the page. So we need to skip the
-				 * initial few blocks till we reach the 'lblk'
-				 */
-				pg_lblk++;
-				continue;
-			}
-
-			/* Check if the buffer is delayed allocated and that it
-			 * is not yet mapped. (when da-buffers are mapped during
-			 * their writeout, their da_mapped bit is set.)
-			 */
-			if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
-				page_cache_release(page);
-				trace_ext4_find_delalloc_range(inode,
-						lblk_start, lblk_end,
-						search_hint_reverse,
-						1, i);
-				return 1;
-			}
-			if (search_hint_reverse)
-				i--;
-			else
-				i++;
-		} while ((i >= lblk_start) && (i <= lblk_end) &&
-				((bh = bh->b_this_page) != head));
-nextpage:
-		if (page)
3546 page_cache_release(page);
3547 /*
3548 * Move to next page. 'i' will be the first lblk in the next
3549 * page.
3550 */
3551 if (search_hint_reverse)
3552 index--;
3553 else
3554 index++;
3555 i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3556 }
3557
3558 trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3559 search_hint_reverse, 0, 0);
3560 return 0;
3561} 3524}
3562 3525
3563int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, 3526int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
3564 int search_hint_reverse)
3565{ 3527{
3566 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3528 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3567 ext4_lblk_t lblk_start, lblk_end; 3529 ext4_lblk_t lblk_start, lblk_end;
3568 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); 3530 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3569 lblk_end = lblk_start + sbi->s_cluster_ratio - 1; 3531 lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3570 3532
3571 return ext4_find_delalloc_range(inode, lblk_start, lblk_end, 3533 return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
3572 search_hint_reverse);
3573} 3534}
3574 3535
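With the extent status tree in place, the delalloc check above reduces to a single lookup plus an interval-overlap test instead of a page-cache scan. A self-contained sketch of the overlap logic, assuming es holds the first delayed extent at or after lblk_start as ext4_es_find_extent() would return it (simplified types, not kernel code):

    #include <stdio.h>

    struct es { unsigned start, len; };

    static int range_has_delalloc(struct es es, unsigned lblk_start,
                                  unsigned lblk_end)
    {
        if (es.len == 0)
            return 0;   /* no delayed extent in the tree at all */
        if (es.start <= lblk_start && lblk_start < es.start + es.len)
            return 1;   /* extent covers the start of the range */
        if (lblk_start <= es.start && es.start <= lblk_end)
            return 1;   /* extent begins inside the range */
        return 0;
    }

    int main(void)
    {
        struct es es = { .start = 8, .len = 4 };   /* delayed blocks 8..11 */

        printf("%d %d %d\n",
               range_has_delalloc(es, 0, 7),    /* 0: ends before extent */
               range_has_delalloc(es, 10, 20),  /* 1: covers block 10 */
               range_has_delalloc(es, 0, 8));   /* 1: extent starts at 8 */
        return 0;
    }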
3575/** 3536/**
@@ -3630,7 +3591,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3630 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); 3591 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3631 lblk_to = lblk_from + c_offset - 1; 3592 lblk_to = lblk_from + c_offset - 1;
3632 3593
3633 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) 3594 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3634 allocated_clusters--; 3595 allocated_clusters--;
3635 } 3596 }
3636 3597
@@ -3640,7 +3601,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3640 lblk_from = lblk_start + num_blks; 3601 lblk_from = lblk_start + num_blks;
3641 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; 3602 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3642 3603
3643 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) 3604 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3644 allocated_clusters--; 3605 allocated_clusters--;
3645 } 3606 }
3646 3607
@@ -3663,8 +3624,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3663 flags, allocated); 3624 flags, allocated);
3664 ext4_ext_show_leaf(inode, path); 3625 ext4_ext_show_leaf(inode, path);
3665 3626
3666 trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, 3627 trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
3667 newblock); 3628 allocated, newblock);
3668 3629
3669 /* get_block() before submit the IO, split the extent */ 3630 /* get_block() before submit the IO, split the extent */
3670 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3631 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
@@ -3911,7 +3872,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3911 struct ext4_extent newex, *ex, *ex2; 3872 struct ext4_extent newex, *ex, *ex2;
3912 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3873 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3913 ext4_fsblk_t newblock = 0; 3874 ext4_fsblk_t newblock = 0;
3914 int free_on_err = 0, err = 0, depth, ret; 3875 int free_on_err = 0, err = 0, depth;
3915 unsigned int allocated = 0, offset = 0; 3876 unsigned int allocated = 0, offset = 0;
3916 unsigned int allocated_clusters = 0; 3877 unsigned int allocated_clusters = 0;
3917 struct ext4_allocation_request ar; 3878 struct ext4_allocation_request ar;
@@ -3927,7 +3888,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3927 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3888 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3928 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3889 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3929 if ((sbi->s_cluster_ratio > 1) && 3890 if ((sbi->s_cluster_ratio > 1) &&
3930 ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) 3891 ext4_find_delalloc_cluster(inode, map->m_lblk))
3931 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 3892 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3932 3893
3933 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3894 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -4007,15 +3968,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4007 ee_len, ee_start); 3968 ee_len, ee_start);
4008 goto out; 3969 goto out;
4009 } 3970 }
4010 ret = ext4_ext_handle_uninitialized_extents( 3971 allocated = ext4_ext_handle_uninitialized_extents(
4011 handle, inode, map, path, flags, 3972 handle, inode, map, path, flags,
4012 allocated, newblock); 3973 allocated, newblock);
4013 return ret; 3974 goto out3;
4014 } 3975 }
4015 } 3976 }
4016 3977
4017 if ((sbi->s_cluster_ratio > 1) && 3978 if ((sbi->s_cluster_ratio > 1) &&
4018 ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) 3979 ext4_find_delalloc_cluster(inode, map->m_lblk))
4019 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 3980 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4020 3981
4021 /* 3982 /*
@@ -4284,8 +4245,8 @@ out2:
4284 kfree(path); 4245 kfree(path);
4285 } 4246 }
4286 4247
4287 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 4248out3:
4288 newblock, map->m_len, err ? err : allocated); 4249 trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
4289 4250
4290 return err ? err : allocated; 4251 return err ? err : allocated;
4291} 4252}
@@ -4344,6 +4305,8 @@ void ext4_ext_truncate(struct inode *inode)
4344 4305
4345 last_block = (inode->i_size + sb->s_blocksize - 1) 4306 last_block = (inode->i_size + sb->s_blocksize - 1)
4346 >> EXT4_BLOCK_SIZE_BITS(sb); 4307 >> EXT4_BLOCK_SIZE_BITS(sb);
4308 err = ext4_es_remove_extent(inode, last_block,
4309 EXT_MAX_BLOCKS - last_block);
4347 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4310 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4348 4311
4349 /* In a multi-transaction truncate, we only make the final 4312 /* In a multi-transaction truncate, we only make the final
@@ -4434,6 +4397,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4434 if (mode & FALLOC_FL_PUNCH_HOLE) 4397 if (mode & FALLOC_FL_PUNCH_HOLE)
4435 return ext4_punch_hole(file, offset, len); 4398 return ext4_punch_hole(file, offset, len);
4436 4399
4400 ret = ext4_convert_inline_data(inode);
4401 if (ret)
4402 return ret;
4403
4437 trace_ext4_fallocate_enter(inode, offset, len, mode); 4404 trace_ext4_fallocate_enter(inode, offset, len, mode);
4438 map.m_lblk = offset >> blkbits; 4405 map.m_lblk = offset >> blkbits;
4439 /* 4406 /*
@@ -4572,206 +4539,43 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4572} 4539}
4573 4540
4574/* 4541/*
 4575 * Callback function called for each extent to gather FIEMAP information. 4542 * If newex is not an existing extent (newex->ec_start equals zero), find
 4543 * a delayed extent at the start of newex, update newex accordingly, and
 4544 * return the start of the next delayed extent.
 4545 *
 4546 * If newex is an existing extent (newex->ec_start is not equal to zero),
 4547 * return the start of the next delayed extent, or EXT_MAX_BLOCKS if no
 4548 * delayed extent is found. Leave newex unmodified.
4576 */ 4549 */
4577static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, 4550static int ext4_find_delayed_extent(struct inode *inode,
4578 struct ext4_ext_cache *newex, struct ext4_extent *ex, 4551 struct ext4_ext_cache *newex)
4579 void *data)
4580{ 4552{
4581 __u64 logical; 4553 struct extent_status es;
4582 __u64 physical; 4554 ext4_lblk_t next_del;
4583 __u64 length;
4584 __u32 flags = 0;
4585 int ret = 0;
4586 struct fiemap_extent_info *fieinfo = data;
4587 unsigned char blksize_bits;
4588 4555
4589 blksize_bits = inode->i_sb->s_blocksize_bits; 4556 es.start = newex->ec_block;
4590 logical = (__u64)newex->ec_block << blksize_bits; 4557 next_del = ext4_es_find_extent(inode, &es);
4591 4558
4592 if (newex->ec_start == 0) { 4559 if (newex->ec_start == 0) {
4593 /* 4560 /*
4594 * No extent in extent-tree contains block @newex->ec_start, 4561 * No extent in extent-tree contains block @newex->ec_start,
4595 * then the block may stay in 1)a hole or 2)delayed-extent. 4562 * then the block may stay in 1)a hole or 2)delayed-extent.
4596 *
4597 * Holes or delayed-extents are processed as follows.
4598 * 1. lookup dirty pages with specified range in pagecache.
4599 * If no page is got, then there is no delayed-extent and
4600 * return with EXT_CONTINUE.
4601 * 2. find the 1st mapped buffer,
4602 * 3. check if the mapped buffer is both in the request range
4603 * and a delayed buffer. If not, there is no delayed-extent,
4604 * then return.
4605 * 4. a delayed-extent is found, the extent will be collected.
4606 */ 4563 */
4607 ext4_lblk_t end = 0; 4564 if (es.len == 0)
4608 pgoff_t last_offset; 4565 /* A hole found. */
4609 pgoff_t offset; 4566 return 0;
4610 pgoff_t index;
4611 pgoff_t start_index = 0;
4612 struct page **pages = NULL;
4613 struct buffer_head *bh = NULL;
4614 struct buffer_head *head = NULL;
4615 unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
4616
4617 pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
4618 if (pages == NULL)
4619 return -ENOMEM;
4620
4621 offset = logical >> PAGE_SHIFT;
4622repeat:
4623 last_offset = offset;
4624 head = NULL;
4625 ret = find_get_pages_tag(inode->i_mapping, &offset,
4626 PAGECACHE_TAG_DIRTY, nr_pages, pages);
4627
4628 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4629 /* First time, try to find a mapped buffer. */
4630 if (ret == 0) {
4631out:
4632 for (index = 0; index < ret; index++)
4633 page_cache_release(pages[index]);
4634 /* just a hole. */
4635 kfree(pages);
4636 return EXT_CONTINUE;
4637 }
4638 index = 0;
4639
4640next_page:
4641 /* Try to find the 1st mapped buffer. */
4642 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
4643 blksize_bits;
4644 if (!page_has_buffers(pages[index]))
4645 goto out;
4646 head = page_buffers(pages[index]);
4647 if (!head)
4648 goto out;
4649
4650 index++;
4651 bh = head;
4652 do {
4653 if (end >= newex->ec_block +
4654 newex->ec_len)
4655 /* The buffer is out of
4656 * the request range.
4657 */
4658 goto out;
4659
4660 if (buffer_mapped(bh) &&
4661 end >= newex->ec_block) {
4662 start_index = index - 1;
4663 /* get the 1st mapped buffer. */
4664 goto found_mapped_buffer;
4665 }
4666
4667 bh = bh->b_this_page;
4668 end++;
4669 } while (bh != head);
4670
4671 /* No mapped buffer in the range found in this page,
4672 * We need to look up next page.
4673 */
4674 if (index >= ret) {
4675 /* There is no page left, but we need to limit
4676 * newex->ec_len.
4677 */
4678 newex->ec_len = end - newex->ec_block;
4679 goto out;
4680 }
4681 goto next_page;
4682 } else {
4683 /*Find contiguous delayed buffers. */
4684 if (ret > 0 && pages[0]->index == last_offset)
4685 head = page_buffers(pages[0]);
4686 bh = head;
4687 index = 1;
4688 start_index = 0;
4689 }
4690
4691found_mapped_buffer:
4692 if (bh != NULL && buffer_delay(bh)) {
4693 /* 1st or contiguous delayed buffer found. */
4694 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4695 /*
4696 * 1st delayed buffer found, record
4697 * the start of extent.
4698 */
4699 flags |= FIEMAP_EXTENT_DELALLOC;
4700 newex->ec_block = end;
4701 logical = (__u64)end << blksize_bits;
4702 }
4703 /* Find contiguous delayed buffers. */
4704 do {
4705 if (!buffer_delay(bh))
4706 goto found_delayed_extent;
4707 bh = bh->b_this_page;
4708 end++;
4709 } while (bh != head);
4710
4711 for (; index < ret; index++) {
4712 if (!page_has_buffers(pages[index])) {
4713 bh = NULL;
4714 break;
4715 }
4716 head = page_buffers(pages[index]);
4717 if (!head) {
4718 bh = NULL;
4719 break;
4720 }
4721
4722 if (pages[index]->index !=
4723 pages[start_index]->index + index
4724 - start_index) {
4725 /* Blocks are not contiguous. */
4726 bh = NULL;
4727 break;
4728 }
4729 bh = head;
4730 do {
4731 if (!buffer_delay(bh))
4732 /* Delayed-extent ends. */
4733 goto found_delayed_extent;
4734 bh = bh->b_this_page;
4735 end++;
4736 } while (bh != head);
4737 }
4738 } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4739 /* a hole found. */
4740 goto out;
4741 4567
4742found_delayed_extent: 4568 if (es.start > newex->ec_block) {
4743 newex->ec_len = min(end - newex->ec_block, 4569 /* A hole found. */
4744 (ext4_lblk_t)EXT_INIT_MAX_LEN); 4570 newex->ec_len = min(es.start - newex->ec_block,
4745 if (ret == nr_pages && bh != NULL && 4571 newex->ec_len);
4746 newex->ec_len < EXT_INIT_MAX_LEN && 4572 return 0;
4747 buffer_delay(bh)) {
4748 /* Have not collected an extent and continue. */
4749 for (index = 0; index < ret; index++)
4750 page_cache_release(pages[index]);
4751 goto repeat;
4752 } 4573 }
4753 4574
4754 for (index = 0; index < ret; index++) 4575 newex->ec_len = es.start + es.len - newex->ec_block;
4755 page_cache_release(pages[index]);
4756 kfree(pages);
4757 } 4576 }
4758 4577
4759 physical = (__u64)newex->ec_start << blksize_bits; 4578 return next_del;
4760 length = (__u64)newex->ec_len << blksize_bits;
4761
4762 if (ex && ext4_ext_is_uninitialized(ex))
4763 flags |= FIEMAP_EXTENT_UNWRITTEN;
4764
4765 if (next == EXT_MAX_BLOCKS)
4766 flags |= FIEMAP_EXTENT_LAST;
4767
4768 ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4769 length, flags);
4770 if (ret < 0)
4771 return ret;
4772 if (ret == 1)
4773 return EXT_BREAK;
4774 return EXT_CONTINUE;
4775} 4579}
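For the hole case (ec_start == 0), ext4_find_delayed_extent() trims the reported region so it ends where the next delayed extent begins, or grows it to the delayed extent's end when the extent covers the starting block. A compact standalone model of that length fixup (hypothetical types, not the kernel implementation):

    #include <stdio.h>

    struct ext { unsigned block, len; };

    static void fixup_hole(struct ext *newex, unsigned es_start, unsigned es_len)
    {
        if (es_len == 0)
            return;                       /* pure hole, leave as-is */
        if (es_start > newex->block) {    /* hole, then delayed extent */
            unsigned gap = es_start - newex->block;
            if (gap < newex->len)
                newex->len = gap;
            return;
        }
        /* delayed extent covers newex->block: report up to its end */
        newex->len = es_start + es_len - newex->block;
    }

    int main(void)
    {
        struct ext hole = { .block = 100, .len = 50 };

        fixup_hole(&hole, 120, 8);        /* delayed blocks 120..127 */
        printf("len=%u\n", hole.len);     /* 20: hole is 100..119 */
        return 0;
    }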
4776/* fiemap flags we can handle specified here */ 4580/* fiemap flags we can handle specified here */
4777#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 4581#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -4971,6 +4775,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4971 ext4_ext_invalidate_cache(inode); 4775 ext4_ext_invalidate_cache(inode);
4972 ext4_discard_preallocations(inode); 4776 ext4_discard_preallocations(inode);
4973 4777
4778 err = ext4_es_remove_extent(inode, first_block,
4779 stop_block - first_block);
4974 err = ext4_ext_remove_space(inode, first_block, stop_block - 1); 4780 err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
4975 4781
4976 ext4_ext_invalidate_cache(inode); 4782 ext4_ext_invalidate_cache(inode);
@@ -4991,12 +4797,22 @@ out_mutex:
4991 mutex_unlock(&inode->i_mutex); 4797 mutex_unlock(&inode->i_mutex);
4992 return err; 4798 return err;
4993} 4799}
4800
4994int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4801int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4995 __u64 start, __u64 len) 4802 __u64 start, __u64 len)
4996{ 4803{
4997 ext4_lblk_t start_blk; 4804 ext4_lblk_t start_blk;
4998 int error = 0; 4805 int error = 0;
4999 4806
4807 if (ext4_has_inline_data(inode)) {
4808 int has_inline = 1;
4809
4810 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
4811
4812 if (has_inline)
4813 return error;
4814 }
4815
5000 /* fallback to generic here if not in extents fmt */ 4816 /* fallback to generic here if not in extents fmt */
5001 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4817 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5002 return generic_block_fiemap(inode, fieinfo, start, len, 4818 return generic_block_fiemap(inode, fieinfo, start, len,
@@ -5018,11 +4834,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5018 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 4834 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5019 4835
5020 /* 4836 /*
5021 * Walk the extent tree gathering extent information. 4837 * Walk the extent tree gathering extent information
5022 * ext4_ext_fiemap_cb will push extents back to user. 4838 * and pushing extents back to the user.
5023 */ 4839 */
5024 error = ext4_ext_walk_space(inode, start_blk, len_blks, 4840 error = ext4_fill_fiemap_extents(inode, start_blk,
5025 ext4_ext_fiemap_cb, fieinfo); 4841 len_blks, fieinfo);
5026 } 4842 }
5027 4843
5028 return error; 4844 return error;
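The net effect of these extents.c changes is visible from userspace through the FIEMAP ioctl. A small test program (standard Linux API; error handling trimmed for brevity) that prints each extent with the delalloc/unwritten/last flags the new code sets:

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    int main(int argc, char **argv)
    {
        if (argc != 2)
            return 1;
        int fd = open(argv[1], O_RDONLY);
        if (fd < 0)
            return 1;

        unsigned count = 32;
        struct fiemap *fm = calloc(1, sizeof(*fm) +
                                   count * sizeof(struct fiemap_extent));
        if (!fm)
            return 1;
        fm->fm_length = ~0ULL;          /* map the whole file */
        fm->fm_extent_count = count;

        if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
            perror("fiemap");
            return 1;
        }
        for (unsigned i = 0; i < fm->fm_mapped_extents; i++) {
            struct fiemap_extent *fe = &fm->fm_extents[i];
            printf("%llu..%llu%s%s%s\n",
                   (unsigned long long)fe->fe_logical,
                   (unsigned long long)(fe->fe_logical + fe->fe_length - 1),
                   (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ? " delalloc" : "",
                   (fe->fe_flags & FIEMAP_EXTENT_UNWRITTEN) ? " unwritten" : "",
                   (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " last" : "");
        }
        free(fm);
        close(fd);
        return 0;
    }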
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
new file mode 100644
index 000000000000..564d981a2fcc
--- /dev/null
+++ b/fs/ext4/extents_status.c
@@ -0,0 +1,500 @@
1/*
2 * fs/ext4/extents_status.c
3 *
4 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
5 * Modified by
6 * Allison Henderson <achender@linux.vnet.ibm.com>
7 * Hugh Dickins <hughd@google.com>
8 * Zheng Liu <wenqing.lz@taobao.com>
9 *
10 * Ext4 extents status tree core functions.
11 */
12#include <linux/rbtree.h>
13#include "ext4.h"
14#include "extents_status.h"
15#include "ext4_extents.h"
16
17#include <trace/events/ext4.h>
18
 19/*
 20 * According to previous discussion at the Ext4 Developer Workshop, we
 21 * will introduce a new structure called the io tree to track all extent
 22 * status in order to solve some problems that we have met
 23 * (e.g. the reserved space warning) and to provide extent-level locking.
 24 * The delay extent tree is the first step towards this goal. It was
 25 * originally built by Yongqiang Yang. At that time it was called the
 26 * delay extent tree, whose only goal was to track delayed extents in
 27 * memory to simplify the implementation of fiemap and bigalloc, and to
 28 * introduce lseek SEEK_DATA/SEEK_HOLE support. That is why it is still
 29 * called the delay extent tree in the comments below. But to better
 30 * reflect what it does, it has been renamed the extent status tree.
 31 *
 32 * Currently the first step has been done. All delayed extents are
 33 * tracked in the tree. The tree is maintained when a delayed
 34 * allocation is issued and when the delayed extent is written out or
 35 * invalidated. Therefore the implementations of fiemap and bigalloc
 36 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 37 *
 38 * The following comment describes the implementation of the extent
 39 * status tree and future work.
 40 */
41
42/*
43 * extents status tree implementation for ext4.
44 *
45 *
46 * ==========================================================================
 47 * Extent status encompasses delayed extents and extent locks
 48 *
 49 * 1. Why a delayed extent implementation?
 50 *
 51 * Without delayed extents, ext4 identifies a delayed extent by looking
 52 * up the page cache; this has several deficiencies - complicated,
 53 * buggy, and inefficient code.
 54 *
 55 * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
 56 * to know whether a block or a range of blocks belongs to a delayed
 57 * extent.
58 *
 59 * Let us have a look at how they work without delayed extents.
60 * -- FIEMAP
 61 * FIEMAP looks up the page cache to distinguish delayed allocations from holes.
62 *
63 * -- SEEK_HOLE/DATA
64 * SEEK_HOLE/DATA has the same problem as FIEMAP.
65 *
66 * -- bigalloc
 67 * bigalloc looks up the page cache to figure out whether a block
 68 * is already under delayed allocation, in order to determine
 69 * whether a quota reservation is needed for the cluster.
70 *
71 * -- punch hole
 72 * punch hole looks up the page cache to identify a delayed extent.
73 *
74 * -- writeout
 75 * Writeout looks up the whole page cache to see if a buffer is
 76 * mapped. If there are not very many delayed buffers, this is
 77 * time consuming.
78 *
 79 * With the delayed extent implementation, FIEMAP, SEEK_HOLE/DATA,
 80 * bigalloc and writeout can figure out whether a block or a range of
 81 * blocks is under delayed allocation (i.e., belongs to a delayed
 82 * extent) simply by searching the delayed extent tree.
83 *
84 *
85 * ==========================================================================
 86 * 2. ext4 delayed extent implementation
87 *
88 * -- delayed extent
 89 * A delayed extent is a range of blocks which are logically
 90 * contiguous and under delayed allocation. Unlike an on-disk
 91 * extent, a delayed extent is an in-memory structure; there is
 92 * no corresponding on-disk data. There is no limit on the length
 93 * of a delayed extent, so a delayed extent can contain as many
 94 * blocks as are logically contiguous.
95 *
96 * -- delayed extent tree
 97 * Every inode has a delayed extent tree, and all blocks under
 98 * delayed allocation are added to the tree as delayed extents.
99 * Delayed extents in the tree are ordered by logical block no.
100 *
101 * -- operations on a delayed extent tree
 102 * There are three operations on a delayed extent tree: finding the
 103 * next delayed extent, adding a space (a range of blocks) and
 104 * removing a space.
105 *
106 * -- race on a delayed extent tree
 107 * The delayed extent tree is protected by inode->i_es_lock.
108 *
109 *
110 * ==========================================================================
111 * 3. performance analysis
112 * -- overhead
 113 * 1. There is a cached extent for write access, so if writes are
 114 * not very random, adding-space operations run in O(1) time.
115 *
116 * -- gain
117 * 2. Code is much simpler, more readable, more maintainable and
118 * more efficient.
119 *
120 *
121 * ==========================================================================
122 * 4. TODO list
123 * -- Track all extent status
124 *
125 * -- Improve get block process
126 *
127 * -- Extent-level locking
128 */
129
130static struct kmem_cache *ext4_es_cachep;
131
132int __init ext4_init_es(void)
133{
134 ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
135 if (ext4_es_cachep == NULL)
136 return -ENOMEM;
137 return 0;
138}
139
140void ext4_exit_es(void)
141{
142 if (ext4_es_cachep)
143 kmem_cache_destroy(ext4_es_cachep);
144}
145
146void ext4_es_init_tree(struct ext4_es_tree *tree)
147{
148 tree->root = RB_ROOT;
149 tree->cache_es = NULL;
150}
151
152#ifdef ES_DEBUG__
153static void ext4_es_print_tree(struct inode *inode)
154{
155 struct ext4_es_tree *tree;
156 struct rb_node *node;
157
158 printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
159 tree = &EXT4_I(inode)->i_es_tree;
160 node = rb_first(&tree->root);
161 while (node) {
162 struct extent_status *es;
163 es = rb_entry(node, struct extent_status, rb_node);
164 printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
165 node = rb_next(node);
166 }
167 printk(KERN_DEBUG "\n");
168}
169#else
170#define ext4_es_print_tree(inode)
171#endif
172
173static inline ext4_lblk_t extent_status_end(struct extent_status *es)
174{
175 BUG_ON(es->start + es->len < es->start);
176 return es->start + es->len - 1;
177}
178
179/*
 180 * Search through the tree for a delayed extent at a given offset. If
 181 * it can't be found, try to find the next extent.
182 */
183static struct extent_status *__es_tree_search(struct rb_root *root,
184 ext4_lblk_t offset)
185{
186 struct rb_node *node = root->rb_node;
187 struct extent_status *es = NULL;
188
189 while (node) {
190 es = rb_entry(node, struct extent_status, rb_node);
191 if (offset < es->start)
192 node = node->rb_left;
193 else if (offset > extent_status_end(es))
194 node = node->rb_right;
195 else
196 return es;
197 }
198
199 if (es && offset < es->start)
200 return es;
201
202 if (es && offset > extent_status_end(es)) {
203 node = rb_next(&es->rb_node);
204 return node ? rb_entry(node, struct extent_status, rb_node) :
205 NULL;
206 }
207
208 return NULL;
209}
210
211/*
212 * ext4_es_find_extent: find the 1st delayed extent covering @es->start
 213 * if it exists; otherwise, find the next extent after @es->start.
214 *
215 * @inode: the inode which owns delayed extents
216 * @es: delayed extent that we found
217 *
 218 * Returns the first block of the next extent after @es, or
 219 * EXT_MAX_BLOCKS if no further delayed extent is found.
 220 * The delayed extent found is returned via @es.
221 */
222ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
223{
224 struct ext4_es_tree *tree = NULL;
225 struct extent_status *es1 = NULL;
226 struct rb_node *node;
227 ext4_lblk_t ret = EXT_MAX_BLOCKS;
228
229 trace_ext4_es_find_extent_enter(inode, es->start);
230
231 read_lock(&EXT4_I(inode)->i_es_lock);
232 tree = &EXT4_I(inode)->i_es_tree;
233
 234 /* check the cached extent first */
235 if (tree->cache_es) {
236 es1 = tree->cache_es;
237 if (in_range(es->start, es1->start, es1->len)) {
238 es_debug("%u cached by [%u/%u)\n",
239 es->start, es1->start, es1->len);
240 goto out;
241 }
242 }
243
244 es->len = 0;
245 es1 = __es_tree_search(&tree->root, es->start);
246
247out:
248 if (es1) {
249 tree->cache_es = es1;
250 es->start = es1->start;
251 es->len = es1->len;
252 node = rb_next(&es1->rb_node);
253 if (node) {
254 es1 = rb_entry(node, struct extent_status, rb_node);
255 ret = es1->start;
256 }
257 }
258
259 read_unlock(&EXT4_I(inode)->i_es_lock);
260
261 trace_ext4_es_find_extent_exit(inode, es, ret);
262 return ret;
263}
264
265static struct extent_status *
266ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
267{
268 struct extent_status *es;
269 es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
270 if (es == NULL)
271 return NULL;
272 es->start = start;
273 es->len = len;
274 return es;
275}
276
277static void ext4_es_free_extent(struct extent_status *es)
278{
279 kmem_cache_free(ext4_es_cachep, es);
280}
281
282static struct extent_status *
283ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
284{
285 struct extent_status *es1;
286 struct rb_node *node;
287
288 node = rb_prev(&es->rb_node);
289 if (!node)
290 return es;
291
292 es1 = rb_entry(node, struct extent_status, rb_node);
293 if (es->start == extent_status_end(es1) + 1) {
294 es1->len += es->len;
295 rb_erase(&es->rb_node, &tree->root);
296 ext4_es_free_extent(es);
297 es = es1;
298 }
299
300 return es;
301}
302
303static struct extent_status *
304ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
305{
306 struct extent_status *es1;
307 struct rb_node *node;
308
309 node = rb_next(&es->rb_node);
310 if (!node)
311 return es;
312
313 es1 = rb_entry(node, struct extent_status, rb_node);
314 if (es1->start == extent_status_end(es) + 1) {
315 es->len += es1->len;
316 rb_erase(node, &tree->root);
317 ext4_es_free_extent(es1);
318 }
319
320 return es;
321}
322
323static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
324 ext4_lblk_t len)
325{
326 struct rb_node **p = &tree->root.rb_node;
327 struct rb_node *parent = NULL;
328 struct extent_status *es;
329 ext4_lblk_t end = offset + len - 1;
330
331 BUG_ON(end < offset);
332 es = tree->cache_es;
333 if (es && offset == (extent_status_end(es) + 1)) {
334 es_debug("cached by [%u/%u)\n", es->start, es->len);
335 es->len += len;
336 es = ext4_es_try_to_merge_right(tree, es);
337 goto out;
338 } else if (es && es->start == end + 1) {
339 es_debug("cached by [%u/%u)\n", es->start, es->len);
340 es->start = offset;
341 es->len += len;
342 es = ext4_es_try_to_merge_left(tree, es);
343 goto out;
344 } else if (es && es->start <= offset &&
345 end <= extent_status_end(es)) {
346 es_debug("cached by [%u/%u)\n", es->start, es->len);
347 goto out;
348 }
349
350 while (*p) {
351 parent = *p;
352 es = rb_entry(parent, struct extent_status, rb_node);
353
354 if (offset < es->start) {
355 if (es->start == end + 1) {
356 es->start = offset;
357 es->len += len;
358 es = ext4_es_try_to_merge_left(tree, es);
359 goto out;
360 }
361 p = &(*p)->rb_left;
362 } else if (offset > extent_status_end(es)) {
363 if (offset == extent_status_end(es) + 1) {
364 es->len += len;
365 es = ext4_es_try_to_merge_right(tree, es);
366 goto out;
367 }
368 p = &(*p)->rb_right;
369 } else {
370 if (extent_status_end(es) <= end)
371 es->len = offset - es->start + len;
372 goto out;
373 }
374 }
375
376 es = ext4_es_alloc_extent(offset, len);
377 if (!es)
378 return -ENOMEM;
379 rb_link_node(&es->rb_node, parent, p);
380 rb_insert_color(&es->rb_node, &tree->root);
381
382out:
383 tree->cache_es = es;
384 return 0;
385}
386
387/*
388 * ext4_es_insert_extent() adds a space to a delayed extent tree.
389 * Caller holds inode->i_es_lock.
390 *
391 * ext4_es_insert_extent is called by ext4_da_write_begin and
392 * ext4_es_remove_extent.
393 *
394 * Return 0 on success, error code on failure.
395 */
396int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
397 ext4_lblk_t len)
398{
399 struct ext4_es_tree *tree;
400 int err = 0;
401
402 trace_ext4_es_insert_extent(inode, offset, len);
403 es_debug("add [%u/%u) to extent status tree of inode %lu\n",
404 offset, len, inode->i_ino);
405
406 write_lock(&EXT4_I(inode)->i_es_lock);
407 tree = &EXT4_I(inode)->i_es_tree;
408 err = __es_insert_extent(tree, offset, len);
409 write_unlock(&EXT4_I(inode)->i_es_lock);
410
411 ext4_es_print_tree(inode);
412
413 return err;
414}
415
416/*
417 * ext4_es_remove_extent() removes a space from a delayed extent tree.
 418 * It takes inode->i_es_lock for writing internally.
419 *
420 * Return 0 on success, error code on failure.
421 */
422int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
423 ext4_lblk_t len)
424{
425 struct rb_node *node;
426 struct ext4_es_tree *tree;
427 struct extent_status *es;
428 struct extent_status orig_es;
429 ext4_lblk_t len1, len2, end;
430 int err = 0;
431
432 trace_ext4_es_remove_extent(inode, offset, len);
433 es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
434 offset, len, inode->i_ino);
435
436 end = offset + len - 1;
437 BUG_ON(end < offset);
438 write_lock(&EXT4_I(inode)->i_es_lock);
439 tree = &EXT4_I(inode)->i_es_tree;
440 es = __es_tree_search(&tree->root, offset);
441 if (!es)
442 goto out;
443 if (es->start > end)
444 goto out;
445
446 /* Simply invalidate cache_es. */
447 tree->cache_es = NULL;
448
449 orig_es.start = es->start;
450 orig_es.len = es->len;
451 len1 = offset > es->start ? offset - es->start : 0;
452 len2 = extent_status_end(es) > end ?
453 extent_status_end(es) - end : 0;
454 if (len1 > 0)
455 es->len = len1;
456 if (len2 > 0) {
457 if (len1 > 0) {
458 err = __es_insert_extent(tree, end + 1, len2);
459 if (err) {
460 es->start = orig_es.start;
461 es->len = orig_es.len;
462 goto out;
463 }
464 } else {
465 es->start = end + 1;
466 es->len = len2;
467 }
468 goto out;
469 }
470
471 if (len1 > 0) {
472 node = rb_next(&es->rb_node);
473 if (node)
474 es = rb_entry(node, struct extent_status, rb_node);
475 else
476 es = NULL;
477 }
478
479 while (es && extent_status_end(es) <= end) {
480 node = rb_next(&es->rb_node);
481 rb_erase(&es->rb_node, &tree->root);
482 ext4_es_free_extent(es);
483 if (!node) {
484 es = NULL;
485 break;
486 }
487 es = rb_entry(node, struct extent_status, rb_node);
488 }
489
490 if (es && es->start < end + 1) {
491 len1 = extent_status_end(es) - end;
492 es->start = end + 1;
493 es->len = len1;
494 }
495
496out:
497 write_unlock(&EXT4_I(inode)->i_es_lock);
498 ext4_es_print_tree(inode);
499 return err;
500}
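The insert and remove paths above maintain two invariants: inserting a range adjacent to an existing extent merges the two, and removing a range from the middle of an extent splits it in two. A userspace model of those semantics, with a sorted array standing in for the rb-tree (illustrative code assuming non-overlapping inserts; not the kernel implementation):

    #include <stdio.h>
    #include <string.h>

    struct es { unsigned start, len; };

    static struct es tree[64];
    static int nr;

    static unsigned end_of(struct es *e) { return e->start + e->len - 1; }

    /* Insert [start, start+len) and merge with adjacent neighbours. */
    static void es_insert(unsigned start, unsigned len)
    {
        int i = 0;

        while (i < nr && tree[i].start < start)
            i++;
        memmove(&tree[i + 1], &tree[i], (nr - i) * sizeof(tree[0]));
        tree[i] = (struct es){ start, len };
        nr++;
        /* merge right, then left */
        if (i + 1 < nr && tree[i + 1].start == end_of(&tree[i]) + 1) {
            tree[i].len += tree[i + 1].len;
            memmove(&tree[i + 1], &tree[i + 2],
                    (nr - i - 2) * sizeof(tree[0]));
            nr--;
        }
        if (i > 0 && tree[i].start == end_of(&tree[i - 1]) + 1) {
            tree[i - 1].len += tree[i].len;
            memmove(&tree[i], &tree[i + 1], (nr - i - 1) * sizeof(tree[0]));
            nr--;
        }
    }

    /* Remove [start, start+len), splitting an extent if needed. */
    static void es_remove(unsigned start, unsigned len)
    {
        unsigned end = start + len - 1;

        for (int i = 0; i < nr; i++) {
            struct es *e = &tree[i];
            if (e->start > end || end_of(e) < start)
                continue;
            if (e->start < start && end_of(e) > end) {   /* split */
                unsigned tail = end_of(e) - end;
                e->len = start - e->start;
                es_insert(end + 1, tail);
                return;
            }
            if (e->start < start) {                      /* trim tail */
                e->len = start - e->start;
            } else if (end_of(e) > end) {                /* trim head */
                e->len = end_of(e) - end;
                e->start = end + 1;
            } else {                                     /* drop whole */
                memmove(e, e + 1, (nr - i - 1) * sizeof(tree[0]));
                nr--; i--;
            }
        }
    }

    int main(void)
    {
        es_insert(0, 4);
        es_insert(4, 4);          /* merges into [0/8) */
        es_remove(2, 2);          /* splits into [0/2) and [4/4) */
        for (int i = 0; i < nr; i++)
            printf("[%u/%u) ", tree[i].start, tree[i].len);
        printf("\n");             /* prints: [0/2) [4/4) */
        return 0;
    }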
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
new file mode 100644
index 000000000000..077f82db092a
--- /dev/null
+++ b/fs/ext4/extents_status.h
@@ -0,0 +1,45 @@
1/*
2 * fs/ext4/extents_status.h
3 *
4 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
5 * Modified by
6 * Allison Henderson <achender@linux.vnet.ibm.com>
7 * Zheng Liu <wenqing.lz@taobao.com>
8 *
9 */
10
11#ifndef _EXT4_EXTENTS_STATUS_H
12#define _EXT4_EXTENTS_STATUS_H
13
14/*
15 * Turn on ES_DEBUG__ to get lots of info about extent status operations.
16 */
17#ifdef ES_DEBUG__
18#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
19#else
20#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
21#endif
22
23struct extent_status {
24 struct rb_node rb_node;
25 ext4_lblk_t start; /* first block extent covers */
26 ext4_lblk_t len; /* length of extent in block */
27};
28
29struct ext4_es_tree {
30 struct rb_root root;
31 struct extent_status *cache_es; /* recently accessed extent */
32};
33
34extern int __init ext4_init_es(void);
35extern void ext4_exit_es(void);
36extern void ext4_es_init_tree(struct ext4_es_tree *tree);
37
38extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start,
39 ext4_lblk_t len);
40extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start,
41 ext4_lblk_t len);
42extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
43 struct extent_status *es);
44
45#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bf3966bccd34..d07c27ca594a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -24,6 +24,7 @@
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/path.h> 25#include <linux/path.h>
26#include <linux/quotaops.h> 26#include <linux/quotaops.h>
27#include <linux/pagevec.h>
27#include "ext4.h" 28#include "ext4.h"
28#include "ext4_jbd2.h" 29#include "ext4_jbd2.h"
29#include "xattr.h" 30#include "xattr.h"
@@ -286,11 +287,329 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
286} 287}
287 288
288/* 289/*
 290 * Here we use ext4_map_blocks() to get a block mapping for an extent-based
 291 * file rather than ext4_ext_walk_space() because we can handle
 292 * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same
 293 * function. Once the extent status tree has been fully implemented, it will
 294 * track all extent status for a file and we can use it directly to
 295 * retrieve the offset for SEEK_DATA/SEEK_HOLE.
296 */
297
298/*
 299 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to
 300 * look up the page cache to check whether there is any data in
 301 * [startoff, endoff], because if this range contains an unwritten extent,
 302 * we treat the extent as data or as a hole according to whether the
 303 * page cache has data or not.
304 */
305static int ext4_find_unwritten_pgoff(struct inode *inode,
306 int whence,
307 struct ext4_map_blocks *map,
308 loff_t *offset)
309{
310 struct pagevec pvec;
311 unsigned int blkbits;
312 pgoff_t index;
313 pgoff_t end;
314 loff_t endoff;
315 loff_t startoff;
316 loff_t lastoff;
317 int found = 0;
318
319 blkbits = inode->i_sb->s_blocksize_bits;
320 startoff = *offset;
321 lastoff = startoff;
322 endoff = (map->m_lblk + map->m_len) << blkbits;
323
324 index = startoff >> PAGE_CACHE_SHIFT;
325 end = endoff >> PAGE_CACHE_SHIFT;
326
327 pagevec_init(&pvec, 0);
328 do {
329 int i, num;
330 unsigned long nr_pages;
331
332 num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
333 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
334 (pgoff_t)num);
335 if (nr_pages == 0) {
336 if (whence == SEEK_DATA)
337 break;
338
339 BUG_ON(whence != SEEK_HOLE);
340 /*
 341 * If this is the first pass of the loop and the
 342 * offset is not beyond the end offset, there is a
 343 * hole at this offset.
344 */
345 if (lastoff == startoff || lastoff < endoff)
346 found = 1;
347 break;
348 }
349
350 /*
 351 * If this is the first pass of the loop and the
 352 * offset is smaller than the first page offset, there is a
 353 * hole at this offset.
354 */
355 if (lastoff == startoff && whence == SEEK_HOLE &&
356 lastoff < page_offset(pvec.pages[0])) {
357 found = 1;
358 break;
359 }
360
361 for (i = 0; i < nr_pages; i++) {
362 struct page *page = pvec.pages[i];
363 struct buffer_head *bh, *head;
364
365 /*
 366 * If the current offset is not beyond the end of the given
 367 * range, it is a hole.
368 */
369 if (lastoff < endoff && whence == SEEK_HOLE &&
370 page->index > end) {
371 found = 1;
372 *offset = lastoff;
373 goto out;
374 }
375
376 lock_page(page);
377
378 if (unlikely(page->mapping != inode->i_mapping)) {
379 unlock_page(page);
380 continue;
381 }
382
383 if (!page_has_buffers(page)) {
384 unlock_page(page);
385 continue;
386 }
387
388 if (page_has_buffers(page)) {
389 lastoff = page_offset(page);
390 bh = head = page_buffers(page);
391 do {
392 if (buffer_uptodate(bh) ||
393 buffer_unwritten(bh)) {
394 if (whence == SEEK_DATA)
395 found = 1;
396 } else {
397 if (whence == SEEK_HOLE)
398 found = 1;
399 }
400 if (found) {
401 *offset = max_t(loff_t,
402 startoff, lastoff);
403 unlock_page(page);
404 goto out;
405 }
406 lastoff += bh->b_size;
407 bh = bh->b_this_page;
408 } while (bh != head);
409 }
410
411 lastoff = page_offset(page) + PAGE_SIZE;
412 unlock_page(page);
413 }
414
415 /*
 416 * Fewer pages were returned than we asked for, so there is
 417 * a hole in the rest of the range.
418 */
419 if (nr_pages < num && whence == SEEK_HOLE) {
420 found = 1;
421 *offset = lastoff;
422 break;
423 }
424
425 index = pvec.pages[i - 1]->index + 1;
426 pagevec_release(&pvec);
427 } while (index <= end);
428
429out:
430 pagevec_release(&pvec);
431 return found;
432}
433
434/*
435 * ext4_seek_data() retrieves the offset for SEEK_DATA.
436 */
437static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
438{
439 struct inode *inode = file->f_mapping->host;
440 struct ext4_map_blocks map;
441 struct extent_status es;
442 ext4_lblk_t start, last, end;
443 loff_t dataoff, isize;
444 int blkbits;
445 int ret = 0;
446
447 mutex_lock(&inode->i_mutex);
448
449 isize = i_size_read(inode);
450 if (offset >= isize) {
451 mutex_unlock(&inode->i_mutex);
452 return -ENXIO;
453 }
454
455 blkbits = inode->i_sb->s_blocksize_bits;
456 start = offset >> blkbits;
457 last = start;
458 end = isize >> blkbits;
459 dataoff = offset;
460
461 do {
462 map.m_lblk = last;
463 map.m_len = end - last + 1;
464 ret = ext4_map_blocks(NULL, inode, &map, 0);
465 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
466 if (last != start)
467 dataoff = last << blkbits;
468 break;
469 }
470
471 /*
 472 * If there is a delayed extent at this offset,
 473 * it is treated as data.
474 */
475 es.start = last;
476 (void)ext4_es_find_extent(inode, &es);
477 if (last >= es.start &&
478 last < es.start + es.len) {
479 if (last != start)
480 dataoff = last << blkbits;
481 break;
482 }
483
484 /*
 485 * If there is an unwritten extent at this offset,
 486 * it is treated as data or as a hole according to
 487 * whether the page cache has data or not.
488 */
489 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
490 int unwritten;
491 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
492 &map, &dataoff);
493 if (unwritten)
494 break;
495 }
496
497 last++;
498 dataoff = last << blkbits;
499 } while (last <= end);
500
501 mutex_unlock(&inode->i_mutex);
502
503 if (dataoff > isize)
504 return -ENXIO;
505
506 if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
507 return -EINVAL;
508 if (dataoff > maxsize)
509 return -EINVAL;
510
511 if (dataoff != file->f_pos) {
512 file->f_pos = dataoff;
513 file->f_version = 0;
514 }
515
516 return dataoff;
517}
518
519/*
520 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
521 */
522static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
523{
524 struct inode *inode = file->f_mapping->host;
525 struct ext4_map_blocks map;
526 struct extent_status es;
527 ext4_lblk_t start, last, end;
528 loff_t holeoff, isize;
529 int blkbits;
530 int ret = 0;
531
532 mutex_lock(&inode->i_mutex);
533
534 isize = i_size_read(inode);
535 if (offset >= isize) {
536 mutex_unlock(&inode->i_mutex);
537 return -ENXIO;
538 }
539
540 blkbits = inode->i_sb->s_blocksize_bits;
541 start = offset >> blkbits;
542 last = start;
543 end = isize >> blkbits;
544 holeoff = offset;
545
546 do {
547 map.m_lblk = last;
548 map.m_len = end - last + 1;
549 ret = ext4_map_blocks(NULL, inode, &map, 0);
550 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
551 last += ret;
552 holeoff = last << blkbits;
553 continue;
554 }
555
556 /*
 557 * If there is a delayed extent at this offset,
558 * we will skip this extent.
559 */
560 es.start = last;
561 (void)ext4_es_find_extent(inode, &es);
562 if (last >= es.start &&
563 last < es.start + es.len) {
564 last = es.start + es.len;
565 holeoff = last << blkbits;
566 continue;
567 }
568
569 /*
 570 * If there is an unwritten extent at this offset,
 571 * it is treated as data or as a hole according to
 572 * whether the page cache has data or not.
573 */
574 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
575 int unwritten;
576 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
577 &map, &holeoff);
578 if (!unwritten) {
579 last += ret;
580 holeoff = last << blkbits;
581 continue;
582 }
583 }
584
585 /* find a hole */
586 break;
587 } while (last <= end);
588
589 mutex_unlock(&inode->i_mutex);
590
591 if (holeoff > isize)
592 holeoff = isize;
593
594 if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
595 return -EINVAL;
596 if (holeoff > maxsize)
597 return -EINVAL;
598
599 if (holeoff != file->f_pos) {
600 file->f_pos = holeoff;
601 file->f_version = 0;
602 }
603
604 return holeoff;
605}
606
607/*
289 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values 608 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
290 * by calling generic_file_llseek_size() with the appropriate maxbytes 609 * by calling generic_file_llseek_size() with the appropriate maxbytes
291 * value for each. 610 * value for each.
292 */ 611 */
293loff_t ext4_llseek(struct file *file, loff_t offset, int origin) 612loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
294{ 613{
295 struct inode *inode = file->f_mapping->host; 614 struct inode *inode = file->f_mapping->host;
296 loff_t maxbytes; 615 loff_t maxbytes;
@@ -300,8 +619,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
300 else 619 else
301 maxbytes = inode->i_sb->s_maxbytes; 620 maxbytes = inode->i_sb->s_maxbytes;
302 621
303 return generic_file_llseek_size(file, offset, origin, 622 switch (whence) {
304 maxbytes, i_size_read(inode)); 623 case SEEK_SET:
624 case SEEK_CUR:
625 case SEEK_END:
626 return generic_file_llseek_size(file, offset, whence,
627 maxbytes, i_size_read(inode));
628 case SEEK_DATA:
629 return ext4_seek_data(file, offset, maxbytes);
630 case SEEK_HOLE:
631 return ext4_seek_hole(file, offset, maxbytes);
632 }
633
634 return -EINVAL;
305} 635}
306 636
307const struct file_operations ext4_file_operations = { 637const struct file_operations ext4_file_operations = {
@@ -326,12 +656,10 @@ const struct file_operations ext4_file_operations = {
326const struct inode_operations ext4_file_inode_operations = { 656const struct inode_operations ext4_file_inode_operations = {
327 .setattr = ext4_setattr, 657 .setattr = ext4_setattr,
328 .getattr = ext4_getattr, 658 .getattr = ext4_getattr,
329#ifdef CONFIG_EXT4_FS_XATTR
330 .setxattr = generic_setxattr, 659 .setxattr = generic_setxattr,
331 .getxattr = generic_getxattr, 660 .getxattr = generic_getxattr,
332 .listxattr = ext4_listxattr, 661 .listxattr = ext4_listxattr,
333 .removexattr = generic_removexattr, 662 .removexattr = generic_removexattr,
334#endif
335 .get_acl = ext4_get_acl, 663 .get_acl = ext4_get_acl,
336 .fiemap = ext4_fiemap, 664 .fiemap = ext4_fiemap,
337}; 665};
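The new SEEK_DATA/SEEK_HOLE support wired into ext4_llseek() above can be exercised from userspace with plain lseek(). A short demo that walks a file's data segments (standard glibc API; _GNU_SOURCE is required for the SEEK_DATA/SEEK_HOLE definitions):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        if (argc != 2)
            return 1;
        int fd = open(argv[1], O_RDONLY);
        if (fd < 0)
            return 1;

        off_t off = 0, data, hole;

        /* Walk the file, printing each data segment; lseek(SEEK_DATA)
         * fails with ENXIO past the last data, which ends the loop. */
        while ((data = lseek(fd, off, SEEK_DATA)) >= 0) {
            hole = lseek(fd, data, SEEK_HOLE);  /* EOF counts as a hole */
            if (hole < 0)
                break;
            printf("data: %lld..%lld\n",
                   (long long)data, (long long)hole - 1);
            off = hole;
        }
        close(fd);
        return 0;
    }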
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index be1d89f385b4..dfbc1fe96674 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,7 +44,6 @@
44 */ 44 */
45static int ext4_sync_parent(struct inode *inode) 45static int ext4_sync_parent(struct inode *inode)
46{ 46{
47 struct writeback_control wbc;
48 struct dentry *dentry = NULL; 47 struct dentry *dentry = NULL;
49 struct inode *next; 48 struct inode *next;
50 int ret = 0; 49 int ret = 0;
@@ -66,10 +65,7 @@ static int ext4_sync_parent(struct inode *inode)
66 ret = sync_mapping_buffers(inode->i_mapping); 65 ret = sync_mapping_buffers(inode->i_mapping);
67 if (ret) 66 if (ret)
68 break; 67 break;
69 memset(&wbc, 0, sizeof(wbc)); 68 ret = sync_inode_metadata(inode, 1);
70 wbc.sync_mode = WB_SYNC_ALL;
71 wbc.nr_to_write = 0; /* only write out the inode */
72 ret = sync_inode(inode, &wbc);
73 if (ret) 69 if (ret)
74 break; 70 break;
75 } 71 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3a100e7a62a8..3f32c8012447 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -762,7 +762,6 @@ got:
762 762
763 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); 763 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
764 err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); 764 err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
765 brelse(block_bitmap_bh);
766 765
767 /* recheck and clear flag under lock if we still need to */ 766 /* recheck and clear flag under lock if we still need to */
768 ext4_lock_group(sb, group); 767 ext4_lock_group(sb, group);
@@ -775,6 +774,7 @@ got:
775 ext4_group_desc_csum_set(sb, group, gdp); 774 ext4_group_desc_csum_set(sb, group, gdp);
776 } 775 }
777 ext4_unlock_group(sb, group); 776 ext4_unlock_group(sb, group);
777 brelse(block_bitmap_bh);
778 778
779 if (err) 779 if (err)
780 goto fail; 780 goto fail;
@@ -902,6 +902,10 @@ got:
902 902
903 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 903 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
904 904
905 ei->i_inline_off = 0;
906 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
907 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
908
905 ret = inode; 909 ret = inode;
906 dquot_initialize(inode); 910 dquot_initialize(inode);
907 err = dquot_alloc_inode(inode); 911 err = dquot_alloc_inode(inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 792e388e7b44..20862f96e8ae 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
22 22
23#include "ext4_jbd2.h" 23#include "ext4_jbd2.h"
24#include "truncate.h" 24#include "truncate.h"
25#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
25 26
26#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
27 28
@@ -755,8 +756,7 @@ cleanup:
755 partial--; 756 partial--;
756 } 757 }
757out: 758out:
758 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, 759 trace_ext4_ind_map_blocks_exit(inode, map, err);
759 map->m_pblk, map->m_len, err);
760 return err; 760 return err;
761} 761}
762 762
@@ -1412,6 +1412,7 @@ void ext4_ind_truncate(struct inode *inode)
1412 down_write(&ei->i_data_sem); 1412 down_write(&ei->i_data_sem);
1413 1413
1414 ext4_discard_preallocations(inode); 1414 ext4_discard_preallocations(inode);
1415 ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
1415 1416
1416 /* 1417 /*
1417 * The orphan list entry will now protect us from any crash which 1418 * The orphan list entry will now protect us from any crash which
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
new file mode 100644
index 000000000000..387c47c6cda9
--- /dev/null
+++ b/fs/ext4/inline.c
@@ -0,0 +1,1884 @@
1/*
2 * Copyright (c) 2012 Taobao.
3 * Written by Tao Ma <boyu.mt@taobao.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 */
14#include "ext4_jbd2.h"
15#include "ext4.h"
16#include "xattr.h"
17#include "truncate.h"
18#include <linux/fiemap.h>
19
20#define EXT4_XATTR_SYSTEM_DATA "data"
21#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
22#define EXT4_INLINE_DOTDOT_SIZE 4
23
24int ext4_get_inline_size(struct inode *inode)
25{
26 if (EXT4_I(inode)->i_inline_off)
27 return EXT4_I(inode)->i_inline_size;
28
29 return 0;
30}
31
32static int get_max_inline_xattr_value_size(struct inode *inode,
33 struct ext4_iloc *iloc)
34{
35 struct ext4_xattr_ibody_header *header;
36 struct ext4_xattr_entry *entry;
37 struct ext4_inode *raw_inode;
38 int free, min_offs;
39
40 min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
41 EXT4_GOOD_OLD_INODE_SIZE -
42 EXT4_I(inode)->i_extra_isize -
43 sizeof(struct ext4_xattr_ibody_header);
44
45 /*
46 * We need to subtract another sizeof(__u32) since an in-inode xattr
47 * needs an empty 4 bytes to indicate the gap between the xattr entry
48 * and the name/value pair.
49 */
50 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
51 return EXT4_XATTR_SIZE(min_offs -
52 EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
53 EXT4_XATTR_ROUND - sizeof(__u32));
54
55 raw_inode = ext4_raw_inode(iloc);
56 header = IHDR(inode, raw_inode);
57 entry = IFIRST(header);
58
59 /* Compute min_offs. */
60 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
61 if (!entry->e_value_block && entry->e_value_size) {
62 size_t offs = le16_to_cpu(entry->e_value_offs);
63 if (offs < min_offs)
64 min_offs = offs;
65 }
66 }
67 free = min_offs -
68 ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
69
70 if (EXT4_I(inode)->i_inline_off) {
71 entry = (struct ext4_xattr_entry *)
72 ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
73
74 free += le32_to_cpu(entry->e_value_size);
75 goto out;
76 }
77
78 free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
79
80 if (free > EXT4_XATTR_ROUND)
81 free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
82 else
83 free = 0;
84
85out:
86 return free;
87}
88
89/*
 90 * Get the maximum size we can now store in an inode.
 91 * If we can't find the space for an xattr entry, don't use the space
92 * of the extents since we have no space to indicate the inline data.
93 */
94int ext4_get_max_inline_size(struct inode *inode)
95{
96 int error, max_inline_size;
97 struct ext4_iloc iloc;
98
99 if (EXT4_I(inode)->i_extra_isize == 0)
100 return 0;
101
102 error = ext4_get_inode_loc(inode, &iloc);
103 if (error) {
104 ext4_error_inode(inode, __func__, __LINE__, 0,
105 "can't get inode location %lu",
106 inode->i_ino);
107 return 0;
108 }
109
110 down_read(&EXT4_I(inode)->xattr_sem);
111 max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
112 up_read(&EXT4_I(inode)->xattr_sem);
113
114 brelse(iloc.bh);
115
116 if (!max_inline_size)
117 return 0;
118
119 return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
120}
121
122int ext4_has_inline_data(struct inode *inode)
123{
124 return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
125 EXT4_I(inode)->i_inline_off;
126}
127
128/*
 129 * This function does not take xattr_sem, which is OK because it is
 130 * currently only used in a code path coming from ext4_iget, before
 131 * the new inode has been unlocked.
132 */
133int ext4_find_inline_data_nolock(struct inode *inode)
134{
135 struct ext4_xattr_ibody_find is = {
136 .s = { .not_found = -ENODATA, },
137 };
138 struct ext4_xattr_info i = {
139 .name_index = EXT4_XATTR_INDEX_SYSTEM,
140 .name = EXT4_XATTR_SYSTEM_DATA,
141 };
142 int error;
143
144 if (EXT4_I(inode)->i_extra_isize == 0)
145 return 0;
146
147 error = ext4_get_inode_loc(inode, &is.iloc);
148 if (error)
149 return error;
150
151 error = ext4_xattr_ibody_find(inode, &i, &is);
152 if (error)
153 goto out;
154
155 if (!is.s.not_found) {
156 EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
157 (void *)ext4_raw_inode(&is.iloc));
158 EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
159 le32_to_cpu(is.s.here->e_value_size);
160 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
161 }
162out:
163 brelse(is.iloc.bh);
164 return error;
165}
166
167static int ext4_read_inline_data(struct inode *inode, void *buffer,
168 unsigned int len,
169 struct ext4_iloc *iloc)
170{
171 struct ext4_xattr_entry *entry;
172 struct ext4_xattr_ibody_header *header;
173 int cp_len = 0;
174 struct ext4_inode *raw_inode;
175
176 if (!len)
177 return 0;
178
179 BUG_ON(len > EXT4_I(inode)->i_inline_size);
180
181 cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
182 len : EXT4_MIN_INLINE_DATA_SIZE;
183
184 raw_inode = ext4_raw_inode(iloc);
185 memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
186
187 len -= cp_len;
188 buffer += cp_len;
189
190 if (!len)
191 goto out;
192
193 header = IHDR(inode, raw_inode);
194 entry = (struct ext4_xattr_entry *)((void *)raw_inode +
195 EXT4_I(inode)->i_inline_off);
196 len = min_t(unsigned int, len,
197 (unsigned int)le32_to_cpu(entry->e_value_size));
198
199 memcpy(buffer,
200 (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
201 cp_len += len;
202
203out:
204 return cp_len;
205}
206
207/*
 208 * Write the buffer to the inline inode.
209 * If 'create' is set, we don't need to do the extra copy in the xattr
210 * value since it is already handled by ext4_xattr_ibody_inline_set.
211 * That saves us one memcpy.
212 */
213void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
214 void *buffer, loff_t pos, unsigned int len)
215{
216 struct ext4_xattr_entry *entry;
217 struct ext4_xattr_ibody_header *header;
218 struct ext4_inode *raw_inode;
219 int cp_len = 0;
220
221 BUG_ON(!EXT4_I(inode)->i_inline_off);
222 BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
223
224 raw_inode = ext4_raw_inode(iloc);
225 buffer += pos;
226
227 if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
228 cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
229 EXT4_MIN_INLINE_DATA_SIZE - pos : len;
230 memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
231
232 len -= cp_len;
233 buffer += cp_len;
234 pos += cp_len;
235 }
236
237 if (!len)
238 return;
239
240 pos -= EXT4_MIN_INLINE_DATA_SIZE;
241 header = IHDR(inode, raw_inode);
242 entry = (struct ext4_xattr_entry *)((void *)raw_inode +
243 EXT4_I(inode)->i_inline_off);
244
245 memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
246 buffer, len);
247}
248
249static int ext4_create_inline_data(handle_t *handle,
250 struct inode *inode, unsigned len)
251{
252 int error;
253 void *value = NULL;
254 struct ext4_xattr_ibody_find is = {
255 .s = { .not_found = -ENODATA, },
256 };
257 struct ext4_xattr_info i = {
258 .name_index = EXT4_XATTR_INDEX_SYSTEM,
259 .name = EXT4_XATTR_SYSTEM_DATA,
260 };
261
262 error = ext4_get_inode_loc(inode, &is.iloc);
263 if (error)
264 return error;
265
266 error = ext4_journal_get_write_access(handle, is.iloc.bh);
267 if (error)
268 goto out;
269
270 if (len > EXT4_MIN_INLINE_DATA_SIZE) {
271 value = EXT4_ZERO_XATTR_VALUE;
272 len -= EXT4_MIN_INLINE_DATA_SIZE;
273 } else {
274 value = "";
275 len = 0;
276 }
277
278	/* Insert the xattr entry. */
279 i.value = value;
280 i.value_len = len;
281
282 error = ext4_xattr_ibody_find(inode, &i, &is);
283 if (error)
284 goto out;
285
286 BUG_ON(!is.s.not_found);
287
288 error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
289 if (error) {
290 if (error == -ENOSPC)
291 ext4_clear_inode_state(inode,
292 EXT4_STATE_MAY_INLINE_DATA);
293 goto out;
294 }
295
296 memset((void *)ext4_raw_inode(&is.iloc)->i_block,
297 0, EXT4_MIN_INLINE_DATA_SIZE);
298
299 EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
300 (void *)ext4_raw_inode(&is.iloc));
301 EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
302 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
303 ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
304 get_bh(is.iloc.bh);
305 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
306
307out:
308 brelse(is.iloc.bh);
309 return error;
310}
311
312static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
313 unsigned int len)
314{
315 int error;
316 void *value = NULL;
317 struct ext4_xattr_ibody_find is = {
318 .s = { .not_found = -ENODATA, },
319 };
320 struct ext4_xattr_info i = {
321 .name_index = EXT4_XATTR_INDEX_SYSTEM,
322 .name = EXT4_XATTR_SYSTEM_DATA,
323 };
324
325 /* If the old space is ok, write the data directly. */
326 if (len <= EXT4_I(inode)->i_inline_size)
327 return 0;
328
329 error = ext4_get_inode_loc(inode, &is.iloc);
330 if (error)
331 return error;
332
333 error = ext4_xattr_ibody_find(inode, &i, &is);
334 if (error)
335 goto out;
336
337 BUG_ON(is.s.not_found);
338
339 len -= EXT4_MIN_INLINE_DATA_SIZE;
340 value = kzalloc(len, GFP_NOFS);
341 if (!value)
342 goto out;
343
344 error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
345 value, len);
346 if (error == -ENODATA)
347 goto out;
348
349 error = ext4_journal_get_write_access(handle, is.iloc.bh);
350 if (error)
351 goto out;
352
353	/* Update the xattr entry. */
354 i.value = value;
355 i.value_len = len;
356
357 error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
358 if (error)
359 goto out;
360
361 EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
362 (void *)ext4_raw_inode(&is.iloc));
363 EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
364 le32_to_cpu(is.s.here->e_value_size);
365 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
366 get_bh(is.iloc.bh);
367 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
368
369out:
370 kfree(value);
371 brelse(is.iloc.bh);
372 return error;
373}
374
375int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
376 unsigned int len)
377{
378 int ret, size;
379 struct ext4_inode_info *ei = EXT4_I(inode);
380
381 if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
382 return -ENOSPC;
383
384 size = ext4_get_max_inline_size(inode);
385 if (size < len)
386 return -ENOSPC;
387
388 down_write(&EXT4_I(inode)->xattr_sem);
389
390 if (ei->i_inline_off)
391 ret = ext4_update_inline_data(handle, inode, len);
392 else
393 ret = ext4_create_inline_data(handle, inode, len);
394
395 up_write(&EXT4_I(inode)->xattr_sem);
396
397 return ret;
398}
399
400static int ext4_destroy_inline_data_nolock(handle_t *handle,
401 struct inode *inode)
402{
403 struct ext4_inode_info *ei = EXT4_I(inode);
404 struct ext4_xattr_ibody_find is = {
405 .s = { .not_found = 0, },
406 };
407 struct ext4_xattr_info i = {
408 .name_index = EXT4_XATTR_INDEX_SYSTEM,
409 .name = EXT4_XATTR_SYSTEM_DATA,
410 .value = NULL,
411 .value_len = 0,
412 };
413 int error;
414
415 if (!ei->i_inline_off)
416 return 0;
417
418 error = ext4_get_inode_loc(inode, &is.iloc);
419 if (error)
420 return error;
421
422 error = ext4_xattr_ibody_find(inode, &i, &is);
423 if (error)
424 goto out;
425
426 error = ext4_journal_get_write_access(handle, is.iloc.bh);
427 if (error)
428 goto out;
429
430 error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
431 if (error)
432 goto out;
433
434 memset((void *)ext4_raw_inode(&is.iloc)->i_block,
435 0, EXT4_MIN_INLINE_DATA_SIZE);
436
437 if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
438 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
439 if (S_ISDIR(inode->i_mode) ||
440 S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
441 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
442 ext4_ext_tree_init(handle, inode);
443 }
444 }
445 ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
446
447 get_bh(is.iloc.bh);
448 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
449
450 EXT4_I(inode)->i_inline_off = 0;
451 EXT4_I(inode)->i_inline_size = 0;
452 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
453out:
454 brelse(is.iloc.bh);
455 if (error == -ENODATA)
456 error = 0;
457 return error;
458}
459
460static int ext4_read_inline_page(struct inode *inode, struct page *page)
461{
462 void *kaddr;
463 int ret = 0;
464 size_t len;
465 struct ext4_iloc iloc;
466
467 BUG_ON(!PageLocked(page));
468 BUG_ON(!ext4_has_inline_data(inode));
469 BUG_ON(page->index);
470
471 if (!EXT4_I(inode)->i_inline_off) {
472 ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
473 inode->i_ino);
474 goto out;
475 }
476
477 ret = ext4_get_inode_loc(inode, &iloc);
478 if (ret)
479 goto out;
480
481 len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
482 kaddr = kmap_atomic(page);
483 ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
484 flush_dcache_page(page);
485 kunmap_atomic(kaddr);
486 zero_user_segment(page, len, PAGE_CACHE_SIZE);
487 SetPageUptodate(page);
488 brelse(iloc.bh);
489
490out:
491 return ret;
492}
493
494int ext4_readpage_inline(struct inode *inode, struct page *page)
495{
496 int ret = 0;
497
498 down_read(&EXT4_I(inode)->xattr_sem);
499 if (!ext4_has_inline_data(inode)) {
500 up_read(&EXT4_I(inode)->xattr_sem);
501 return -EAGAIN;
502 }
503
504 /*
505 * Current inline data can only exist in the first page,
506 * so for all the other pages, just set them uptodate.
507 */
508 if (!page->index)
509 ret = ext4_read_inline_page(inode, page);
510 else if (!PageUptodate(page)) {
511 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
512 SetPageUptodate(page);
513 }
514
515 up_read(&EXT4_I(inode)->xattr_sem);
516
517 unlock_page(page);
518 return ret >= 0 ? 0 : ret;
519}
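
/*
 * [Editorial sketch -- not part of the kernel source.] Note how the
 * function above re-tests ext4_has_inline_data() after taking xattr_sem:
 * a racing writer may convert the inode to extents between the caller's
 * optimistic check and the lock. A userspace model of that
 * check-lock-recheck idiom, with a pthread rwlock standing in for the
 * kernel rw_semaphore; names are illustrative.
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t model_xattr_sem = PTHREAD_RWLOCK_INITIALIZER;
static bool model_has_inline;		/* models ext4_has_inline_data() */

static int read_inline_locked(void)
{
	pthread_rwlock_rdlock(&model_xattr_sem);
	if (!model_has_inline) {
		/* Converted under us: drop the lock and tell the caller
		 * to fall back to the normal (extent) path. */
		pthread_rwlock_unlock(&model_xattr_sem);
		return -1;
	}
	/* ... the inline area is stable while the read lock is held ... */
	pthread_rwlock_unlock(&model_xattr_sem);
	return 0;
}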
520
521static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
522 struct inode *inode,
523 unsigned flags)
524{
525 int ret, needed_blocks;
526 handle_t *handle = NULL;
527 int retries = 0, sem_held = 0;
528 struct page *page = NULL;
529 unsigned from, to;
530 struct ext4_iloc iloc;
531
532 if (!ext4_has_inline_data(inode)) {
533 /*
534 * clear the flag so that no new write
535 * will trap here again.
536 */
537 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
538 return 0;
539 }
540
541 needed_blocks = ext4_writepage_trans_blocks(inode);
542
543 ret = ext4_get_inode_loc(inode, &iloc);
544 if (ret)
545 return ret;
546
547retry:
548 handle = ext4_journal_start(inode, needed_blocks);
549 if (IS_ERR(handle)) {
550 ret = PTR_ERR(handle);
551 handle = NULL;
552 goto out;
553 }
554
555 /* We cannot recurse into the filesystem as the transaction is already
556 * started. */
557 flags |= AOP_FLAG_NOFS;
558
559 page = grab_cache_page_write_begin(mapping, 0, flags);
560 if (!page) {
561 ret = -ENOMEM;
562 goto out;
563 }
564
565 down_write(&EXT4_I(inode)->xattr_sem);
566 sem_held = 1;
567	/* If someone has already done this for us, just exit. */
568 if (!ext4_has_inline_data(inode)) {
569 ret = 0;
570 goto out;
571 }
572
573 from = 0;
574 to = ext4_get_inline_size(inode);
575 if (!PageUptodate(page)) {
576 ret = ext4_read_inline_page(inode, page);
577 if (ret < 0)
578 goto out;
579 }
580
581 ret = ext4_destroy_inline_data_nolock(handle, inode);
582 if (ret)
583 goto out;
584
585 if (ext4_should_dioread_nolock(inode))
586 ret = __block_write_begin(page, from, to, ext4_get_block_write);
587 else
588 ret = __block_write_begin(page, from, to, ext4_get_block);
589
590 if (!ret && ext4_should_journal_data(inode)) {
591 ret = ext4_walk_page_buffers(handle, page_buffers(page),
592 from, to, NULL,
593 do_journal_get_write_access);
594 }
595
596 if (ret) {
597 unlock_page(page);
598 page_cache_release(page);
599 ext4_orphan_add(handle, inode);
600 up_write(&EXT4_I(inode)->xattr_sem);
601 sem_held = 0;
602 ext4_journal_stop(handle);
603 handle = NULL;
604 ext4_truncate_failed_write(inode);
605 /*
606 * If truncate failed early the inode might
607 * still be on the orphan list; we need to
608 * make sure the inode is removed from the
609 * orphan list in that case.
610 */
611 if (inode->i_nlink)
612 ext4_orphan_del(NULL, inode);
613 }
614
615 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
616 goto retry;
617
618 block_commit_write(page, from, to);
619out:
620 if (page) {
621 unlock_page(page);
622 page_cache_release(page);
623 }
624 if (sem_held)
625 up_write(&EXT4_I(inode)->xattr_sem);
626 if (handle)
627 ext4_journal_stop(handle);
628 brelse(iloc.bh);
629 return ret;
630}
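
/*
 * [Editorial sketch -- not part of the kernel source.] The retry label
 * above is the usual ext4 ENOSPC idiom: tear the transaction down and
 * retry the whole attempt, bounded by ext4_should_retry_alloc(), which
 * waits for a journal commit that may free space. A stripped-down model;
 * the retry budget and callback signature are illustrative.
 */
#include <errno.h>

static int with_enospc_retry(int (*attempt)(void *), void *arg)
{
	int retries = 3;	/* illustrative bound */
	int ret;

	do {
		/* start handle, do the work, stop handle */
		ret = attempt(arg);
	} while (ret == -ENOSPC && retries-- > 0);
	return ret;
}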
631
632/*
633 * Try to write data in the inode.
634 * If the inode has inline data, check whether the new write can be
635 * in the inode also. If not, create the page and the handle, move the
636 * data to the page, make it uptodate, and let later code create an extent for it.
637 */
638int ext4_try_to_write_inline_data(struct address_space *mapping,
639 struct inode *inode,
640 loff_t pos, unsigned len,
641 unsigned flags,
642 struct page **pagep)
643{
644 int ret;
645 handle_t *handle;
646 struct page *page;
647 struct ext4_iloc iloc;
648
649 if (pos + len > ext4_get_max_inline_size(inode))
650 goto convert;
651
652 ret = ext4_get_inode_loc(inode, &iloc);
653 if (ret)
654 return ret;
655
656 /*
657	 * The write may fit inside the inode, so try to
658	 * reserve the in-inode space first.
659 */
660 handle = ext4_journal_start(inode, 1);
661 if (IS_ERR(handle)) {
662 ret = PTR_ERR(handle);
663 handle = NULL;
664 goto out;
665 }
666
667 ret = ext4_prepare_inline_data(handle, inode, pos + len);
668 if (ret && ret != -ENOSPC)
669 goto out;
670
671	/* We don't have space in the inline inode, so convert it to extents. */
672 if (ret == -ENOSPC) {
673 ext4_journal_stop(handle);
674 brelse(iloc.bh);
675 goto convert;
676 }
677
678 flags |= AOP_FLAG_NOFS;
679
680 page = grab_cache_page_write_begin(mapping, 0, flags);
681 if (!page) {
682 ret = -ENOMEM;
683 goto out;
684 }
685
686 *pagep = page;
687 down_read(&EXT4_I(inode)->xattr_sem);
688 if (!ext4_has_inline_data(inode)) {
689 ret = 0;
690 unlock_page(page);
691 page_cache_release(page);
692 goto out_up_read;
693 }
694
695 if (!PageUptodate(page)) {
696 ret = ext4_read_inline_page(inode, page);
697 if (ret < 0)
698 goto out_up_read;
699 }
700
701 ret = 1;
702 handle = NULL;
703out_up_read:
704 up_read(&EXT4_I(inode)->xattr_sem);
705out:
706 if (handle)
707 ext4_journal_stop(handle);
708 brelse(iloc.bh);
709 return ret;
710convert:
711 return ext4_convert_inline_data_to_extent(mapping,
712 inode, flags);
713}
714
715int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
716 unsigned copied, struct page *page)
717{
718 int ret;
719 void *kaddr;
720 struct ext4_iloc iloc;
721
722 if (unlikely(copied < len)) {
723 if (!PageUptodate(page)) {
724 copied = 0;
725 goto out;
726 }
727 }
728
729 ret = ext4_get_inode_loc(inode, &iloc);
730 if (ret) {
731 ext4_std_error(inode->i_sb, ret);
732 copied = 0;
733 goto out;
734 }
735
736 down_write(&EXT4_I(inode)->xattr_sem);
737 BUG_ON(!ext4_has_inline_data(inode));
738
739 kaddr = kmap_atomic(page);
740 ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
741 kunmap_atomic(kaddr);
742 SetPageUptodate(page);
743	/* Clear the page dirty bit so that writepages won't try to write it out. */
744 ClearPageDirty(page);
745
746 up_write(&EXT4_I(inode)->xattr_sem);
747 brelse(iloc.bh);
748out:
749 return copied;
750}
751
752struct buffer_head *
753ext4_journalled_write_inline_data(struct inode *inode,
754 unsigned len,
755 struct page *page)
756{
757 int ret;
758 void *kaddr;
759 struct ext4_iloc iloc;
760
761 ret = ext4_get_inode_loc(inode, &iloc);
762 if (ret) {
763 ext4_std_error(inode->i_sb, ret);
764 return NULL;
765 }
766
767 down_write(&EXT4_I(inode)->xattr_sem);
768 kaddr = kmap_atomic(page);
769 ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
770 kunmap_atomic(kaddr);
771 up_write(&EXT4_I(inode)->xattr_sem);
772
773 return iloc.bh;
774}
775
776/*
777 * Try to make the page cache and handle ready for the inline data case.
778 * We can call this function in 2 cases:
779 * 1. The inode is created and the first write exceeds inline size. We can
780 * clear the inode state safely.
781 * 2. The inode has inline data, then we need to read the data, make it
782 * uptodate and dirty so that ext4_da_writepages can handle it. We don't
783 * need to start the journal since the file's metadata isn't changed now.
784 */
785static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
786 struct inode *inode,
787 unsigned flags,
788 void **fsdata)
789{
790 int ret = 0, inline_size;
791 struct page *page;
792
793 page = grab_cache_page_write_begin(mapping, 0, flags);
794 if (!page)
795 return -ENOMEM;
796
797 down_read(&EXT4_I(inode)->xattr_sem);
798 if (!ext4_has_inline_data(inode)) {
799 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
800 goto out;
801 }
802
803 inline_size = ext4_get_inline_size(inode);
804
805 if (!PageUptodate(page)) {
806 ret = ext4_read_inline_page(inode, page);
807 if (ret < 0)
808 goto out;
809 }
810
811 ret = __block_write_begin(page, 0, inline_size,
812 ext4_da_get_block_prep);
813 if (ret) {
814 ext4_truncate_failed_write(inode);
815 goto out;
816 }
817
818 SetPageDirty(page);
819 SetPageUptodate(page);
820 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
821 *fsdata = (void *)CONVERT_INLINE_DATA;
822
823out:
824 up_read(&EXT4_I(inode)->xattr_sem);
825 if (page) {
826 unlock_page(page);
827 page_cache_release(page);
828 }
829 return ret;
830}
831
832/*
833 * Prepare the write for the inline data.
834 * If the data can be written into the inode, we just read
835 * the page, make it uptodate, and start the journal.
836 * Otherwise read the page, make it dirty so that it can be
837 * handled in writepages (the i_disksize update is left to the
838 * normal ext4_da_write_end).
839 */
840int ext4_da_write_inline_data_begin(struct address_space *mapping,
841 struct inode *inode,
842 loff_t pos, unsigned len,
843 unsigned flags,
844 struct page **pagep,
845 void **fsdata)
846{
847 int ret, inline_size;
848 handle_t *handle;
849 struct page *page;
850 struct ext4_iloc iloc;
851
852 ret = ext4_get_inode_loc(inode, &iloc);
853 if (ret)
854 return ret;
855
856 handle = ext4_journal_start(inode, 1);
857 if (IS_ERR(handle)) {
858 ret = PTR_ERR(handle);
859 handle = NULL;
860 goto out;
861 }
862
863 inline_size = ext4_get_max_inline_size(inode);
864
865 ret = -ENOSPC;
866 if (inline_size >= pos + len) {
867 ret = ext4_prepare_inline_data(handle, inode, pos + len);
868 if (ret && ret != -ENOSPC)
869 goto out;
870 }
871
872 if (ret == -ENOSPC) {
873 ret = ext4_da_convert_inline_data_to_extent(mapping,
874 inode,
875 flags,
876 fsdata);
877 goto out;
878 }
879
880 /*
881 * We cannot recurse into the filesystem as the transaction
882 * is already started.
883 */
884 flags |= AOP_FLAG_NOFS;
885
886 page = grab_cache_page_write_begin(mapping, 0, flags);
887 if (!page) {
888 ret = -ENOMEM;
889 goto out;
890 }
891
892 down_read(&EXT4_I(inode)->xattr_sem);
893 if (!ext4_has_inline_data(inode)) {
894 ret = 0;
895 goto out_release_page;
896 }
897
898 if (!PageUptodate(page)) {
899 ret = ext4_read_inline_page(inode, page);
900 if (ret < 0)
901 goto out_release_page;
902 }
903
904 up_read(&EXT4_I(inode)->xattr_sem);
905 *pagep = page;
906 handle = NULL;
907 brelse(iloc.bh);
908 return 1;
909out_release_page:
910 up_read(&EXT4_I(inode)->xattr_sem);
911 unlock_page(page);
912 page_cache_release(page);
913out:
914 if (handle)
915 ext4_journal_stop(handle);
916 brelse(iloc.bh);
917 return ret;
918}
919
920int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
921 unsigned len, unsigned copied,
922 struct page *page)
923{
924 int i_size_changed = 0;
925
926 copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
927
928 /*
929 * No need to use i_size_read() here, the i_size
930 * cannot change under us because we hold i_mutex.
931 *
932 * But it's important to update i_size while still holding page lock:
933 * page writeout could otherwise come in and zero beyond i_size.
934 */
935 if (pos+copied > inode->i_size) {
936 i_size_write(inode, pos+copied);
937 i_size_changed = 1;
938 }
939 unlock_page(page);
940 page_cache_release(page);
941
942 /*
943 * Don't mark the inode dirty under page lock. First, it unnecessarily
944 * makes the holding time of page lock longer. Second, it forces lock
945 * ordering of page lock and transaction start for journaling
946 * filesystems.
947 */
948 if (i_size_changed)
949 mark_inode_dirty(inode);
950
951 return copied;
952}
953
954#ifdef INLINE_DIR_DEBUG
955void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
956 void *inline_start, int inline_size)
957{
958 int offset;
959 unsigned short de_len;
960 struct ext4_dir_entry_2 *de = inline_start;
961 void *dlimit = inline_start + inline_size;
962
963 trace_printk("inode %lu\n", dir->i_ino);
964 offset = 0;
965 while ((void *)de < dlimit) {
966 de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
967 trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n",
968 offset, de_len, de->name_len, de->name,
969 de->name_len, le32_to_cpu(de->inode));
970 if (ext4_check_dir_entry(dir, NULL, de, bh,
971 inline_start, inline_size, offset))
972 BUG();
973
974 offset += de_len;
975 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
976 }
977}
978#else
979#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
980#endif
981
982/*
983 * Add a new entry into an inline dir.
984 * It will return -ENOSPC if no space is available, -EIO on
985 * corruption, and -EEXIST if the directory entry already exists.
986 */
987static int ext4_add_dirent_to_inline(handle_t *handle,
988 struct dentry *dentry,
989 struct inode *inode,
990 struct ext4_iloc *iloc,
991 void *inline_start, int inline_size)
992{
993 struct inode *dir = dentry->d_parent->d_inode;
994 const char *name = dentry->d_name.name;
995 int namelen = dentry->d_name.len;
996 unsigned short reclen;
997 int err;
998 struct ext4_dir_entry_2 *de;
999
1000 reclen = EXT4_DIR_REC_LEN(namelen);
1001 err = ext4_find_dest_de(dir, inode, iloc->bh,
1002 inline_start, inline_size,
1003 name, namelen, &de);
1004 if (err)
1005 return err;
1006
1007 err = ext4_journal_get_write_access(handle, iloc->bh);
1008 if (err)
1009 return err;
1010 ext4_insert_dentry(inode, de, inline_size, name, namelen);
1011
1012 ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
1013
1014 /*
1015 * XXX shouldn't update any times until successful
1016 * completion of syscall, but too many callers depend
1017 * on this.
1018 *
1019 * XXX similarly, too many callers depend on
1020 * ext4_new_inode() setting the times, but error
1021 * recovery deletes the inode, so the worst that can
1022 * happen is that the times are slightly out of date
1023 * and/or different from the directory change time.
1024 */
1025 dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
1026 ext4_update_dx_flag(dir);
1027 dir->i_version++;
1028 ext4_mark_inode_dirty(handle, dir);
1029 return 1;
1030}
1031
1032static void *ext4_get_inline_xattr_pos(struct inode *inode,
1033 struct ext4_iloc *iloc)
1034{
1035 struct ext4_xattr_entry *entry;
1036 struct ext4_xattr_ibody_header *header;
1037
1038 BUG_ON(!EXT4_I(inode)->i_inline_off);
1039
1040 header = IHDR(inode, ext4_raw_inode(iloc));
1041 entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
1042 EXT4_I(inode)->i_inline_off);
1043
1044 return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
1045}
1046
1047/* Set the final de to cover the whole block. */
1048static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
1049{
1050 struct ext4_dir_entry_2 *de, *prev_de;
1051 void *limit;
1052 int de_len;
1053
1054 de = (struct ext4_dir_entry_2 *)de_buf;
1055 if (old_size) {
1056 limit = de_buf + old_size;
1057 do {
1058 prev_de = de;
1059 de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
1060 de_buf += de_len;
1061 de = (struct ext4_dir_entry_2 *)de_buf;
1062 } while (de_buf < limit);
1063
1064 prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
1065 old_size, new_size);
1066 } else {
1067		/* this area was just created, so create an empty entry covering it. */
1068 de->inode = 0;
1069 de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
1070 }
1071}
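
/*
 * [Editorial sketch -- not part of the kernel source.] The helper above
 * relies on the classic ext2/3/4 directory invariant: records chain via
 * rec_len, and the last record's rec_len always reaches the end of the
 * area, so growing the area only means stretching the final record. A
 * simplified model of the old_size > 0 branch; it omits name storage and
 * the ext4_rec_len_to_disk() encoding used for large blocks.
 */
#include <stdint.h>

struct dirent2_sketch {		/* stand-in for struct ext4_dir_entry_2 */
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	/* name bytes follow in the real layout */
};

static void grow_final_de(void *buf, int old_size, int new_size)
{
	char *p = buf, *limit = (char *)buf + old_size;
	struct dirent2_sketch *de = buf, *prev = de;

	if (old_size == 0)	/* the kernel creates an empty entry here */
		return;

	while (p < limit) {	/* assumes well-formed, nonzero rec_lens */
		prev = de;
		p += de->rec_len;
		de = (struct dirent2_sketch *)p;
	}
	prev->rec_len += new_size - old_size;	/* absorb the new space */
}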
1072
1073static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
1074 struct ext4_iloc *iloc)
1075{
1076 int ret;
1077 int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
1078 int new_size = get_max_inline_xattr_value_size(dir, iloc);
1079
1080 if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
1081 return -ENOSPC;
1082
1083 ret = ext4_update_inline_data(handle, dir,
1084 new_size + EXT4_MIN_INLINE_DATA_SIZE);
1085 if (ret)
1086 return ret;
1087
1088 ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
1089 EXT4_I(dir)->i_inline_size -
1090 EXT4_MIN_INLINE_DATA_SIZE);
1091 dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
1092 return 0;
1093}
1094
1095static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
1096 struct ext4_iloc *iloc,
1097 void *buf, int inline_size)
1098{
1099 ext4_create_inline_data(handle, inode, inline_size);
1100 ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
1101 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
1102}
1103
1104static int ext4_finish_convert_inline_dir(handle_t *handle,
1105 struct inode *inode,
1106 struct buffer_head *dir_block,
1107 void *buf,
1108 int inline_size)
1109{
1110 int err, csum_size = 0, header_size = 0;
1111 struct ext4_dir_entry_2 *de;
1112 struct ext4_dir_entry_tail *t;
1113 void *target = dir_block->b_data;
1114
1115 /*
1116 * First create "." and ".." and then copy the dir information
1117 * back to the block.
1118 */
1119 de = (struct ext4_dir_entry_2 *)target;
1120 de = ext4_init_dot_dotdot(inode, de,
1121 inode->i_sb->s_blocksize, csum_size,
1122 le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
1123 header_size = (void *)de - target;
1124
1125 memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
1126 inline_size - EXT4_INLINE_DOTDOT_SIZE);
1127
1128 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1129 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1130 csum_size = sizeof(struct ext4_dir_entry_tail);
1131
1132 inode->i_size = inode->i_sb->s_blocksize;
1133 i_size_write(inode, inode->i_sb->s_blocksize);
1134 EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1135 ext4_update_final_de(dir_block->b_data,
1136 inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
1137 inode->i_sb->s_blocksize - csum_size);
1138
1139 if (csum_size) {
1140 t = EXT4_DIRENT_TAIL(dir_block->b_data,
1141 inode->i_sb->s_blocksize);
1142 initialize_dirent_tail(t, inode->i_sb->s_blocksize);
1143 }
1144 set_buffer_uptodate(dir_block);
1145 err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
1146 if (err)
1147 goto out;
1148 set_buffer_verified(dir_block);
1149out:
1150 return err;
1151}
1152
1153static int ext4_convert_inline_data_nolock(handle_t *handle,
1154 struct inode *inode,
1155 struct ext4_iloc *iloc)
1156{
1157 int error;
1158 void *buf = NULL;
1159 struct buffer_head *data_bh = NULL;
1160 struct ext4_map_blocks map;
1161 int inline_size;
1162
1163 inline_size = ext4_get_inline_size(inode);
1164 buf = kmalloc(inline_size, GFP_NOFS);
1165 if (!buf) {
1166 error = -ENOMEM;
1167 goto out;
1168 }
1169
1170 error = ext4_read_inline_data(inode, buf, inline_size, iloc);
1171 if (error < 0)
1172 goto out;
1173
1174 error = ext4_destroy_inline_data_nolock(handle, inode);
1175 if (error)
1176 goto out;
1177
1178 map.m_lblk = 0;
1179 map.m_len = 1;
1180 map.m_flags = 0;
1181 error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
1182 if (error < 0)
1183 goto out_restore;
1184 if (!(map.m_flags & EXT4_MAP_MAPPED)) {
1185 error = -EIO;
1186 goto out_restore;
1187 }
1188
1189 data_bh = sb_getblk(inode->i_sb, map.m_pblk);
1190 if (!data_bh) {
1191 error = -EIO;
1192 goto out_restore;
1193 }
1194
1195 lock_buffer(data_bh);
1196 error = ext4_journal_get_create_access(handle, data_bh);
1197 if (error) {
1198 unlock_buffer(data_bh);
1199 error = -EIO;
1200 goto out_restore;
1201 }
1202 memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
1203
1204 if (!S_ISDIR(inode->i_mode)) {
1205 memcpy(data_bh->b_data, buf, inline_size);
1206 set_buffer_uptodate(data_bh);
1207 error = ext4_handle_dirty_metadata(handle,
1208 inode, data_bh);
1209 } else {
1210 error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
1211 buf, inline_size);
1212 }
1213
1214 unlock_buffer(data_bh);
1215out_restore:
1216 if (error)
1217 ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
1218
1219out:
1220 brelse(data_bh);
1221 kfree(buf);
1222 return error;
1223}
1224
1225/*
1226 * Try to add the new entry to the inline data.
1227 * If it succeeds, return 0. If not, extend the inline dir and copy the
1228 * data to a newly created block.
1229 */
1230int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
1231 struct inode *inode)
1232{
1233 int ret, inline_size;
1234 void *inline_start;
1235 struct ext4_iloc iloc;
1236 struct inode *dir = dentry->d_parent->d_inode;
1237
1238 ret = ext4_get_inode_loc(dir, &iloc);
1239 if (ret)
1240 return ret;
1241
1242 down_write(&EXT4_I(dir)->xattr_sem);
1243 if (!ext4_has_inline_data(dir))
1244 goto out;
1245
1246 inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
1247 EXT4_INLINE_DOTDOT_SIZE;
1248 inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
1249
1250 ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
1251 inline_start, inline_size);
1252 if (ret != -ENOSPC)
1253 goto out;
1254
1255	/* check whether it can be inserted into the inline xattr space. */
1256 inline_size = EXT4_I(dir)->i_inline_size -
1257 EXT4_MIN_INLINE_DATA_SIZE;
1258 if (!inline_size) {
1259		/* Try to use the xattr space. */
1260 ret = ext4_update_inline_dir(handle, dir, &iloc);
1261 if (ret && ret != -ENOSPC)
1262 goto out;
1263
1264 inline_size = EXT4_I(dir)->i_inline_size -
1265 EXT4_MIN_INLINE_DATA_SIZE;
1266 }
1267
1268 if (inline_size) {
1269 inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
1270
1271 ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
1272 inline_start, inline_size);
1273
1274 if (ret != -ENOSPC)
1275 goto out;
1276 }
1277
1278 /*
1279 * The inline space is filled up, so create a new block for it.
1280 * As the extent tree will be created, we have to save the inline
1281 * dir first.
1282 */
1283 ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
1284
1285out:
1286 ext4_mark_inode_dirty(handle, dir);
1287 up_write(&EXT4_I(dir)->xattr_sem);
1288 brelse(iloc.bh);
1289 return ret;
1290}
1291
1292int ext4_read_inline_dir(struct file *filp,
1293 void *dirent, filldir_t filldir,
1294 int *has_inline_data)
1295{
1296 int error = 0;
1297 unsigned int offset, parent_ino;
1298 int i, stored;
1299 struct ext4_dir_entry_2 *de;
1300 struct super_block *sb;
1301 struct inode *inode = filp->f_path.dentry->d_inode;
1302 int ret, inline_size = 0;
1303 struct ext4_iloc iloc;
1304 void *dir_buf = NULL;
1305
1306 ret = ext4_get_inode_loc(inode, &iloc);
1307 if (ret)
1308 return ret;
1309
1310 down_read(&EXT4_I(inode)->xattr_sem);
1311 if (!ext4_has_inline_data(inode)) {
1312 up_read(&EXT4_I(inode)->xattr_sem);
1313 *has_inline_data = 0;
1314 goto out;
1315 }
1316
1317 inline_size = ext4_get_inline_size(inode);
1318 dir_buf = kmalloc(inline_size, GFP_NOFS);
1319 if (!dir_buf) {
1320 ret = -ENOMEM;
1321 up_read(&EXT4_I(inode)->xattr_sem);
1322 goto out;
1323 }
1324
1325 ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
1326 up_read(&EXT4_I(inode)->xattr_sem);
1327 if (ret < 0)
1328 goto out;
1329
1330 sb = inode->i_sb;
1331 stored = 0;
1332 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
1333
1334 while (!error && !stored && filp->f_pos < inode->i_size) {
1335revalidate:
1336 /*
1337 * If the version has changed since the last call to
1338 * readdir(2), then we might be pointing to an invalid
1339 * dirent right now. Scan from the start of the inline
1340 * dir to make sure.
1341 */
1342 if (filp->f_version != inode->i_version) {
1343 for (i = 0;
1344 i < inode->i_size && i < offset;) {
1345 if (!i) {
1346 /* skip "." and ".." if needed. */
1347 i += EXT4_INLINE_DOTDOT_SIZE;
1348 continue;
1349 }
1350 de = (struct ext4_dir_entry_2 *)
1351 (dir_buf + i);
1352 /* It's too expensive to do a full
1353 * dirent test each time round this
1354 * loop, but we do have to test at
1355 * least that it is non-zero. A
1356 * failure will be detected in the
1357 * dirent test below. */
1358 if (ext4_rec_len_from_disk(de->rec_len,
1359 inline_size) < EXT4_DIR_REC_LEN(1))
1360 break;
1361 i += ext4_rec_len_from_disk(de->rec_len,
1362 inline_size);
1363 }
1364 offset = i;
1365 filp->f_pos = offset;
1366 filp->f_version = inode->i_version;
1367 }
1368
1369 while (!error && filp->f_pos < inode->i_size) {
1370 if (filp->f_pos == 0) {
1371 error = filldir(dirent, ".", 1, 0, inode->i_ino,
1372 DT_DIR);
1373 if (error)
1374 break;
1375 stored++;
1376
1377 error = filldir(dirent, "..", 2, 0, parent_ino,
1378 DT_DIR);
1379 if (error)
1380 break;
1381 stored++;
1382
1383 filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
1384 continue;
1385 }
1386
1387 de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
1388 if (ext4_check_dir_entry(inode, filp, de,
1389 iloc.bh, dir_buf,
1390 inline_size, offset)) {
1391 ret = stored;
1392 goto out;
1393 }
1394 offset += ext4_rec_len_from_disk(de->rec_len,
1395 inline_size);
1396 if (le32_to_cpu(de->inode)) {
1397 /* We might block in the next section
1398 * if the data destination is
1399 * currently swapped out. So, use a
1400 * version stamp to detect whether or
1401 * not the directory has been modified
1402 * during the copy operation.
1403 */
1404 u64 version = filp->f_version;
1405
1406 error = filldir(dirent, de->name,
1407 de->name_len,
1408 filp->f_pos,
1409 le32_to_cpu(de->inode),
1410 get_dtype(sb, de->file_type));
1411 if (error)
1412 break;
1413 if (version != filp->f_version)
1414 goto revalidate;
1415 stored++;
1416 }
1417 filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
1418 inline_size);
1419 }
1420 offset = 0;
1421 }
1422out:
1423 kfree(dir_buf);
1424 brelse(iloc.bh);
1425 return ret;
1426}
1427
1428struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
1429 struct ext4_dir_entry_2 **parent_de,
1430 int *retval)
1431{
1432 struct ext4_iloc iloc;
1433
1434 *retval = ext4_get_inode_loc(inode, &iloc);
1435 if (*retval)
1436 return NULL;
1437
1438 *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
1439
1440 return iloc.bh;
1441}
1442
1443/*
1444 * Try to create the inline data for the new dir.
1445 * If it succeeds, return 0, otherwise return the error.
1446 * In case of ENOSPC, the caller should create the normal disk layout dir.
1447 */
1448int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
1449 struct inode *inode)
1450{
1451 int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE;
1452 struct ext4_iloc iloc;
1453 struct ext4_dir_entry_2 *de;
1454
1455 ret = ext4_get_inode_loc(inode, &iloc);
1456 if (ret)
1457 return ret;
1458
1459 ret = ext4_prepare_inline_data(handle, inode, inline_size);
1460 if (ret)
1461 goto out;
1462
1463 /*
1464 * For inline dir, we only save the inode information for the ".."
1465 * and create a fake dentry to cover the remaining space.
1466 */
1467 de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
1468 de->inode = cpu_to_le32(parent->i_ino);
1469 de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE);
1470 de->inode = 0;
1471 de->rec_len = ext4_rec_len_to_disk(
1472 inline_size - EXT4_INLINE_DOTDOT_SIZE,
1473 inline_size);
1474 set_nlink(inode, 2);
1475 inode->i_size = EXT4_I(inode)->i_disksize = inline_size;
1476out:
1477 brelse(iloc.bh);
1478 return ret;
1479}
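
/*
 * [Editorial sketch -- not part of the kernel source.] The layout built
 * above is 4 bytes of ".." inode number (EXT4_INLINE_DOTDOT_SIZE; "."
 * and ".." need no full dentries in the inline format) followed by one
 * empty dentry covering the remaining 56 bytes. A userspace model; the
 * on-disk fields are little-endian, which this sketch glosses over.
 */
#include <stdint.h>
#include <string.h>

#define MIN_INLINE  60	/* models EXT4_MIN_INLINE_DATA_SIZE */
#define DOTDOT_SIZE 4	/* models EXT4_INLINE_DOTDOT_SIZE */

struct dirent2_model {
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
};

static void init_inline_dir(uint8_t *area, uint32_t parent_ino)
{
	struct dirent2_model de;

	memset(area, 0, MIN_INLINE);
	memcpy(area, &parent_ino, sizeof(parent_ino));	/* the ".." inode */

	memset(&de, 0, sizeof(de));
	de.inode = 0;					/* fake, unused entry */
	de.rec_len = MIN_INLINE - DOTDOT_SIZE;		/* covers the rest */
	memcpy(area + DOTDOT_SIZE, &de, sizeof(de));
}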
1480
1481struct buffer_head *ext4_find_inline_entry(struct inode *dir,
1482 const struct qstr *d_name,
1483 struct ext4_dir_entry_2 **res_dir,
1484 int *has_inline_data)
1485{
1486 int ret;
1487 struct ext4_iloc iloc;
1488 void *inline_start;
1489 int inline_size;
1490
1491 if (ext4_get_inode_loc(dir, &iloc))
1492 return NULL;
1493
1494 down_read(&EXT4_I(dir)->xattr_sem);
1495 if (!ext4_has_inline_data(dir)) {
1496 *has_inline_data = 0;
1497 goto out;
1498 }
1499
1500 inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
1501 EXT4_INLINE_DOTDOT_SIZE;
1502 inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
1503 ret = search_dir(iloc.bh, inline_start, inline_size,
1504 dir, d_name, 0, res_dir);
1505 if (ret == 1)
1506 goto out_find;
1507 if (ret < 0)
1508 goto out;
1509
1510 if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
1511 goto out;
1512
1513 inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
1514 inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
1515
1516 ret = search_dir(iloc.bh, inline_start, inline_size,
1517 dir, d_name, 0, res_dir);
1518 if (ret == 1)
1519 goto out_find;
1520
1521out:
1522 brelse(iloc.bh);
1523 iloc.bh = NULL;
1524out_find:
1525 up_read(&EXT4_I(dir)->xattr_sem);
1526 return iloc.bh;
1527}
1528
1529int ext4_delete_inline_entry(handle_t *handle,
1530 struct inode *dir,
1531 struct ext4_dir_entry_2 *de_del,
1532 struct buffer_head *bh,
1533 int *has_inline_data)
1534{
1535 int err, inline_size;
1536 struct ext4_iloc iloc;
1537 void *inline_start;
1538
1539 err = ext4_get_inode_loc(dir, &iloc);
1540 if (err)
1541 return err;
1542
1543 down_write(&EXT4_I(dir)->xattr_sem);
1544 if (!ext4_has_inline_data(dir)) {
1545 *has_inline_data = 0;
1546 goto out;
1547 }
1548
1549 if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
1550 EXT4_MIN_INLINE_DATA_SIZE) {
1551 inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
1552 EXT4_INLINE_DOTDOT_SIZE;
1553 inline_size = EXT4_MIN_INLINE_DATA_SIZE -
1554 EXT4_INLINE_DOTDOT_SIZE;
1555 } else {
1556 inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
1557 inline_size = ext4_get_inline_size(dir) -
1558 EXT4_MIN_INLINE_DATA_SIZE;
1559 }
1560
1561 err = ext4_journal_get_write_access(handle, bh);
1562 if (err)
1563 goto out;
1564
1565 err = ext4_generic_delete_entry(handle, dir, de_del, bh,
1566 inline_start, inline_size, 0);
1567 if (err)
1568 goto out;
1569
1570 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1571 err = ext4_mark_inode_dirty(handle, dir);
1572 if (unlikely(err))
1573 goto out;
1574
1575 ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
1576out:
1577 up_write(&EXT4_I(dir)->xattr_sem);
1578 brelse(iloc.bh);
1579 if (err != -ENOENT)
1580 ext4_std_error(dir->i_sb, err);
1581 return err;
1582}
1583
1584/*
1585 * Get the inline dentry at offset.
1586 */
1587static inline struct ext4_dir_entry_2 *
1588ext4_get_inline_entry(struct inode *inode,
1589 struct ext4_iloc *iloc,
1590 unsigned int offset,
1591 void **inline_start,
1592 int *inline_size)
1593{
1594 void *inline_pos;
1595
1596 BUG_ON(offset > ext4_get_inline_size(inode));
1597
1598 if (offset < EXT4_MIN_INLINE_DATA_SIZE) {
1599 inline_pos = (void *)ext4_raw_inode(iloc)->i_block;
1600 *inline_size = EXT4_MIN_INLINE_DATA_SIZE;
1601 } else {
1602 inline_pos = ext4_get_inline_xattr_pos(inode, iloc);
1603 offset -= EXT4_MIN_INLINE_DATA_SIZE;
1604 *inline_size = ext4_get_inline_size(inode) -
1605 EXT4_MIN_INLINE_DATA_SIZE;
1606 }
1607
1608 if (inline_start)
1609 *inline_start = inline_pos;
1610 return (struct ext4_dir_entry_2 *)(inline_pos + offset);
1611}
1612
1613int empty_inline_dir(struct inode *dir, int *has_inline_data)
1614{
1615 int err, inline_size;
1616 struct ext4_iloc iloc;
1617 void *inline_pos;
1618 unsigned int offset;
1619 struct ext4_dir_entry_2 *de;
1620 int ret = 1;
1621
1622 err = ext4_get_inode_loc(dir, &iloc);
1623 if (err) {
1624 EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
1625 err, dir->i_ino);
1626 return 1;
1627 }
1628
1629 down_read(&EXT4_I(dir)->xattr_sem);
1630 if (!ext4_has_inline_data(dir)) {
1631 *has_inline_data = 0;
1632 goto out;
1633 }
1634
1635 de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
1636 if (!le32_to_cpu(de->inode)) {
1637 ext4_warning(dir->i_sb,
1638 "bad inline directory (dir #%lu) - no `..'",
1639 dir->i_ino);
1640 ret = 1;
1641 goto out;
1642 }
1643
1644 offset = EXT4_INLINE_DOTDOT_SIZE;
1645 while (offset < dir->i_size) {
1646 de = ext4_get_inline_entry(dir, &iloc, offset,
1647 &inline_pos, &inline_size);
1648 if (ext4_check_dir_entry(dir, NULL, de,
1649 iloc.bh, inline_pos,
1650 inline_size, offset)) {
1651 ext4_warning(dir->i_sb,
1652 "bad inline directory (dir #%lu) - "
1653				     "inode %u, rec_len %u, name_len %d, "
1654				     "inline size %d\n",
1655 dir->i_ino, le32_to_cpu(de->inode),
1656 le16_to_cpu(de->rec_len), de->name_len,
1657 inline_size);
1658 ret = 1;
1659 goto out;
1660 }
1661 if (le32_to_cpu(de->inode)) {
1662 ret = 0;
1663 goto out;
1664 }
1665 offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
1666 }
1667
1668out:
1669 up_read(&EXT4_I(dir)->xattr_sem);
1670 brelse(iloc.bh);
1671 return ret;
1672}
1673
1674int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
1675{
1676 int ret;
1677
1678 down_write(&EXT4_I(inode)->xattr_sem);
1679 ret = ext4_destroy_inline_data_nolock(handle, inode);
1680 up_write(&EXT4_I(inode)->xattr_sem);
1681
1682 return ret;
1683}
1684
1685int ext4_inline_data_fiemap(struct inode *inode,
1686 struct fiemap_extent_info *fieinfo,
1687 int *has_inline)
1688{
1689 __u64 physical = 0;
1690 __u64 length;
1691 __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST;
1692 int error = 0;
1693 struct ext4_iloc iloc;
1694
1695 down_read(&EXT4_I(inode)->xattr_sem);
1696 if (!ext4_has_inline_data(inode)) {
1697 *has_inline = 0;
1698 goto out;
1699 }
1700
1701 error = ext4_get_inode_loc(inode, &iloc);
1702 if (error)
1703 goto out;
1704
1705 physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
1706 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
1707 physical += offsetof(struct ext4_inode, i_block);
1708 length = i_size_read(inode);
1709
1710 if (physical)
1711 error = fiemap_fill_next_extent(fieinfo, 0, physical,
1712 length, flags);
1713 brelse(iloc.bh);
1714out:
1715 up_read(&EXT4_I(inode)->xattr_sem);
1716 return (error < 0 ? error : 0);
1717}
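
/*
 * [Editorial sketch -- not part of the kernel source.] The physical
 * offset reported above is plain byte arithmetic: the inode table
 * block's byte address, plus the raw inode's offset within that block,
 * plus offsetof(struct ext4_inode, i_block). A worked example with
 * illustrative numbers (not taken from a real filesystem).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t itable_block = 1234;	/* block holding the inode table */
	unsigned blkbits = 12;		/* 4 KiB block size */
	unsigned raw_off = 512;		/* raw inode offset within the block */
	unsigned iblock_off = 40;	/* offset of i_block in the inode */

	uint64_t physical = (itable_block << blkbits) + raw_off + iblock_off;

	/* prints 5055016: the device byte where the inline data starts */
	printf("inline data at device byte %llu\n",
	       (unsigned long long)physical);
	return 0;
}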
1718
1719/*
1720 * Called during xattr set, and if we can spare space 'needed',
1721 * just create the extent tree and evict the data to an external block.
1722 *
1723 * We use jbd2 instead of page cache to move data to the 1st block
1724 * so that the whole transaction can be committed as a whole and
1725 * the data isn't lost because of the delayed page cache write.
1726 */
1727int ext4_try_to_evict_inline_data(handle_t *handle,
1728 struct inode *inode,
1729 int needed)
1730{
1731 int error;
1732 struct ext4_xattr_entry *entry;
1733 struct ext4_xattr_ibody_header *header;
1734 struct ext4_inode *raw_inode;
1735 struct ext4_iloc iloc;
1736
1737 error = ext4_get_inode_loc(inode, &iloc);
1738 if (error)
1739 return error;
1740
1741 raw_inode = ext4_raw_inode(&iloc);
1742 header = IHDR(inode, raw_inode);
1743 entry = (struct ext4_xattr_entry *)((void *)raw_inode +
1744 EXT4_I(inode)->i_inline_off);
1745 if (EXT4_XATTR_LEN(entry->e_name_len) +
1746 EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) {
1747 error = -ENOSPC;
1748 goto out;
1749 }
1750
1751 error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
1752out:
1753 brelse(iloc.bh);
1754 return error;
1755}
1756
1757void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
1758{
1759 handle_t *handle;
1760 int inline_size, value_len, needed_blocks;
1761 size_t i_size;
1762 void *value = NULL;
1763 struct ext4_xattr_ibody_find is = {
1764 .s = { .not_found = -ENODATA, },
1765 };
1766 struct ext4_xattr_info i = {
1767 .name_index = EXT4_XATTR_INDEX_SYSTEM,
1768 .name = EXT4_XATTR_SYSTEM_DATA,
1769 };
1770
1771
1772 needed_blocks = ext4_writepage_trans_blocks(inode);
1773 handle = ext4_journal_start(inode, needed_blocks);
1774 if (IS_ERR(handle))
1775 return;
1776
1777 down_write(&EXT4_I(inode)->xattr_sem);
1778 if (!ext4_has_inline_data(inode)) {
1779 *has_inline = 0;
1780 ext4_journal_stop(handle);
1781 return;
1782 }
1783
1784 if (ext4_orphan_add(handle, inode))
1785 goto out;
1786
1787 if (ext4_get_inode_loc(inode, &is.iloc))
1788 goto out;
1789
1790 down_write(&EXT4_I(inode)->i_data_sem);
1791 i_size = inode->i_size;
1792 inline_size = ext4_get_inline_size(inode);
1793 EXT4_I(inode)->i_disksize = i_size;
1794
1795 if (i_size < inline_size) {
1796 /* Clear the content in the xattr space. */
1797 if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
1798 if (ext4_xattr_ibody_find(inode, &i, &is))
1799 goto out_error;
1800
1801 BUG_ON(is.s.not_found);
1802
1803 value_len = le32_to_cpu(is.s.here->e_value_size);
1804 value = kmalloc(value_len, GFP_NOFS);
1805 if (!value)
1806 goto out_error;
1807
1808 if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
1809 value, value_len))
1810 goto out_error;
1811
1812 i.value = value;
1813 i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
1814 i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
1815 if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
1816 goto out_error;
1817 }
1818
1819 /* Clear the content within i_blocks. */
1820 if (i_size < EXT4_MIN_INLINE_DATA_SIZE)
1821 memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0,
1822 EXT4_MIN_INLINE_DATA_SIZE - i_size);
1823
1824 EXT4_I(inode)->i_inline_size = i_size <
1825 EXT4_MIN_INLINE_DATA_SIZE ?
1826 EXT4_MIN_INLINE_DATA_SIZE : i_size;
1827 }
1828
1829out_error:
1830 up_write(&EXT4_I(inode)->i_data_sem);
1831out:
1832 brelse(is.iloc.bh);
1833 up_write(&EXT4_I(inode)->xattr_sem);
1834 kfree(value);
1835 if (inode->i_nlink)
1836 ext4_orphan_del(handle, inode);
1837
1838 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1839 ext4_mark_inode_dirty(handle, inode);
1840 if (IS_SYNC(inode))
1841 ext4_handle_sync(handle);
1842
1843 ext4_journal_stop(handle);
1844 return;
1845}
1846
1847int ext4_convert_inline_data(struct inode *inode)
1848{
1849 int error, needed_blocks;
1850 handle_t *handle;
1851 struct ext4_iloc iloc;
1852
1853 if (!ext4_has_inline_data(inode)) {
1854 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
1855 return 0;
1856 }
1857
1858 needed_blocks = ext4_writepage_trans_blocks(inode);
1859
1860 iloc.bh = NULL;
1861 error = ext4_get_inode_loc(inode, &iloc);
1862 if (error)
1863 return error;
1864
1865 handle = ext4_journal_start(inode, needed_blocks);
1866 if (IS_ERR(handle)) {
1867 error = PTR_ERR(handle);
1868 goto out_free;
1869 }
1870
1871 down_write(&EXT4_I(inode)->xattr_sem);
1872 if (!ext4_has_inline_data(inode)) {
1873 up_write(&EXT4_I(inode)->xattr_sem);
1874 goto out;
1875 }
1876
1877 error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
1878 up_write(&EXT4_I(inode)->xattr_sem);
1879out:
1880 ext4_journal_stop(handle);
1881out_free:
1882 brelse(iloc.bh);
1883 return error;
1884}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3c243b9afa5..cb1c1ab2720b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -484,49 +484,6 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
484} 484}
485 485
486/* 486/*
487 * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
488 */
489static void set_buffers_da_mapped(struct inode *inode,
490 struct ext4_map_blocks *map)
491{
492 struct address_space *mapping = inode->i_mapping;
493 struct pagevec pvec;
494 int i, nr_pages;
495 pgoff_t index, end;
496
497 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
498 end = (map->m_lblk + map->m_len - 1) >>
499 (PAGE_CACHE_SHIFT - inode->i_blkbits);
500
501 pagevec_init(&pvec, 0);
502 while (index <= end) {
503 nr_pages = pagevec_lookup(&pvec, mapping, index,
504 min(end - index + 1,
505 (pgoff_t)PAGEVEC_SIZE));
506 if (nr_pages == 0)
507 break;
508 for (i = 0; i < nr_pages; i++) {
509 struct page *page = pvec.pages[i];
510 struct buffer_head *bh, *head;
511
512 if (unlikely(page->mapping != mapping) ||
513 !PageDirty(page))
514 break;
515
516 if (page_has_buffers(page)) {
517 bh = head = page_buffers(page);
518 do {
519 set_buffer_da_mapped(bh);
520 bh = bh->b_this_page;
521 } while (bh != head);
522 }
523 index++;
524 }
525 pagevec_release(&pvec);
526 }
527}
528
529/*
530 * The ext4_map_blocks() function tries to look up the requested blocks, 487 * The ext4_map_blocks() function tries to look up the requested blocks,
531 * and returns if the blocks are already mapped. 488 * and returns if the blocks are already mapped.
532 * 489 *
@@ -574,7 +531,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
574 up_read((&EXT4_I(inode)->i_data_sem)); 531 up_read((&EXT4_I(inode)->i_data_sem));
575 532
576 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 533 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
577 int ret = check_block_validity(inode, map); 534 int ret;
535 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
536 /* delayed alloc may be allocated by fallocate and
537 * converted to initialized by direct I/O.
538 * We need to handle the delayed extent here.
539 */
540 down_write((&EXT4_I(inode)->i_data_sem));
541 goto delayed_mapped;
542 }
543 ret = check_block_validity(inode, map);
578 if (ret != 0) 544 if (ret != 0)
579 return ret; 545 return ret;
580 } 546 }
@@ -652,12 +618,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
652 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 618 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
653 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 619 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
654 620
655 /* If we have successfully mapped the delayed allocated blocks, 621 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
656 * set the BH_Da_Mapped bit on them. Its important to do this 622 int ret;
657 * under the protection of i_data_sem. 623delayed_mapped:
658 */ 624 /* delayed allocation blocks has been allocated */
659 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 625 ret = ext4_es_remove_extent(inode, map->m_lblk,
660 set_buffers_da_mapped(inode, map); 626 map->m_len);
627 if (ret < 0)
628 retval = ret;
629 }
661 } 630 }
662 631
663 up_write((&EXT4_I(inode)->i_data_sem)); 632 up_write((&EXT4_I(inode)->i_data_sem));
@@ -680,10 +649,13 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
680 int ret = 0, started = 0; 649 int ret = 0, started = 0;
681 int dio_credits; 650 int dio_credits;
682 651
652 if (ext4_has_inline_data(inode))
653 return -ERANGE;
654
683 map.m_lblk = iblock; 655 map.m_lblk = iblock;
684 map.m_len = bh->b_size >> inode->i_blkbits; 656 map.m_len = bh->b_size >> inode->i_blkbits;
685 657
686 if (flags && !handle) { 658 if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
687 /* Direct IO write... */ 659 /* Direct IO write... */
688 if (map.m_len > DIO_MAX_BLOCKS) 660 if (map.m_len > DIO_MAX_BLOCKS)
689 map.m_len = DIO_MAX_BLOCKS; 661 map.m_len = DIO_MAX_BLOCKS;
@@ -798,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
798 return NULL; 770 return NULL;
799} 771}
800 772
801static int walk_page_buffers(handle_t *handle, 773int ext4_walk_page_buffers(handle_t *handle,
802 struct buffer_head *head, 774 struct buffer_head *head,
803 unsigned from, 775 unsigned from,
804 unsigned to, 776 unsigned to,
805 int *partial, 777 int *partial,
806 int (*fn)(handle_t *handle, 778 int (*fn)(handle_t *handle,
807 struct buffer_head *bh)) 779 struct buffer_head *bh))
808{ 780{
809 struct buffer_head *bh; 781 struct buffer_head *bh;
810 unsigned block_start, block_end; 782 unsigned block_start, block_end;
@@ -854,8 +826,8 @@ static int walk_page_buffers(handle_t *handle,
854 * is elevated. We'll still have enough credits for the tiny quotafile 826 * is elevated. We'll still have enough credits for the tiny quotafile
855 * write. 827 * write.
856 */ 828 */
857static int do_journal_get_write_access(handle_t *handle, 829int do_journal_get_write_access(handle_t *handle,
858 struct buffer_head *bh) 830 struct buffer_head *bh)
859{ 831{
860 int dirty = buffer_dirty(bh); 832 int dirty = buffer_dirty(bh);
861 int ret; 833 int ret;
@@ -878,7 +850,7 @@ static int do_journal_get_write_access(handle_t *handle,
878 return ret; 850 return ret;
879} 851}
880 852
881static int ext4_get_block_write(struct inode *inode, sector_t iblock, 853static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
882 struct buffer_head *bh_result, int create); 854 struct buffer_head *bh_result, int create);
883static int ext4_write_begin(struct file *file, struct address_space *mapping, 855static int ext4_write_begin(struct file *file, struct address_space *mapping,
884 loff_t pos, unsigned len, unsigned flags, 856 loff_t pos, unsigned len, unsigned flags,
@@ -902,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
902 from = pos & (PAGE_CACHE_SIZE - 1); 874 from = pos & (PAGE_CACHE_SIZE - 1);
903 to = from + len; 875 to = from + len;
904 876
877 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
878 ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
879 flags, pagep);
880 if (ret < 0)
881 goto out;
882 if (ret == 1) {
883 ret = 0;
884 goto out;
885 }
886 }
887
905retry: 888retry:
906 handle = ext4_journal_start(inode, needed_blocks); 889 handle = ext4_journal_start(inode, needed_blocks);
907 if (IS_ERR(handle)) { 890 if (IS_ERR(handle)) {
@@ -919,6 +902,7 @@ retry:
919 ret = -ENOMEM; 902 ret = -ENOMEM;
920 goto out; 903 goto out;
921 } 904 }
905
922 *pagep = page; 906 *pagep = page;
923 907
924 if (ext4_should_dioread_nolock(inode)) 908 if (ext4_should_dioread_nolock(inode))
@@ -927,8 +911,9 @@ retry:
927 ret = __block_write_begin(page, pos, len, ext4_get_block); 911 ret = __block_write_begin(page, pos, len, ext4_get_block);
928 912
929 if (!ret && ext4_should_journal_data(inode)) { 913 if (!ret && ext4_should_journal_data(inode)) {
930 ret = walk_page_buffers(handle, page_buffers(page), 914 ret = ext4_walk_page_buffers(handle, page_buffers(page),
931 from, to, NULL, do_journal_get_write_access); 915 from, to, NULL,
916 do_journal_get_write_access);
932 } 917 }
933 918
934 if (ret) { 919 if (ret) {
@@ -983,7 +968,12 @@ static int ext4_generic_write_end(struct file *file,
983 struct inode *inode = mapping->host; 968 struct inode *inode = mapping->host;
984 handle_t *handle = ext4_journal_current_handle(); 969 handle_t *handle = ext4_journal_current_handle();
985 970
986 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 971 if (ext4_has_inline_data(inode))
972 copied = ext4_write_inline_data_end(inode, pos, len,
973 copied, page);
974 else
975 copied = block_write_end(file, mapping, pos,
976 len, copied, page, fsdata);
987 977
988 /* 978 /*
989 * No need to use i_size_read() here, the i_size 979 * No need to use i_size_read() here, the i_size
@@ -1134,16 +1124,21 @@ static int ext4_journalled_write_end(struct file *file,
1134 1124
1135 BUG_ON(!ext4_handle_valid(handle)); 1125 BUG_ON(!ext4_handle_valid(handle));
1136 1126
1137 if (copied < len) { 1127 if (ext4_has_inline_data(inode))
1138 if (!PageUptodate(page)) 1128 copied = ext4_write_inline_data_end(inode, pos, len,
1139 copied = 0; 1129 copied, page);
1140 page_zero_new_buffers(page, from+copied, to); 1130 else {
1141 } 1131 if (copied < len) {
1132 if (!PageUptodate(page))
1133 copied = 0;
1134 page_zero_new_buffers(page, from+copied, to);
1135 }
1142 1136
1143 ret = walk_page_buffers(handle, page_buffers(page), from, 1137 ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
1144 to, &partial, write_end_fn); 1138 to, &partial, write_end_fn);
1145 if (!partial) 1139 if (!partial)
1146 SetPageUptodate(page); 1140 SetPageUptodate(page);
1141 }
1147 new_i_size = pos + copied; 1142 new_i_size = pos + copied;
1148 if (new_i_size > inode->i_size) 1143 if (new_i_size > inode->i_size)
1149 i_size_write(inode, pos+copied); 1144 i_size_write(inode, pos+copied);
@@ -1301,6 +1296,7 @@ static void ext4_da_page_release_reservation(struct page *page,
1301 struct inode *inode = page->mapping->host; 1296 struct inode *inode = page->mapping->host;
1302 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1297 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1303 int num_clusters; 1298 int num_clusters;
1299 ext4_fsblk_t lblk;
1304 1300
1305 head = page_buffers(page); 1301 head = page_buffers(page);
1306 bh = head; 1302 bh = head;
@@ -1310,20 +1306,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1310 if ((offset <= curr_off) && (buffer_delay(bh))) { 1306 if ((offset <= curr_off) && (buffer_delay(bh))) {
1311 to_release++; 1307 to_release++;
1312 clear_buffer_delay(bh); 1308 clear_buffer_delay(bh);
1313 clear_buffer_da_mapped(bh);
1314 } 1309 }
1315 curr_off = next_off; 1310 curr_off = next_off;
1316 } while ((bh = bh->b_this_page) != head); 1311 } while ((bh = bh->b_this_page) != head);
1317 1312
1313 if (to_release) {
1314 lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1315 ext4_es_remove_extent(inode, lblk, to_release);
1316 }
1317
1318 /* If we have released all the blocks belonging to a cluster, then we 1318 /* If we have released all the blocks belonging to a cluster, then we
1319 * need to release the reserved space for that cluster. */ 1319 * need to release the reserved space for that cluster. */
1320 num_clusters = EXT4_NUM_B2C(sbi, to_release); 1320 num_clusters = EXT4_NUM_B2C(sbi, to_release);
1321 while (num_clusters > 0) { 1321 while (num_clusters > 0) {
1322 ext4_fsblk_t lblk;
1323 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + 1322 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
1324 ((num_clusters - 1) << sbi->s_cluster_bits); 1323 ((num_clusters - 1) << sbi->s_cluster_bits);
1325 if (sbi->s_cluster_ratio == 1 || 1324 if (sbi->s_cluster_ratio == 1 ||
1326 !ext4_find_delalloc_cluster(inode, lblk, 1)) 1325 !ext4_find_delalloc_cluster(inode, lblk))
1327 ext4_da_release_space(inode, 1); 1326 ext4_da_release_space(inode, 1);
1328 1327
1329 num_clusters--; 1328 num_clusters--;
@@ -1429,8 +1428,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1429 clear_buffer_delay(bh); 1428 clear_buffer_delay(bh);
1430 bh->b_blocknr = pblock; 1429 bh->b_blocknr = pblock;
1431 } 1430 }
1432 if (buffer_da_mapped(bh))
1433 clear_buffer_da_mapped(bh);
1434 if (buffer_unwritten(bh) || 1431 if (buffer_unwritten(bh) ||
1435 buffer_mapped(bh)) 1432 buffer_mapped(bh))
1436 BUG_ON(bh->b_blocknr != pblock); 1433 BUG_ON(bh->b_blocknr != pblock);
@@ -1500,9 +1497,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1500 struct pagevec pvec; 1497 struct pagevec pvec;
1501 struct inode *inode = mpd->inode; 1498 struct inode *inode = mpd->inode;
1502 struct address_space *mapping = inode->i_mapping; 1499 struct address_space *mapping = inode->i_mapping;
1500 ext4_lblk_t start, last;
1503 1501
1504 index = mpd->first_page; 1502 index = mpd->first_page;
1505 end = mpd->next_page - 1; 1503 end = mpd->next_page - 1;
1504
1505 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1506 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1507 ext4_es_remove_extent(inode, start, last - start + 1);
1508
1509 pagevec_init(&pvec, 0);
1506 while (index <= end) { 1510 while (index <= end) {
1507 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1511 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1508 if (nr_pages == 0) 1512 if (nr_pages == 0)
@@ -1656,15 +1660,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1656 1660
1657 for (i = 0; i < map.m_len; i++) 1661 for (i = 0; i < map.m_len; i++)
1658 unmap_underlying_metadata(bdev, map.m_pblk + i); 1662 unmap_underlying_metadata(bdev, map.m_pblk + i);
1659
1660 if (ext4_should_order_data(mpd->inode)) {
1661 err = ext4_jbd2_file_inode(handle, mpd->inode);
1662 if (err) {
1663 /* Only if the journal is aborted */
1664 mpd->retval = err;
1665 goto submit_io;
1666 }
1667 }
1668 } 1663 }
1669 1664
1670 /* 1665 /*
@@ -1795,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1795 * file system block. 1790 * file system block.
1796 */ 1791 */
1797 down_read((&EXT4_I(inode)->i_data_sem)); 1792 down_read((&EXT4_I(inode)->i_data_sem));
1798 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1793 if (ext4_has_inline_data(inode)) {
1794 /*
1795 * We will soon create blocks for this page, and let
1796 * us pretend as if the blocks aren't allocated yet.
1797 * In case of clusters, we have to handle the work
1798 * of mapping from cluster so that the reserved space
1799 * is calculated properly.
1800 */
1801 if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
1802 ext4_find_delalloc_cluster(inode, map->m_lblk))
1803 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
1804 retval = 0;
1805 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1799 retval = ext4_ext_map_blocks(NULL, inode, map, 0); 1806 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1800 else 1807 else
1801 retval = ext4_ind_map_blocks(NULL, inode, map, 0); 1808 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
@@ -1814,6 +1821,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1814 goto out_unlock; 1821 goto out_unlock;
1815 } 1822 }
1816 1823
1824 retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
1825 if (retval)
1826 goto out_unlock;
1827
1817 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served 1828 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
1818 * and it should not appear on the bh->b_state. 1829 * and it should not appear on the bh->b_state.
1819 */ 1830 */
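ext4_da_map_blocks() now records every delayed mapping with ext4_es_insert_extent() before dropping i_data_sem, so later page invalidation and writeback can find the reservation again; on failure it unwinds via out_unlock. A toy userspace model of that bookkeeping, with a fixed-size table standing in for the real extents-status tree (all names here are illustrative):

#include <stdio.h>
#include <errno.h>

struct es_range { unsigned int lblk, len; };

static struct es_range es_tree[16];
static int es_count;

static int es_insert(unsigned int lblk, unsigned int len)
{
	if (es_count == 16)
		return -ENOMEM;          /* caller unwinds, as ext4 does */
	es_tree[es_count].lblk = lblk;
	es_tree[es_count].len = len;
	es_count++;
	return 0;
}

int main(void)
{
	int ret = es_insert(8, 4);       /* mirrors map->m_lblk, map->m_len */

	if (ret)                         /* ext4 would goto out_unlock here */
		fprintf(stderr, "insert failed: %d\n", ret);
	printf("tracked delayed ranges: %d\n", es_count);
	return ret;
}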
@@ -1842,8 +1853,8 @@ out_unlock:
1842 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev 1853 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
1843 * initialized properly. 1854 * initialized properly.
1844 */ 1855 */
1845static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 1856int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1846 struct buffer_head *bh, int create) 1857 struct buffer_head *bh, int create)
1847{ 1858{
1848 struct ext4_map_blocks map; 1859 struct ext4_map_blocks map;
1849 int ret = 0; 1860 int ret = 0;
@@ -1917,15 +1928,29 @@ static int __ext4_journalled_writepage(struct page *page,
1917{ 1928{
1918 struct address_space *mapping = page->mapping; 1929 struct address_space *mapping = page->mapping;
1919 struct inode *inode = mapping->host; 1930 struct inode *inode = mapping->host;
1920 struct buffer_head *page_bufs; 1931 struct buffer_head *page_bufs = NULL;
1921 handle_t *handle = NULL; 1932 handle_t *handle = NULL;
1922 int ret = 0; 1933 int ret = 0, err = 0;
1923 int err; 1934 int inline_data = ext4_has_inline_data(inode);
1935 struct buffer_head *inode_bh = NULL;
1924 1936
1925 ClearPageChecked(page); 1937 ClearPageChecked(page);
1926 page_bufs = page_buffers(page); 1938
1927 BUG_ON(!page_bufs); 1939 if (inline_data) {
1928 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 1940 BUG_ON(page->index != 0);
1941 BUG_ON(len > ext4_get_max_inline_size(inode));
1942 inode_bh = ext4_journalled_write_inline_data(inode, len, page);
1943 if (inode_bh == NULL)
1944 goto out;
1945 } else {
1946 page_bufs = page_buffers(page);
1947 if (!page_bufs) {
1948 BUG();
1949 goto out;
1950 }
1951 ext4_walk_page_buffers(handle, page_bufs, 0, len,
1952 NULL, bget_one);
1953 }
1929 /* As soon as we unlock the page, it can go away, but we have 1954 /* As soon as we unlock the page, it can go away, but we have
1930 * references to buffers so we are safe */ 1955 * references to buffers so we are safe */
1931 unlock_page(page); 1956 unlock_page(page);
@@ -1938,11 +1963,18 @@ static int __ext4_journalled_writepage(struct page *page,
1938 1963
1939 BUG_ON(!ext4_handle_valid(handle)); 1964 BUG_ON(!ext4_handle_valid(handle));
1940 1965
1941 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1966 if (inline_data) {
1942 do_journal_get_write_access); 1967 ret = ext4_journal_get_write_access(handle, inode_bh);
1968
1969 err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
1943 1970
1944 err = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1971 } else {
1945 write_end_fn); 1972 ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
1973 do_journal_get_write_access);
1974
1975 err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
1976 write_end_fn);
1977 }
1946 if (ret == 0) 1978 if (ret == 0)
1947 ret = err; 1979 ret = err;
1948 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1980 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -1950,9 +1982,12 @@ static int __ext4_journalled_writepage(struct page *page,
1950 if (!ret) 1982 if (!ret)
1951 ret = err; 1983 ret = err;
1952 1984
1953 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 1985 if (!ext4_has_inline_data(inode))
1986 ext4_walk_page_buffers(handle, page_bufs, 0, len,
1987 NULL, bput_one);
1954 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1988 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1955out: 1989out:
1990 brelse(inode_bh);
1956 return ret; 1991 return ret;
1957} 1992}
1958 1993
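__ext4_journalled_writepage() now forks on ext4_has_inline_data(): inline files journal the single inode buffer directly, while regular pages still walk their buffer_heads, and in both paths the first error wins through the ret/err merge. A compilable sketch of that two-path shape, with stand-in functions rather than the real JBD2 calls:

#include <stdio.h>

static int journal_inode_buffer(void)
{
	puts("get_write_access + dirty the inode buffer");
	return 0;
}

static int journal_page_buffers(int len)
{
	printf("walk and journal %d bytes of page buffers\n", len);
	return 0;
}

int main(void)
{
	int inline_data = 1;   /* pretend ext4_has_inline_data() said yes */
	int len = 60;

	return inline_data ? journal_inode_buffer()
			   : journal_page_buffers(len);
}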
@@ -2029,8 +2064,8 @@ static int ext4_writepage(struct page *page,
2029 commit_write = 1; 2064 commit_write = 1;
2030 } 2065 }
2031 page_bufs = page_buffers(page); 2066 page_bufs = page_buffers(page);
2032 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2067 if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2033 ext4_bh_delay_or_unwritten)) { 2068 ext4_bh_delay_or_unwritten)) {
2034 /* 2069 /*
2035 * We don't want to do block allocation, so redirty 2070 * We don't want to do block allocation, so redirty
2036 * the page and return. We may reach here when we do 2071 * the page and return. We may reach here when we do
@@ -2096,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2096 * mpage_da_map_and_submit to map a single contiguous memory region 2131 * mpage_da_map_and_submit to map a single contiguous memory region
2097 * and then write them. 2132 * and then write them.
2098 */ 2133 */
2099static int write_cache_pages_da(struct address_space *mapping, 2134static int write_cache_pages_da(handle_t *handle,
2135 struct address_space *mapping,
2100 struct writeback_control *wbc, 2136 struct writeback_control *wbc,
2101 struct mpage_da_data *mpd, 2137 struct mpage_da_data *mpd,
2102 pgoff_t *done_index) 2138 pgoff_t *done_index)
@@ -2175,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping,
2175 wait_on_page_writeback(page); 2211 wait_on_page_writeback(page);
2176 BUG_ON(PageWriteback(page)); 2212 BUG_ON(PageWriteback(page));
2177 2213
2214 /*
2215 * If we have inline data and arrive here, it means that
2216 * we will soon create the block for the 1st page, so
2217 * we'd better clear the inline data here.
2218 */
2219 if (ext4_has_inline_data(inode)) {
2220 BUG_ON(ext4_test_inode_state(inode,
2221 EXT4_STATE_MAY_INLINE_DATA));
2222 ext4_destroy_inline_data(handle, inode);
2223 }
2224
2178 if (mpd->next_page != page->index) 2225 if (mpd->next_page != page->index)
2179 mpd->first_page = page->index; 2226 mpd->first_page = page->index;
2180 mpd->next_page = page->index + 1; 2227 mpd->next_page = page->index + 1;
@@ -2381,7 +2428,8 @@ retry:
2381 * contiguous region of logical blocks that need 2428 * contiguous region of logical blocks that need
2382 * blocks to be allocated by ext4 and submit them. 2429 * blocks to be allocated by ext4 and submit them.
2383 */ 2430 */
2384 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); 2431 ret = write_cache_pages_da(handle, mapping,
2432 wbc, &mpd, &done_index);
2385 /* 2433 /*
2386 * If we have a contiguous extent of pages and we 2434 * If we have a contiguous extent of pages and we
2387 * haven't done the I/O yet, map the blocks and submit 2435 * haven't done the I/O yet, map the blocks and submit
@@ -2445,7 +2493,6 @@ out_writepages:
2445 return ret; 2493 return ret;
2446} 2494}
2447 2495
2448#define FALL_BACK_TO_NONDELALLOC 1
2449static int ext4_nonda_switch(struct super_block *sb) 2496static int ext4_nonda_switch(struct super_block *sb)
2450{ 2497{
2451 s64 free_blocks, dirty_blocks; 2498 s64 free_blocks, dirty_blocks;
@@ -2502,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2502 } 2549 }
2503 *fsdata = (void *)0; 2550 *fsdata = (void *)0;
2504 trace_ext4_da_write_begin(inode, pos, len, flags); 2551 trace_ext4_da_write_begin(inode, pos, len, flags);
2552
2553 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
2554 ret = ext4_da_write_inline_data_begin(mapping, inode,
2555 pos, len, flags,
2556 pagep, fsdata);
2557 if (ret < 0)
2558 goto out;
2559 if (ret == 1) {
2560 ret = 0;
2561 goto out;
2562 }
2563 }
2564
2505retry: 2565retry:
2506 /* 2566 /*
2507 * With delayed allocation, we don't log the i_disksize update 2567 * With delayed allocation, we don't log the i_disksize update
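The inline hook in ext4_da_write_begin() uses a three-way convention: a negative return is an error, 1 means the write was fully served from inline data, and 0 falls through to the normal delayed-allocation path at retry:. A small sketch of that dispatch, using a hypothetical try_inline_write_begin() in place of ext4_da_write_inline_data_begin():

#include <stdio.h>

static int try_inline_write_begin(int fits_inline)
{
	if (fits_inline < 0)
		return -1;              /* propagate errors */
	return fits_inline ? 1 : 0;     /* 1: handled, 0: use block path */
}

int main(void)
{
	int ret = try_inline_write_begin(1);

	if (ret < 0)
		return 1;               /* ext4 would goto out */
	if (ret == 1)
		puts("write satisfied from inline data");
	else
		puts("fall through to the delayed-allocation path");
	return 0;
}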
@@ -2603,22 +2663,13 @@ static int ext4_da_write_end(struct file *file,
2603 * changes. So let's piggyback the i_disksize mark_inode_dirty 2663 * changes. So let's piggyback the i_disksize mark_inode_dirty
2604 * into that. 2664 * into that.
2605 */ 2665 */
2606
2607 new_i_size = pos + copied; 2666 new_i_size = pos + copied;
2608 if (copied && new_i_size > EXT4_I(inode)->i_disksize) { 2667 if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
2609 if (ext4_da_should_update_i_disksize(page, end)) { 2668 if (ext4_has_inline_data(inode) ||
2669 ext4_da_should_update_i_disksize(page, end)) {
2610 down_write(&EXT4_I(inode)->i_data_sem); 2670 down_write(&EXT4_I(inode)->i_data_sem);
2611 if (new_i_size > EXT4_I(inode)->i_disksize) { 2671 if (new_i_size > EXT4_I(inode)->i_disksize)
2612 /*
2613 * Updating i_disksize when extending file
2614 * without needing block allocation
2615 */
2616 if (ext4_should_order_data(inode))
2617 ret = ext4_jbd2_file_inode(handle,
2618 inode);
2619
2620 EXT4_I(inode)->i_disksize = new_i_size; 2672 EXT4_I(inode)->i_disksize = new_i_size;
2621 }
2622 up_write(&EXT4_I(inode)->i_data_sem); 2673 up_write(&EXT4_I(inode)->i_data_sem);
2623 /* We need to mark inode dirty even if 2674 /* We need to mark inode dirty even if
2624 * new_i_size is less than inode->i_size 2675 * new_i_size is less than inode->i_size
@@ -2627,8 +2678,16 @@ static int ext4_da_write_end(struct file *file,
2627 ext4_mark_inode_dirty(handle, inode); 2678 ext4_mark_inode_dirty(handle, inode);
2628 } 2679 }
2629 } 2680 }
2630 ret2 = generic_write_end(file, mapping, pos, len, copied, 2681
2682 if (write_mode != CONVERT_INLINE_DATA &&
2683 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
2684 ext4_has_inline_data(inode))
2685 ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
2686 page);
2687 else
2688 ret2 = generic_write_end(file, mapping, pos, len, copied,
2631 page, fsdata); 2689 page, fsdata);
2690
2632 copied = ret2; 2691 copied = ret2;
2633 if (ret2 < 0) 2692 if (ret2 < 0)
2634 ret = ret2; 2693 ret = ret2;
@@ -2721,6 +2780,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2721 journal_t *journal; 2780 journal_t *journal;
2722 int err; 2781 int err;
2723 2782
2783 /*
2784 * We can get here for an inline file via the FIBMAP ioctl
2785 */
2786 if (ext4_has_inline_data(inode))
2787 return 0;
2788
2724 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 2789 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2725 test_opt(inode->i_sb, DELALLOC)) { 2790 test_opt(inode->i_sb, DELALLOC)) {
2726 /* 2791 /*
@@ -2766,14 +2831,30 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2766 2831
2767static int ext4_readpage(struct file *file, struct page *page) 2832static int ext4_readpage(struct file *file, struct page *page)
2768{ 2833{
2834 int ret = -EAGAIN;
2835 struct inode *inode = page->mapping->host;
2836
2769 trace_ext4_readpage(page); 2837 trace_ext4_readpage(page);
2770 return mpage_readpage(page, ext4_get_block); 2838
2839 if (ext4_has_inline_data(inode))
2840 ret = ext4_readpage_inline(inode, page);
2841
2842 if (ret == -EAGAIN)
2843 return mpage_readpage(page, ext4_get_block);
2844
2845 return ret;
2771} 2846}
2772 2847
2773static int 2848static int
2774ext4_readpages(struct file *file, struct address_space *mapping, 2849ext4_readpages(struct file *file, struct address_space *mapping,
2775 struct list_head *pages, unsigned nr_pages) 2850 struct list_head *pages, unsigned nr_pages)
2776{ 2851{
2852 struct inode *inode = mapping->host;
2853
2854 /* If the file has inline data, no need to do readpages. */
2855 if (ext4_has_inline_data(inode))
2856 return 0;
2857
2777 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2858 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2778} 2859}
2779 2860
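ext4_readpage() seeds ret with -EAGAIN so the inline reader can either satisfy the page or decline, in which case mpage_readpage() runs unchanged; ext4_readpages() simply opts out for inline files. The same hand-off in a runnable userspace sketch (function names are stand-ins):

#include <stdio.h>
#include <errno.h>

static int readpage_inline_demo(int has_inline)
{
	return has_inline ? 0 : -EAGAIN;
}

static int mpage_readpage_demo(void)
{
	puts("read the page via block mapping");
	return 0;
}

int main(void)
{
	int ret = -EAGAIN;
	int has_inline = 0;

	if (has_inline)
		ret = readpage_inline_demo(has_inline);
	if (ret == -EAGAIN)
		ret = mpage_readpage_demo();
	return ret;
}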
@@ -2840,7 +2921,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
2840 * We allocate an uninitialized extent if blocks haven't been allocated. 2921 * We allocate an uninitialized extent if blocks haven't been allocated.
2841 * The extent will be converted to initialized after the IO is complete. 2922 * The extent will be converted to initialized after the IO is complete.
2842 */ 2923 */
2843static int ext4_get_block_write(struct inode *inode, sector_t iblock, 2924int ext4_get_block_write(struct inode *inode, sector_t iblock,
2844 struct buffer_head *bh_result, int create) 2925 struct buffer_head *bh_result, int create)
2845{ 2926{
2846 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 2927 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
@@ -2850,29 +2931,12 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
2850} 2931}
2851 2932
2852static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, 2933static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
2853 struct buffer_head *bh_result, int flags) 2934 struct buffer_head *bh_result, int create)
2854{ 2935{
2855 handle_t *handle = ext4_journal_current_handle(); 2936 ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
2856 struct ext4_map_blocks map; 2937 inode->i_ino, create);
2857 int ret = 0; 2938 return _ext4_get_block(inode, iblock, bh_result,
2858 2939 EXT4_GET_BLOCKS_NO_LOCK);
2859 ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n",
2860 inode->i_ino, flags);
2861
2862 flags = EXT4_GET_BLOCKS_NO_LOCK;
2863
2864 map.m_lblk = iblock;
2865 map.m_len = bh_result->b_size >> inode->i_blkbits;
2866
2867 ret = ext4_map_blocks(handle, inode, &map, flags);
2868 if (ret > 0) {
2869 map_bh(bh_result, inode->i_sb, map.m_pblk);
2870 bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
2871 map.m_flags;
2872 bh_result->b_size = inode->i_sb->s_blocksize * map.m_len;
2873 ret = 0;
2874 }
2875 return ret;
2876} 2940}
2877 2941
2878static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 2942static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
@@ -2978,10 +3042,10 @@ retry:
2978 * fall back to buffered IO. 3042 * fall back to buffered IO.
2979 * 3043 *
2980 * For holes, we fallocate those blocks, mark them as uninitialized 3044 * For holes, we fallocate those blocks, mark them as uninitialized
2981 * If those blocks were preallocated, we make sure they are splited, but 3045 * If those blocks were preallocated, we make sure they are split, but
2982 * still keep the range to write as uninitialized. 3046 * still keep the range to write as uninitialized.
2983 * 3047 *
2984 * The unwrritten extents will be converted to written when DIO is completed. 3048 * The unwritten extents will be converted to written when DIO is completed.
2985 * For async direct IO, since the IO may still be pending when we return, we 3049 * For async direct IO, since the IO may still be pending when we return, we
2986 * set up an end_io call back function, which will do the conversion 3050 * set up an end_io call back function, which will do the conversion
2987 * when async direct IO completed. 3051 * when async direct IO completed.
@@ -2999,125 +3063,120 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2999 struct inode *inode = file->f_mapping->host; 3063 struct inode *inode = file->f_mapping->host;
3000 ssize_t ret; 3064 ssize_t ret;
3001 size_t count = iov_length(iov, nr_segs); 3065 size_t count = iov_length(iov, nr_segs);
3002 3066 int overwrite = 0;
3067 get_block_t *get_block_func = NULL;
3068 int dio_flags = 0;
3003 loff_t final_size = offset + count; 3069 loff_t final_size = offset + count;
3004 if (rw == WRITE && final_size <= inode->i_size) {
3005 int overwrite = 0;
3006 3070
3007 BUG_ON(iocb->private == NULL); 3071 /* Use the old path for reads and writes beyond i_size. */
3072 if (rw != WRITE || final_size > inode->i_size)
3073 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3008 3074
3009 /* If we do a overwrite dio, i_mutex locking can be released */ 3075 BUG_ON(iocb->private == NULL);
3010 overwrite = *((int *)iocb->private);
3011 3076
3012 if (overwrite) { 3077 /* If we do a overwrite dio, i_mutex locking can be released */
3013 atomic_inc(&inode->i_dio_count); 3078 overwrite = *((int *)iocb->private);
3014 down_read(&EXT4_I(inode)->i_data_sem);
3015 mutex_unlock(&inode->i_mutex);
3016 }
3017 3079
3018 /* 3080 if (overwrite) {
3019 * We could direct write to holes and fallocate. 3081 atomic_inc(&inode->i_dio_count);
3020 * 3082 down_read(&EXT4_I(inode)->i_data_sem);
3021 * Allocated blocks to fill the hole are marked as uninitialized 3083 mutex_unlock(&inode->i_mutex);
3022 * to prevent parallel buffered read to expose the stale data 3084 }
3023 * before DIO complete the data IO.
3024 *
3025 * As to previously fallocated extents, ext4 get_block
3026 * will just simply mark the buffer mapped but still
3027 * keep the extents uninitialized.
3028 *
3029 * for non AIO case, we will convert those unwritten extents
3030 * to written after return back from blockdev_direct_IO.
3031 *
3032 * for async DIO, the conversion needs to be defered when
3033 * the IO is completed. The ext4 end_io callback function
3034 * will be called to take care of the conversion work.
3035 * Here for async case, we allocate an io_end structure to
3036 * hook to the iocb.
3037 */
3038 iocb->private = NULL;
3039 ext4_inode_aio_set(inode, NULL);
3040 if (!is_sync_kiocb(iocb)) {
3041 ext4_io_end_t *io_end =
3042 ext4_init_io_end(inode, GFP_NOFS);
3043 if (!io_end) {
3044 ret = -ENOMEM;
3045 goto retake_lock;
3046 }
3047 io_end->flag |= EXT4_IO_END_DIRECT;
3048 iocb->private = io_end;
3049 /*
3050 * we save the io structure for current async
3051 * direct IO, so that later ext4_map_blocks()
3052 * could flag the io structure whether there
3053 * is a unwritten extents needs to be converted
3054 * when IO is completed.
3055 */
3056 ext4_inode_aio_set(inode, io_end);
3057 }
3058 3085
3059 if (overwrite) 3086 /*
3060 ret = __blockdev_direct_IO(rw, iocb, inode, 3087 * We could direct write to holes and fallocate.
3061 inode->i_sb->s_bdev, iov, 3088 *
3062 offset, nr_segs, 3089 * Allocated blocks to fill the hole are marked as
3063 ext4_get_block_write_nolock, 3090 * uninitialized to prevent parallel buffered read to expose
3064 ext4_end_io_dio, 3091 * the stale data before DIO complete the data IO.
3065 NULL, 3092 *
3066 0); 3093 * As to previously fallocated extents, ext4 get_block will
3067 else 3094 * just simply mark the buffer mapped but still keep the
3068 ret = __blockdev_direct_IO(rw, iocb, inode, 3095 * extents uninitialized.
3069 inode->i_sb->s_bdev, iov, 3096 *
3070 offset, nr_segs, 3097 * For non AIO case, we will convert those unwritten extents
3071 ext4_get_block_write, 3098 * to written after return back from blockdev_direct_IO.
3072 ext4_end_io_dio, 3099 *
3073 NULL, 3100 * For async DIO, the conversion needs to be deferred when the
3074 DIO_LOCKING); 3101 * IO is completed. The ext4 end_io callback function will be
3075 if (iocb->private) 3102 * called to take care of the conversion work. Here for async
3076 ext4_inode_aio_set(inode, NULL); 3103 * case, we allocate an io_end structure to hook to the iocb.
3104 */
3105 iocb->private = NULL;
3106 ext4_inode_aio_set(inode, NULL);
3107 if (!is_sync_kiocb(iocb)) {
3108 ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
3109 if (!io_end) {
3110 ret = -ENOMEM;
3111 goto retake_lock;
3112 }
3113 io_end->flag |= EXT4_IO_END_DIRECT;
3114 iocb->private = io_end;
3077 /* 3115 /*
3078 * The io_end structure takes a reference to the inode, 3116 * we save the io structure for current async direct
3079 * that structure needs to be destroyed and the 3117 * IO, so that later ext4_map_blocks() could flag the
3080 * reference to the inode need to be dropped, when IO is 3118 * io structure whether there is a unwritten extents
3081 * complete, even with 0 byte write, or failed. 3119 * needs to be converted when IO is completed.
3082 *
3083 * In the successful AIO DIO case, the io_end structure will be
3084 * destroyed and the reference to the inode will be dropped
3085 * after the end_io call back function is called.
3086 *
3087 * In the case there is 0 byte write, or error case, since
3088 * VFS direct IO won't invoke the end_io call back function,
3089 * we need to free the end_io structure here.
3090 */ 3120 */
3091 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3121 ext4_inode_aio_set(inode, io_end);
3092 ext4_free_io_end(iocb->private); 3122 }
3093 iocb->private = NULL;
3094 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3095 EXT4_STATE_DIO_UNWRITTEN)) {
3096 int err;
3097 /*
3098 * for non AIO case, since the IO is already
3099 * completed, we could do the conversion right here
3100 */
3101 err = ext4_convert_unwritten_extents(inode,
3102 offset, ret);
3103 if (err < 0)
3104 ret = err;
3105 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3106 }
3107 3123
3108 retake_lock: 3124 if (overwrite) {
3109 /* take i_mutex locking again if we do an overwrite dio */ 3125 get_block_func = ext4_get_block_write_nolock;
3110 if (overwrite) { 3126 } else {
3111 inode_dio_done(inode); 3127 get_block_func = ext4_get_block_write;
3112 up_read(&EXT4_I(inode)->i_data_sem); 3128 dio_flags = DIO_LOCKING;
3113 mutex_lock(&inode->i_mutex); 3129 }
3114 } 3130 ret = __blockdev_direct_IO(rw, iocb, inode,
3131 inode->i_sb->s_bdev, iov,
3132 offset, nr_segs,
3133 get_block_func,
3134 ext4_end_io_dio,
3135 NULL,
3136 dio_flags);
3137
3138 if (iocb->private)
3139 ext4_inode_aio_set(inode, NULL);
3140 /*
3141 * The io_end structure takes a reference to the inode, that
3142 * structure needs to be destroyed and the reference to the
3143 * inode needs to be dropped, when IO is complete, even with 0
3144 * byte write, or failed.
3145 *
3146 * In the successful AIO DIO case, the io_end structure will
3147 * be destroyed and the reference to the inode will be dropped
3148 * after the end_io call back function is called.
3149 *
3150 * In the case there is 0 byte write, or error case, since VFS
3151 * direct IO won't invoke the end_io call back function, we
3152 * need to free the end_io structure here.
3153 */
3154 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3155 ext4_free_io_end(iocb->private);
3156 iocb->private = NULL;
3157 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3158 EXT4_STATE_DIO_UNWRITTEN)) {
3159 int err;
3160 /*
3161 * for non AIO case, since the IO is already
3162 * completed, we could do the conversion right here
3163 */
3164 err = ext4_convert_unwritten_extents(inode,
3165 offset, ret);
3166 if (err < 0)
3167 ret = err;
3168 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3169 }
3115 3170
3116 return ret; 3171retake_lock:
3172 /* take i_mutex locking again if we do an overwrite dio */
3173 if (overwrite) {
3174 inode_dio_done(inode);
3175 up_read(&EXT4_I(inode)->i_data_sem);
3176 mutex_lock(&inode->i_mutex);
3117 } 3177 }
3118 3178
3119 /* for write to the end of file case, we fall back to old way */ 3179 return ret;
3120 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3121} 3180}
3122 3181
3123static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3182static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
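The rewrite of ext4_ext_direct_IO() above replaces two near-duplicate __blockdev_direct_IO() invocations with a single call site: the overwrite case picks the no-lock get_block callback, every other write keeps DIO_LOCKING. A compilable sketch of that selection, with a stand-in function-pointer type instead of the kernel's get_block_t:

#include <stdio.h>

typedef int (*get_block_fn)(void);

static int get_block_nolock(void) { return puts("map without i_data_sem"); }
static int get_block_locked(void) { return puts("map with locking"); }

#define DIO_LOCKING 0x1

int main(void)
{
	int overwrite = 0;
	get_block_fn get_block_func;
	int dio_flags = 0;

	if (overwrite) {
		get_block_func = get_block_nolock;
	} else {
		get_block_func = get_block_locked;
		dio_flags = DIO_LOCKING;
	}
	get_block_func();                /* single call site, as in the hunk */
	printf("dio_flags=%#x\n", dio_flags);
	return 0;
}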
@@ -3134,6 +3193,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3134 if (ext4_should_journal_data(inode)) 3193 if (ext4_should_journal_data(inode))
3135 return 0; 3194 return 0;
3136 3195
3196 /* Let buffer I/O handle the inline data case. */
3197 if (ext4_has_inline_data(inode))
3198 return 0;
3199
3137 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3200 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3138 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3201 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3139 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3202 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -3531,6 +3594,14 @@ void ext4_truncate(struct inode *inode)
3531 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 3594 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3532 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 3595 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
3533 3596
3597 if (ext4_has_inline_data(inode)) {
3598 int has_inline = 1;
3599
3600 ext4_inline_data_truncate(inode, &has_inline);
3601 if (has_inline)
3602 return;
3603 }
3604
3534 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3605 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3535 ext4_ext_truncate(inode); 3606 ext4_ext_truncate(inode);
3536 else 3607 else
@@ -3756,6 +3827,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
3756 } 3827 }
3757} 3828}
3758 3829
3830static inline void ext4_iget_extra_inode(struct inode *inode,
3831 struct ext4_inode *raw_inode,
3832 struct ext4_inode_info *ei)
3833{
3834 __le32 *magic = (void *)raw_inode +
3835 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
3836 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
3837 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
3838 ext4_find_inline_data_nolock(inode);
3839 } else
3840 EXT4_I(inode)->i_inline_off = 0;
3841}
3842
3759struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 3843struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3760{ 3844{
3761 struct ext4_iloc iloc; 3845 struct ext4_iloc iloc;
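ext4_iget_extra_inode() probes just past EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize for the xattr magic; only when the magic is present is it worth calling ext4_find_inline_data_nolock(), otherwise i_inline_off stays 0. An illustrative userspace version of the same probe (the buffer layout is assumed, though 128 and 0xEA020000 do match the on-disk constants):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define GOOD_OLD_INODE_SIZE 128
#define XATTR_MAGIC 0xEA020000u

int main(void)
{
	unsigned char raw_inode[256] = {0};
	uint16_t i_extra_isize = 32;
	uint32_t magic = XATTR_MAGIC;
	uint32_t found;

	/* pretend mkfs wrote the magic at the expected offset */
	memcpy(raw_inode + GOOD_OLD_INODE_SIZE + i_extra_isize,
	       &magic, sizeof(magic));

	memcpy(&found, raw_inode + GOOD_OLD_INODE_SIZE + i_extra_isize,
	       sizeof(found));
	if (found == XATTR_MAGIC)
		puts("in-inode xattr space found: scan it for inline data");
	else
		puts("no xattr area: i_inline_off = 0");
	return 0;
}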
@@ -3826,6 +3910,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3826 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 3910 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
3827 3911
3828 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 3912 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
3913 ei->i_inline_off = 0;
3829 ei->i_dir_start_lookup = 0; 3914 ei->i_dir_start_lookup = 0;
3830 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 3915 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
3831 /* We now have enough fields to check if the inode was active or not. 3916 /* We now have enough fields to check if the inode was active or not.
@@ -3898,11 +3983,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3898 ei->i_extra_isize = sizeof(struct ext4_inode) - 3983 ei->i_extra_isize = sizeof(struct ext4_inode) -
3899 EXT4_GOOD_OLD_INODE_SIZE; 3984 EXT4_GOOD_OLD_INODE_SIZE;
3900 } else { 3985 } else {
3901 __le32 *magic = (void *)raw_inode + 3986 ext4_iget_extra_inode(inode, raw_inode, ei);
3902 EXT4_GOOD_OLD_INODE_SIZE +
3903 ei->i_extra_isize;
3904 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
3905 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
3906 } 3987 }
3907 } 3988 }
3908 3989
@@ -3925,17 +4006,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3925 ei->i_file_acl); 4006 ei->i_file_acl);
3926 ret = -EIO; 4007 ret = -EIO;
3927 goto bad_inode; 4008 goto bad_inode;
3928 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4009 } else if (!ext4_has_inline_data(inode)) {
3929 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4010 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3930 (S_ISLNK(inode->i_mode) && 4011 if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3931 !ext4_inode_is_fast_symlink(inode))) 4012 (S_ISLNK(inode->i_mode) &&
3932 /* Validate extent which is part of inode */ 4013 !ext4_inode_is_fast_symlink(inode))))
3933 ret = ext4_ext_check_inode(inode); 4014 /* Validate extent which is part of inode */
3934 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4015 ret = ext4_ext_check_inode(inode);
3935 (S_ISLNK(inode->i_mode) && 4016 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3936 !ext4_inode_is_fast_symlink(inode))) { 4017 (S_ISLNK(inode->i_mode) &&
3937 /* Validate block references which are part of inode */ 4018 !ext4_inode_is_fast_symlink(inode))) {
3938 ret = ext4_ind_check_inode(inode); 4019 /* Validate block references which are part of inode */
4020 ret = ext4_ind_check_inode(inode);
4021 }
3939 } 4022 }
3940 if (ret) 4023 if (ret)
3941 goto bad_inode; 4024 goto bad_inode;
@@ -4122,9 +4205,10 @@ static int ext4_do_update_inode(handle_t *handle,
4122 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4205 cpu_to_le32(new_encode_dev(inode->i_rdev));
4123 raw_inode->i_block[2] = 0; 4206 raw_inode->i_block[2] = 0;
4124 } 4207 }
4125 } else 4208 } else if (!ext4_has_inline_data(inode)) {
4126 for (block = 0; block < EXT4_N_BLOCKS; block++) 4209 for (block = 0; block < EXT4_N_BLOCKS; block++)
4127 raw_inode->i_block[block] = ei->i_data[block]; 4210 raw_inode->i_block[block] = ei->i_data[block];
4211 }
4128 4212
4129 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4213 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4130 if (ei->i_extra_isize) { 4214 if (ei->i_extra_isize) {
@@ -4811,8 +4895,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4811 * journal_start/journal_stop which can block and take a long time 4895 * journal_start/journal_stop which can block and take a long time
4812 */ 4896 */
4813 if (page_has_buffers(page)) { 4897 if (page_has_buffers(page)) {
4814 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 4898 if (!ext4_walk_page_buffers(NULL, page_buffers(page),
4815 ext4_bh_unmapped)) { 4899 0, len, NULL,
4900 ext4_bh_unmapped)) {
4816 /* Wait so that we don't change page under IO */ 4901 /* Wait so that we don't change page under IO */
4817 wait_on_page_writeback(page); 4902 wait_on_page_writeback(page);
4818 ret = VM_FAULT_LOCKED; 4903 ret = VM_FAULT_LOCKED;
@@ -4833,7 +4918,7 @@ retry_alloc:
4833 } 4918 }
4834 ret = __block_page_mkwrite(vma, vmf, get_block); 4919 ret = __block_page_mkwrite(vma, vmf, get_block);
4835 if (!ret && ext4_should_journal_data(inode)) { 4920 if (!ret && ext4_should_journal_data(inode)) {
4836 if (walk_page_buffers(handle, page_buffers(page), 0, 4921 if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
4837 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 4922 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
4838 unlock_page(page); 4923 unlock_page(page);
4839 ret = VM_FAULT_SIGBUS; 4924 ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 526e55358606..1bf6fe785c4f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1373,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
1373 ex->fe_start += next; 1373 ex->fe_start += next;
1374 1374
1375 while (needed > ex->fe_len && 1375 while (needed > ex->fe_len &&
1376 (buddy = mb_find_buddy(e4b, order, &max))) { 1376 mb_find_buddy(e4b, order, &max)) {
1377 1377
1378 if (block + 1 >= max) 1378 if (block + 1 >= max)
1379 break; 1379 break;
@@ -2607,9 +2607,17 @@ static void ext4_free_data_callback(struct super_block *sb,
2607 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2607 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2608 entry->efd_count, entry->efd_group, entry); 2608 entry->efd_count, entry->efd_group, entry);
2609 2609
2610 if (test_opt(sb, DISCARD)) 2610 if (test_opt(sb, DISCARD)) {
2611 ext4_issue_discard(sb, entry->efd_group, 2611 err = ext4_issue_discard(sb, entry->efd_group,
2612 entry->efd_start_cluster, entry->efd_count); 2612 entry->efd_start_cluster,
2613 entry->efd_count);
2614 if (err && err != -EOPNOTSUPP)
2615 ext4_msg(sb, KERN_WARNING, "discard request in"
2616 " group:%d block:%d count:%d failed"
2617 " with %d", entry->efd_group,
2618 entry->efd_start_cluster,
2619 entry->efd_count, err);
2620 }
2613 2621
2614 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); 2622 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2615 /* we expect to find existing buddy because it's pinned */ 2623 /* we expect to find existing buddy because it's pinned */
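Callers of ext4_issue_discard() now check the result, warn on real failures, and deliberately swallow -EOPNOTSUPP, since a device without discard support is expected and not worth logging; freeing the blocks proceeds either way. The policy in miniature, with a stand-in discard function:

#include <stdio.h>
#include <errno.h>

static int issue_discard_demo(int device_supports_discard)
{
	return device_supports_discard ? 0 : -EOPNOTSUPP;
}

int main(void)
{
	int err = issue_discard_demo(0);

	if (err && err != -EOPNOTSUPP)
		fprintf(stderr, "discard failed with %d\n", err);
	/* either way the blocks are still freed */
	puts("continue freeing blocks");
	return 0;
}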
@@ -4310,8 +4318,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4310repeat: 4318repeat:
4311 /* allocate space in core */ 4319 /* allocate space in core */
4312 *errp = ext4_mb_regular_allocator(ac); 4320 *errp = ext4_mb_regular_allocator(ac);
4313 if (*errp) 4321 if (*errp) {
4322 ext4_discard_allocated_blocks(ac);
4314 goto errout; 4323 goto errout;
4324 }
4315 4325
4316 /* as we've just preallocated more space than 4326 /* as we've just preallocated more space than
4317 * user requested originally, we store allocated 4327 * user requested originally, we store allocated
@@ -4333,10 +4343,10 @@ repeat:
4333 ac->ac_b_ex.fe_len = 0; 4343 ac->ac_b_ex.fe_len = 0;
4334 ac->ac_status = AC_STATUS_CONTINUE; 4344 ac->ac_status = AC_STATUS_CONTINUE;
4335 goto repeat; 4345 goto repeat;
4336 } else if (*errp) 4346 } else if (*errp) {
4337 errout:
4338 ext4_discard_allocated_blocks(ac); 4347 ext4_discard_allocated_blocks(ac);
4339 else { 4348 goto errout;
4349 } else {
4340 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4350 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4341 ar->len = ac->ac_b_ex.fe_len; 4351 ar->len = ac->ac_b_ex.fe_len;
4342 } 4352 }
@@ -4347,6 +4357,7 @@ repeat:
4347 *errp = -ENOSPC; 4357 *errp = -ENOSPC;
4348 } 4358 }
4349 4359
4360errout:
4350 if (*errp) { 4361 if (*errp) {
4351 ac->ac_b_ex.fe_len = 0; 4362 ac->ac_b_ex.fe_len = 0;
4352 ar->len = 0; 4363 ar->len = 0;
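ext4_mb_new_blocks() now funnels every failure through the single errout label, which calls ext4_discard_allocated_blocks() once instead of duplicating the cleanup at each failure site. The shape of that consolidation, reduced to a compilable sketch:

#include <stdio.h>

static void discard_allocated_demo(void)
{
	puts("discard whatever was preallocated");
}

int main(void)
{
	int errp = -1;          /* pretend the regular allocator failed */

	if (errp)
		goto errout;
	puts("hand the allocated block to the caller");
	return 0;
errout:
	discard_allocated_demo();
	return 1;
}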
@@ -4656,8 +4667,16 @@ do_more:
4656 * with group lock held. generate_buddy look at 4667 * with group lock held. generate_buddy look at
4657 * them with group lock_held 4668 * them with group lock_held
4658 */ 4669 */
4659 if (test_opt(sb, DISCARD)) 4670 if (test_opt(sb, DISCARD)) {
4660 ext4_issue_discard(sb, block_group, bit, count); 4671 err = ext4_issue_discard(sb, block_group, bit, count);
4672 if (err && err != -EOPNOTSUPP)
4673 ext4_msg(sb, KERN_WARNING, "discard request in"
4674 " group:%d block:%d count:%lu failed"
4675 " with %d", block_group, bit, count,
4676 err);
4677 }
4678
4679
4661 ext4_lock_group(sb, block_group); 4680 ext4_lock_group(sb, block_group);
4662 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4681 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4663 mb_free_blocks(inode, &e4b, bit, count_clusters); 4682 mb_free_blocks(inode, &e4b, bit, count_clusters);
@@ -4851,10 +4870,11 @@ error_return:
4851 * one will allocate those blocks, mark it as used in buddy bitmap. This must 4870 * one will allocate those blocks, mark it as used in buddy bitmap. This must
4852 * be called under the group lock. 4871 * be called under the group lock.
4853 */ 4872 */
4854static void ext4_trim_extent(struct super_block *sb, int start, int count, 4873static int ext4_trim_extent(struct super_block *sb, int start, int count,
4855 ext4_group_t group, struct ext4_buddy *e4b) 4874 ext4_group_t group, struct ext4_buddy *e4b)
4856{ 4875{
4857 struct ext4_free_extent ex; 4876 struct ext4_free_extent ex;
4877 int ret = 0;
4858 4878
4859 trace_ext4_trim_extent(sb, group, start, count); 4879 trace_ext4_trim_extent(sb, group, start, count);
4860 4880
@@ -4870,9 +4890,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
4870 */ 4890 */
4871 mb_mark_used(e4b, &ex); 4891 mb_mark_used(e4b, &ex);
4872 ext4_unlock_group(sb, group); 4892 ext4_unlock_group(sb, group);
4873 ext4_issue_discard(sb, group, start, count); 4893 ret = ext4_issue_discard(sb, group, start, count);
4874 ext4_lock_group(sb, group); 4894 ext4_lock_group(sb, group);
4875 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4895 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4896 return ret;
4876} 4897}
4877 4898
4878/** 4899/**
@@ -4901,7 +4922,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4901 void *bitmap; 4922 void *bitmap;
4902 ext4_grpblk_t next, count = 0, free_count = 0; 4923 ext4_grpblk_t next, count = 0, free_count = 0;
4903 struct ext4_buddy e4b; 4924 struct ext4_buddy e4b;
4904 int ret; 4925 int ret = 0;
4905 4926
4906 trace_ext4_trim_all_free(sb, group, start, max); 4927 trace_ext4_trim_all_free(sb, group, start, max);
4907 4928
@@ -4928,8 +4949,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4928 next = mb_find_next_bit(bitmap, max + 1, start); 4949 next = mb_find_next_bit(bitmap, max + 1, start);
4929 4950
4930 if ((next - start) >= minblocks) { 4951 if ((next - start) >= minblocks) {
4931 ext4_trim_extent(sb, start, 4952 ret = ext4_trim_extent(sb, start,
4932 next - start, group, &e4b); 4953 next - start, group, &e4b);
4954 if (ret && ret != -EOPNOTSUPP)
4955 break;
4956 ret = 0;
4933 count += next - start; 4957 count += next - start;
4934 } 4958 }
4935 free_count += next - start; 4959 free_count += next - start;
@@ -4950,8 +4974,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4950 break; 4974 break;
4951 } 4975 }
4952 4976
4953 if (!ret) 4977 if (!ret) {
4978 ret = count;
4954 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); 4979 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
4980 }
4955out: 4981out:
4956 ext4_unlock_group(sb, group); 4982 ext4_unlock_group(sb, group);
4957 ext4_mb_unload_buddy(&e4b); 4983 ext4_mb_unload_buddy(&e4b);
@@ -4959,7 +4985,7 @@ out:
4959 ext4_debug("trimmed %d blocks in the group %d\n", 4985 ext4_debug("trimmed %d blocks in the group %d\n",
4960 count, group); 4986 count, group);
4961 4987
4962 return count; 4988 return ret;
4963} 4989}
4964 4990
4965/** 4991/**
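With ext4_trim_extent() now returning an int, ext4_trim_all_free() can follow the usual kernel convention: a non-negative count of trimmed clusters on success and a negative errno on failure, with -EOPNOTSUPP tolerated mid-loop. A userspace sketch of that convention (trim_group_demo() is hypothetical):

#include <stdio.h>
#include <errno.h>

static int trim_group_demo(int discard_ok, int count)
{
	int ret = discard_ok ? 0 : -EOPNOTSUPP;

	if (ret && ret != -EOPNOTSUPP)
		return ret;     /* hard failure: propagate the errno */
	return count;           /* success: report clusters trimmed */
}

int main(void)
{
	int ret = trim_group_demo(1, 128);

	if (ret < 0)
		fprintf(stderr, "trim failed: %d\n", ret);
	else
		printf("trimmed %d clusters\n", ret);
	return 0;
}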
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f1bb32ec0169..db8226d595fa 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -14,6 +14,7 @@
14 14
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include "ext4_jbd2.h" 16#include "ext4_jbd2.h"
17#include "ext4_extents.h"
17 18
18/* 19/*
19 * The contiguous blocks details which can be 20 * The contiguous blocks details which can be
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 292daeeed455..d9cc5ee42f53 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
20#include "ext4.h" 20#include "ext4.h"
21#include "ext4_extents.h"
21 22
22/** 23/**
23 * get_ext_path - Find an extent path for designated logical block number. 24 * get_ext_path - Find an extent path for designated logical block number.
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d600a69fc9d..cac448282331 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -202,13 +202,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
202 struct inode *inode); 202 struct inode *inode);
203 203
204/* checksumming functions */ 204/* checksumming functions */
205#define EXT4_DIRENT_TAIL(block, blocksize) \ 205void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
206 ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ 206 unsigned int blocksize)
207 ((blocksize) - \
208 sizeof(struct ext4_dir_entry_tail))))
209
210static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
211 unsigned int blocksize)
212{ 207{
213 memset(t, 0, sizeof(struct ext4_dir_entry_tail)); 208 memset(t, 0, sizeof(struct ext4_dir_entry_tail));
214 t->det_rec_len = ext4_rec_len_to_disk( 209 t->det_rec_len = ext4_rec_len_to_disk(
@@ -261,6 +256,12 @@ static __le32 ext4_dirent_csum(struct inode *inode,
261 return cpu_to_le32(csum); 256 return cpu_to_le32(csum);
262} 257}
263 258
259static void warn_no_space_for_csum(struct inode *inode)
260{
261 ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
262 "checksum. Please run e2fsck -D.", inode->i_ino);
263}
264
264int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) 265int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
265{ 266{
266 struct ext4_dir_entry_tail *t; 267 struct ext4_dir_entry_tail *t;
@@ -271,8 +272,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
271 272
272 t = get_dirent_tail(inode, dirent); 273 t = get_dirent_tail(inode, dirent);
273 if (!t) { 274 if (!t) {
274 EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " 275 warn_no_space_for_csum(inode);
275 "leaf for checksum. Please run e2fsck -D.");
276 return 0; 276 return 0;
277 } 277 }
278 278
@@ -294,8 +294,7 @@ static void ext4_dirent_csum_set(struct inode *inode,
294 294
295 t = get_dirent_tail(inode, dirent); 295 t = get_dirent_tail(inode, dirent);
296 if (!t) { 296 if (!t) {
297 EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " 297 warn_no_space_for_csum(inode);
298 "leaf for checksum. Please run e2fsck -D.");
299 return; 298 return;
300 } 299 }
301 300
@@ -303,9 +302,9 @@ static void ext4_dirent_csum_set(struct inode *inode,
303 (void *)t - (void *)dirent); 302 (void *)t - (void *)dirent);
304} 303}
305 304
306static inline int ext4_handle_dirty_dirent_node(handle_t *handle, 305int ext4_handle_dirty_dirent_node(handle_t *handle,
307 struct inode *inode, 306 struct inode *inode,
308 struct buffer_head *bh) 307 struct buffer_head *bh)
309{ 308{
310 ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); 309 ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
311 return ext4_handle_dirty_metadata(handle, inode, bh); 310 return ext4_handle_dirty_metadata(handle, inode, bh);
@@ -377,8 +376,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
377 count = le16_to_cpu(c->count); 376 count = le16_to_cpu(c->count);
378 if (count_offset + (limit * sizeof(struct dx_entry)) > 377 if (count_offset + (limit * sizeof(struct dx_entry)) >
379 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { 378 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
380 EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " 379 warn_no_space_for_csum(inode);
381 "tree checksum found. Run e2fsck -D.");
382 return 1; 380 return 1;
383 } 381 }
384 t = (struct dx_tail *)(((struct dx_entry *)c) + limit); 382 t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -408,8 +406,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
408 count = le16_to_cpu(c->count); 406 count = le16_to_cpu(c->count);
409 if (count_offset + (limit * sizeof(struct dx_entry)) > 407 if (count_offset + (limit * sizeof(struct dx_entry)) >
410 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { 408 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
411 EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " 409 warn_no_space_for_csum(inode);
412 "tree checksum. Run e2fsck -D.");
413 return; 410 return;
414 } 411 }
415 t = (struct dx_tail *)(((struct dx_entry *)c) + limit); 412 t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -890,6 +887,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
890 EXT4_DIR_REC_LEN(0)); 887 EXT4_DIR_REC_LEN(0));
891 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 888 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
892 if (ext4_check_dir_entry(dir, NULL, de, bh, 889 if (ext4_check_dir_entry(dir, NULL, de, bh,
890 bh->b_data, bh->b_size,
893 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 891 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
894 + ((char *)de - bh->b_data))) { 892 + ((char *)de - bh->b_data))) {
895 /* On error, skip the f_pos to the next block. */ 893 /* On error, skip the f_pos to the next block. */
@@ -1007,6 +1005,15 @@ errout:
1007 return (err); 1005 return (err);
1008} 1006}
1009 1007
1008static inline int search_dirblock(struct buffer_head *bh,
1009 struct inode *dir,
1010 const struct qstr *d_name,
1011 unsigned int offset,
1012 struct ext4_dir_entry_2 **res_dir)
1013{
1014 return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
1015 d_name, offset, res_dir);
1016}
1010 1017
1011/* 1018/*
1012 * Directory block splitting, compacting 1019 * Directory block splitting, compacting
@@ -1081,13 +1088,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
1081 dx_set_count(entries, count + 1); 1088 dx_set_count(entries, count + 1);
1082} 1089}
1083 1090
1084static void ext4_update_dx_flag(struct inode *inode)
1085{
1086 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
1087 EXT4_FEATURE_COMPAT_DIR_INDEX))
1088 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
1089}
1090
1091/* 1091/*
1092 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. 1092 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
1093 * 1093 *
@@ -1107,11 +1107,13 @@ static inline int ext4_match (int len, const char * const name,
1107/* 1107/*
1108 * Returns 0 if not found, -1 on failure, and 1 on success 1108 * Returns 0 if not found, -1 on failure, and 1 on success
1109 */ 1109 */
1110static inline int search_dirblock(struct buffer_head *bh, 1110int search_dir(struct buffer_head *bh,
1111 struct inode *dir, 1111 char *search_buf,
1112 const struct qstr *d_name, 1112 int buf_size,
1113 unsigned int offset, 1113 struct inode *dir,
1114 struct ext4_dir_entry_2 ** res_dir) 1114 const struct qstr *d_name,
1115 unsigned int offset,
1116 struct ext4_dir_entry_2 **res_dir)
1115{ 1117{
1116 struct ext4_dir_entry_2 * de; 1118 struct ext4_dir_entry_2 * de;
1117 char * dlimit; 1119 char * dlimit;
@@ -1119,8 +1121,8 @@ static inline int search_dirblock(struct buffer_head *bh,
1119 const char *name = d_name->name; 1121 const char *name = d_name->name;
1120 int namelen = d_name->len; 1122 int namelen = d_name->len;
1121 1123
1122 de = (struct ext4_dir_entry_2 *) bh->b_data; 1124 de = (struct ext4_dir_entry_2 *)search_buf;
1123 dlimit = bh->b_data + dir->i_sb->s_blocksize; 1125 dlimit = search_buf + buf_size;
1124 while ((char *) de < dlimit) { 1126 while ((char *) de < dlimit) {
1125 /* this code is executed quadratically often */ 1127 /* this code is executed quadratically often */
1126 /* do minimal checking `by hand' */ 1128 /* do minimal checking `by hand' */
@@ -1128,7 +1130,8 @@ static inline int search_dirblock(struct buffer_head *bh,
1128 if ((char *) de + namelen <= dlimit && 1130 if ((char *) de + namelen <= dlimit &&
1129 ext4_match (namelen, name, de)) { 1131 ext4_match (namelen, name, de)) {
1130 /* found a match - just to be sure, do a full check */ 1132 /* found a match - just to be sure, do a full check */
1131 if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) 1133 if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
1134 bh->b_size, offset))
1132 return -1; 1135 return -1;
1133 *res_dir = de; 1136 *res_dir = de;
1134 return 1; 1137 return 1;
@@ -1144,6 +1147,21 @@ static inline int search_dirblock(struct buffer_head *bh,
1144 return 0; 1147 return 0;
1145} 1148}
1146 1149
1150static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
1151 struct ext4_dir_entry *de)
1152{
1153 struct super_block *sb = dir->i_sb;
1154
1155 if (!is_dx(dir))
1156 return 0;
1157 if (block == 0)
1158 return 1;
1159 if (de->inode == 0 &&
1160 ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
1161 sb->s_blocksize)
1162 return 1;
1163 return 0;
1164}
1147 1165
1148/* 1166/*
1149 * ext4_find_entry() 1167 * ext4_find_entry()
@@ -1158,7 +1176,8 @@ static inline int search_dirblock(struct buffer_head *bh,
1158 */ 1176 */
1159static struct buffer_head * ext4_find_entry (struct inode *dir, 1177static struct buffer_head * ext4_find_entry (struct inode *dir,
1160 const struct qstr *d_name, 1178 const struct qstr *d_name,
1161 struct ext4_dir_entry_2 ** res_dir) 1179 struct ext4_dir_entry_2 **res_dir,
1180 int *inlined)
1162{ 1181{
1163 struct super_block *sb; 1182 struct super_block *sb;
1164 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 1183 struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -1179,6 +1198,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1179 namelen = d_name->len; 1198 namelen = d_name->len;
1180 if (namelen > EXT4_NAME_LEN) 1199 if (namelen > EXT4_NAME_LEN)
1181 return NULL; 1200 return NULL;
1201
1202 if (ext4_has_inline_data(dir)) {
1203 int has_inline_data = 1;
1204 ret = ext4_find_inline_entry(dir, d_name, res_dir,
1205 &has_inline_data);
1206 if (has_inline_data) {
1207 if (inlined)
1208 *inlined = 1;
1209 return ret;
1210 }
1211 }
1212
1182 if ((namelen <= 2) && (name[0] == '.') && 1213 if ((namelen <= 2) && (name[0] == '.') &&
1183 (name[1] == '.' || name[1] == '\0')) { 1214 (name[1] == '.' || name[1] == '\0')) {
1184 /* 1215 /*
@@ -1244,6 +1275,8 @@ restart:
1244 goto next; 1275 goto next;
1245 } 1276 }
1246 if (!buffer_verified(bh) && 1277 if (!buffer_verified(bh) &&
1278 !is_dx_internal_node(dir, block,
1279 (struct ext4_dir_entry *)bh->b_data) &&
1247 !ext4_dirent_csum_verify(dir, 1280 !ext4_dirent_csum_verify(dir,
1248 (struct ext4_dir_entry *)bh->b_data)) { 1281 (struct ext4_dir_entry *)bh->b_data)) {
1249 EXT4_ERROR_INODE(dir, "checksumming directory " 1282 EXT4_ERROR_INODE(dir, "checksumming directory "
@@ -1361,7 +1394,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
1361 if (dentry->d_name.len > EXT4_NAME_LEN) 1394 if (dentry->d_name.len > EXT4_NAME_LEN)
1362 return ERR_PTR(-ENAMETOOLONG); 1395 return ERR_PTR(-ENAMETOOLONG);
1363 1396
1364 bh = ext4_find_entry(dir, &dentry->d_name, &de); 1397 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
1365 inode = NULL; 1398 inode = NULL;
1366 if (bh) { 1399 if (bh) {
1367 __u32 ino = le32_to_cpu(de->inode); 1400 __u32 ino = le32_to_cpu(de->inode);
@@ -1395,7 +1428,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1395 struct ext4_dir_entry_2 * de; 1428 struct ext4_dir_entry_2 * de;
1396 struct buffer_head *bh; 1429 struct buffer_head *bh;
1397 1430
1398 bh = ext4_find_entry(child->d_inode, &dotdot, &de); 1431 bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
1399 if (!bh) 1432 if (!bh)
1400 return ERR_PTR(-ENOENT); 1433 return ERR_PTR(-ENOENT);
1401 ino = le32_to_cpu(de->inode); 1434 ino = le32_to_cpu(de->inode);
@@ -1593,6 +1626,63 @@ errout:
1593 return NULL; 1626 return NULL;
1594} 1627}
1595 1628
1629int ext4_find_dest_de(struct inode *dir, struct inode *inode,
1630 struct buffer_head *bh,
1631 void *buf, int buf_size,
1632 const char *name, int namelen,
1633 struct ext4_dir_entry_2 **dest_de)
1634{
1635 struct ext4_dir_entry_2 *de;
1636 unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
1637 int nlen, rlen;
1638 unsigned int offset = 0;
1639 char *top;
1640
1641 de = (struct ext4_dir_entry_2 *)buf;
1642 top = buf + buf_size - reclen;
1643 while ((char *) de <= top) {
1644 if (ext4_check_dir_entry(dir, NULL, de, bh,
1645 buf, buf_size, offset))
1646 return -EIO;
1647 if (ext4_match(namelen, name, de))
1648 return -EEXIST;
1649 nlen = EXT4_DIR_REC_LEN(de->name_len);
1650 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
1651 if ((de->inode ? rlen - nlen : rlen) >= reclen)
1652 break;
1653 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1654 offset += rlen;
1655 }
1656 if ((char *) de > top)
1657 return -ENOSPC;
1658
1659 *dest_de = de;
1660 return 0;
1661}
1662
1663void ext4_insert_dentry(struct inode *inode,
1664 struct ext4_dir_entry_2 *de,
1665 int buf_size,
1666 const char *name, int namelen)
1667{
1668
1669 int nlen, rlen;
1670
1671 nlen = EXT4_DIR_REC_LEN(de->name_len);
1672 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
1673 if (de->inode) {
1674 struct ext4_dir_entry_2 *de1 =
1675 (struct ext4_dir_entry_2 *)((char *)de + nlen);
1676 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
1677 de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
1678 de = de1;
1679 }
1680 de->file_type = EXT4_FT_UNKNOWN;
1681 de->inode = cpu_to_le32(inode->i_ino);
1682 ext4_set_de_type(inode->i_sb, de, inode->i_mode);
1683 de->name_len = namelen;
1684 memcpy(de->name, name, namelen);
1685}
1596/* 1686/*
1597 * Add a new entry into a directory (leaf) block. If de is non-NULL, 1687 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1598 * it points to a directory entry which is guaranteed to be large 1688 * it points to a directory entry which is guaranteed to be large
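ext4_find_dest_de() factors out the classic dirent fit test so the inline-data code can reuse it: a live entry only offers its slack (rlen - nlen), while a deleted entry (inode == 0) can be reused whole. A standalone sketch of the test, using the 4-byte-rounded EXT4_DIR_REC_LEN() shape with made-up lengths:

#include <stdio.h>

#define DIR_REC_LEN(name_len) (((name_len) + 8 + 3) & ~3u)

int main(void)
{
	unsigned int de_inode = 12;           /* nonzero: entry is in use */
	unsigned int de_name_len = 5;
	unsigned int rlen = 40;               /* on-disk rec_len of this entry */
	unsigned int reclen = DIR_REC_LEN(11);   /* space the new name needs */

	unsigned int nlen = DIR_REC_LEN(de_name_len);
	unsigned int avail = de_inode ? rlen - nlen : rlen;

	printf("need %u, have %u -> %s\n", reclen, avail,
	       avail >= reclen ? "fits here" : "keep scanning");
	return 0;
}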
@@ -1608,12 +1698,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1608 struct inode *dir = dentry->d_parent->d_inode; 1698 struct inode *dir = dentry->d_parent->d_inode;
1609 const char *name = dentry->d_name.name; 1699 const char *name = dentry->d_name.name;
1610 int namelen = dentry->d_name.len; 1700 int namelen = dentry->d_name.len;
1611 unsigned int offset = 0;
1612 unsigned int blocksize = dir->i_sb->s_blocksize; 1701 unsigned int blocksize = dir->i_sb->s_blocksize;
1613 unsigned short reclen; 1702 unsigned short reclen;
1614 int nlen, rlen, err;
1615 char *top;
1616 int csum_size = 0; 1703 int csum_size = 0;
1704 int err;
1617 1705
1618 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1706 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1619 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 1707 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -1621,22 +1709,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1621 1709
1622 reclen = EXT4_DIR_REC_LEN(namelen); 1710 reclen = EXT4_DIR_REC_LEN(namelen);
1623 if (!de) { 1711 if (!de) {
1624 de = (struct ext4_dir_entry_2 *)bh->b_data; 1712 err = ext4_find_dest_de(dir, inode,
1625 top = bh->b_data + (blocksize - csum_size) - reclen; 1713 bh, bh->b_data, blocksize - csum_size,
1626 while ((char *) de <= top) { 1714 name, namelen, &de);
1627 if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) 1715 if (err)
1628 return -EIO; 1716 return err;
1629 if (ext4_match(namelen, name, de))
1630 return -EEXIST;
1631 nlen = EXT4_DIR_REC_LEN(de->name_len);
1632 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1633 if ((de->inode? rlen - nlen: rlen) >= reclen)
1634 break;
1635 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1636 offset += rlen;
1637 }
1638 if ((char *) de > top)
1639 return -ENOSPC;
1640 } 1717 }
1641 BUFFER_TRACE(bh, "get_write_access"); 1718 BUFFER_TRACE(bh, "get_write_access");
1642 err = ext4_journal_get_write_access(handle, bh); 1719 err = ext4_journal_get_write_access(handle, bh);
@@ -1646,19 +1723,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1646 } 1723 }
1647 1724
1648 /* By now the buffer is marked for journaling */ 1725 /* By now the buffer is marked for journaling */
1649 nlen = EXT4_DIR_REC_LEN(de->name_len); 1726 ext4_insert_dentry(inode, de, blocksize, name, namelen);
1650 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1727
1651 if (de->inode) {
1652 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1653 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
1654 de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
1655 de = de1;
1656 }
1657 de->file_type = EXT4_FT_UNKNOWN;
1658 de->inode = cpu_to_le32(inode->i_ino);
1659 ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1660 de->name_len = namelen;
1661 memcpy(de->name, name, namelen);
1662 /* 1728 /*
1663 * XXX shouldn't update any times until successful 1729 * XXX shouldn't update any times until successful
1664 * completion of syscall, but too many callers depend 1730 * completion of syscall, but too many callers depend
@@ -1831,6 +1897,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1831 blocksize = sb->s_blocksize; 1897 blocksize = sb->s_blocksize;
1832 if (!dentry->d_name.len) 1898 if (!dentry->d_name.len)
1833 return -EINVAL; 1899 return -EINVAL;
1900
1901 if (ext4_has_inline_data(dir)) {
1902 retval = ext4_try_add_inline_entry(handle, dentry, inode);
1903 if (retval < 0)
1904 return retval;
1905 if (retval == 1) {
1906 retval = 0;
1907 return retval;
1908 }
1909 }
1910
1834 if (is_dx(dir)) { 1911 if (is_dx(dir)) {
1835 retval = ext4_dx_add_entry(handle, dentry, inode); 1912 retval = ext4_dx_add_entry(handle, dentry, inode);
1836 if (!retval || (retval != ERR_BAD_DX_DIR)) 1913 if (!retval || (retval != ERR_BAD_DX_DIR))
@@ -2036,36 +2113,29 @@ cleanup:
2036} 2113}
2037 2114
2038/* 2115/*
2039 * ext4_delete_entry deletes a directory entry by merging it with the 2116 * ext4_generic_delete_entry deletes a directory entry by merging it
2040 * previous entry 2117 * with the previous entry
2041 */ 2118 */
2042static int ext4_delete_entry(handle_t *handle, 2119int ext4_generic_delete_entry(handle_t *handle,
2043 struct inode *dir, 2120 struct inode *dir,
2044 struct ext4_dir_entry_2 *de_del, 2121 struct ext4_dir_entry_2 *de_del,
2045 struct buffer_head *bh) 2122 struct buffer_head *bh,
2123 void *entry_buf,
2124 int buf_size,
2125 int csum_size)
2046{ 2126{
2047 struct ext4_dir_entry_2 *de, *pde; 2127 struct ext4_dir_entry_2 *de, *pde;
2048 unsigned int blocksize = dir->i_sb->s_blocksize; 2128 unsigned int blocksize = dir->i_sb->s_blocksize;
2049 int csum_size = 0; 2129 int i;
2050 int i, err;
2051
2052 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2053 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2054 csum_size = sizeof(struct ext4_dir_entry_tail);
2055 2130
2056 i = 0; 2131 i = 0;
2057 pde = NULL; 2132 pde = NULL;
2058 de = (struct ext4_dir_entry_2 *) bh->b_data; 2133 de = (struct ext4_dir_entry_2 *)entry_buf;
2059 while (i < bh->b_size - csum_size) { 2134 while (i < buf_size - csum_size) {
2060 if (ext4_check_dir_entry(dir, NULL, de, bh, i)) 2135 if (ext4_check_dir_entry(dir, NULL, de, bh,
2136 bh->b_data, bh->b_size, i))
2061 return -EIO; 2137 return -EIO;
2062 if (de == de_del) { 2138 if (de == de_del) {
2063 BUFFER_TRACE(bh, "get_write_access");
2064 err = ext4_journal_get_write_access(handle, bh);
2065 if (unlikely(err)) {
2066 ext4_std_error(dir->i_sb, err);
2067 return err;
2068 }
2069 if (pde) 2139 if (pde)
2070 pde->rec_len = ext4_rec_len_to_disk( 2140 pde->rec_len = ext4_rec_len_to_disk(
2071 ext4_rec_len_from_disk(pde->rec_len, 2141 ext4_rec_len_from_disk(pde->rec_len,
@@ -2076,12 +2146,6 @@ static int ext4_delete_entry(handle_t *handle,
2076 else 2146 else
2077 de->inode = 0; 2147 de->inode = 0;
2078 dir->i_version++; 2148 dir->i_version++;
2079 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2080 err = ext4_handle_dirty_dirent_node(handle, dir, bh);
2081 if (unlikely(err)) {
2082 ext4_std_error(dir->i_sb, err);
2083 return err;
2084 }
2085 return 0; 2149 return 0;
2086 } 2150 }
2087 i += ext4_rec_len_from_disk(de->rec_len, blocksize); 2151 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
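
Note that ext4_generic_delete_entry() never compacts the buffer: when the victim is found, the previous record simply absorbs it by growing its rec_len (or, for the very first record, de->inode is zeroed instead). As a worked example in a 4096-byte block, ignoring the checksum tail: if pde starts at offset 24 with rec_len 40 and de_del follows at offset 64 with rec_len 4032, deletion sets pde->rec_len to 40 + 4032 = 4072, and the reclaimed span can later be split again by ext4_insert_dentry().
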
@@ -2091,6 +2155,48 @@ static int ext4_delete_entry(handle_t *handle,
2091 return -ENOENT; 2155 return -ENOENT;
2092} 2156}
2093 2157
2158static int ext4_delete_entry(handle_t *handle,
2159 struct inode *dir,
2160 struct ext4_dir_entry_2 *de_del,
2161 struct buffer_head *bh)
2162{
2163 int err, csum_size = 0;
2164
2165 if (ext4_has_inline_data(dir)) {
2166 int has_inline_data = 1;
2167 err = ext4_delete_inline_entry(handle, dir, de_del, bh,
2168 &has_inline_data);
2169 if (has_inline_data)
2170 return err;
2171 }
2172
2173 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2174 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2175 csum_size = sizeof(struct ext4_dir_entry_tail);
2176
2177 BUFFER_TRACE(bh, "get_write_access");
2178 err = ext4_journal_get_write_access(handle, bh);
2179 if (unlikely(err))
2180 goto out;
2181
2182 err = ext4_generic_delete_entry(handle, dir, de_del,
2183 bh, bh->b_data,
2184 dir->i_sb->s_blocksize, csum_size);
2185 if (err)
2186 goto out;
2187
2188 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2189 err = ext4_handle_dirty_dirent_node(handle, dir, bh);
2190 if (unlikely(err))
2191 goto out;
2192
2193 return 0;
2194out:
2195 if (err != -ENOENT)
2196 ext4_std_error(dir->i_sb, err);
2197 return err;
2198}
2199
2094/* 2200/*
2095 * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, 2201 * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
2096 * since this indicates that nlinks count was previously 1. 2202 * since this indicates that nlinks count was previously 1.
@@ -2211,21 +2317,95 @@ retry:
2211 return err; 2317 return err;
2212} 2318}
2213 2319
2214static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2320struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
2321 struct ext4_dir_entry_2 *de,
2322 int blocksize, int csum_size,
2323 unsigned int parent_ino, int dotdot_real_len)
2324{
2325 de->inode = cpu_to_le32(inode->i_ino);
2326 de->name_len = 1;
2327 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
2328 blocksize);
2329 strcpy(de->name, ".");
2330 ext4_set_de_type(inode->i_sb, de, S_IFDIR);
2331
2332 de = ext4_next_entry(de, blocksize);
2333 de->inode = cpu_to_le32(parent_ino);
2334 de->name_len = 2;
2335 if (!dotdot_real_len)
2336 de->rec_len = ext4_rec_len_to_disk(blocksize -
2337 (csum_size + EXT4_DIR_REC_LEN(1)),
2338 blocksize);
2339 else
2340 de->rec_len = ext4_rec_len_to_disk(
2341 EXT4_DIR_REC_LEN(de->name_len), blocksize);
2342 strcpy(de->name, "..");
2343 ext4_set_de_type(inode->i_sb, de, S_IFDIR);
2344
2345 return ext4_next_entry(de, blocksize);
2346}
2347
2348static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
2349 struct inode *inode)
2215{ 2350{
2216 handle_t *handle;
2217 struct inode *inode;
2218 struct buffer_head *dir_block = NULL; 2351 struct buffer_head *dir_block = NULL;
2219 struct ext4_dir_entry_2 *de; 2352 struct ext4_dir_entry_2 *de;
2220 struct ext4_dir_entry_tail *t; 2353 struct ext4_dir_entry_tail *t;
2221 unsigned int blocksize = dir->i_sb->s_blocksize; 2354 unsigned int blocksize = dir->i_sb->s_blocksize;
2222 int csum_size = 0; 2355 int csum_size = 0;
2223 int err, retries = 0; 2356 int err;
2224 2357
2225 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 2358 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2226 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 2359 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2227 csum_size = sizeof(struct ext4_dir_entry_tail); 2360 csum_size = sizeof(struct ext4_dir_entry_tail);
2228 2361
2362 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
2363 err = ext4_try_create_inline_dir(handle, dir, inode);
2364 if (err < 0 && err != -ENOSPC)
2365 goto out;
2366 if (!err)
2367 goto out;
2368 }
2369
2370 inode->i_size = EXT4_I(inode)->i_disksize = blocksize;
 2371 dir_block = ext4_bread(handle, inode, 0, 1, &err);
 2372 if (!dir_block) {
2373 if (!err) {
2374 err = -EIO;
2375 ext4_error(inode->i_sb,
2376 "Directory hole detected on inode %lu\n",
2377 inode->i_ino);
2378 }
2379 goto out;
2380 }
2381 BUFFER_TRACE(dir_block, "get_write_access");
2382 err = ext4_journal_get_write_access(handle, dir_block);
2383 if (err)
2384 goto out;
2385 de = (struct ext4_dir_entry_2 *)dir_block->b_data;
2386 ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
2387 set_nlink(inode, 2);
2388 if (csum_size) {
2389 t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
2390 initialize_dirent_tail(t, blocksize);
2391 }
2392
2393 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
2394 err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
2395 if (err)
2396 goto out;
2397 set_buffer_verified(dir_block);
2398out:
2399 brelse(dir_block);
2400 return err;
2401}
2402
2403static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2404{
2405 handle_t *handle;
2406 struct inode *inode;
2407 int err, retries = 0;
2408
2229 if (EXT4_DIR_LINK_MAX(dir)) 2409 if (EXT4_DIR_LINK_MAX(dir))
2230 return -EMLINK; 2410 return -EMLINK;
2231 2411
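
ext4_init_dot_dotdot() above fixes the shape of a fresh directory's first block. For a 4096-byte block with metadata_csum enabled (so csum_size = 12, and EXT4_DIR_REC_LEN() rounds 8 + name_len up to a multiple of 4, i.e. 12 bytes for both "." and ".."), the resulting layout is, as a sketch:

	offset    0: "."  inode = self,   rec_len = 12
	offset   12: ".." inode = parent, rec_len = 4096 - (12 + 12) = 4072
	offset 4084: dirent tail, a 12-byte fake entry holding the crc

With dotdot_real_len set, ".." keeps only its own 12 bytes instead of claiming the rest of the buffer, presumably for callers that pack further entries right after it, such as the inline-dir setup.
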
@@ -2249,47 +2429,9 @@ retry:
2249 2429
2250 inode->i_op = &ext4_dir_inode_operations; 2430 inode->i_op = &ext4_dir_inode_operations;
2251 inode->i_fop = &ext4_dir_operations; 2431 inode->i_fop = &ext4_dir_operations;
2252 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 2432 err = ext4_init_new_dir(handle, dir, inode);
2253 if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
2254 if (!err) {
2255 err = -EIO;
2256 ext4_error(inode->i_sb,
2257 "Directory hole detected on inode %lu\n",
2258 inode->i_ino);
2259 }
2260 goto out_clear_inode;
2261 }
2262 BUFFER_TRACE(dir_block, "get_write_access");
2263 err = ext4_journal_get_write_access(handle, dir_block);
2264 if (err)
2265 goto out_clear_inode;
2266 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
2267 de->inode = cpu_to_le32(inode->i_ino);
2268 de->name_len = 1;
2269 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
2270 blocksize);
2271 strcpy(de->name, ".");
2272 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
2273 de = ext4_next_entry(de, blocksize);
2274 de->inode = cpu_to_le32(dir->i_ino);
2275 de->rec_len = ext4_rec_len_to_disk(blocksize -
2276 (csum_size + EXT4_DIR_REC_LEN(1)),
2277 blocksize);
2278 de->name_len = 2;
2279 strcpy(de->name, "..");
2280 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
2281 set_nlink(inode, 2);
2282
2283 if (csum_size) {
2284 t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
2285 initialize_dirent_tail(t, blocksize);
2286 }
2287
2288 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
2289 err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
2290 if (err) 2433 if (err)
2291 goto out_clear_inode; 2434 goto out_clear_inode;
2292 set_buffer_verified(dir_block);
2293 err = ext4_mark_inode_dirty(handle, inode); 2435 err = ext4_mark_inode_dirty(handle, inode);
2294 if (!err) 2436 if (!err)
2295 err = ext4_add_entry(handle, dentry, inode); 2437 err = ext4_add_entry(handle, dentry, inode);
@@ -2309,7 +2451,6 @@ out_clear_inode:
2309 unlock_new_inode(inode); 2451 unlock_new_inode(inode);
2310 d_instantiate(dentry, inode); 2452 d_instantiate(dentry, inode);
2311out_stop: 2453out_stop:
2312 brelse(dir_block);
2313 ext4_journal_stop(handle); 2454 ext4_journal_stop(handle);
2314 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2455 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2315 goto retry; 2456 goto retry;
@@ -2327,6 +2468,14 @@ static int empty_dir(struct inode *inode)
2327 struct super_block *sb; 2468 struct super_block *sb;
2328 int err = 0; 2469 int err = 0;
2329 2470
2471 if (ext4_has_inline_data(inode)) {
2472 int has_inline_data = 1;
2473
2474 err = empty_inline_dir(inode, &has_inline_data);
2475 if (has_inline_data)
2476 return err;
2477 }
2478
2330 sb = inode->i_sb; 2479 sb = inode->i_sb;
2331 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 2480 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
2332 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 2481 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
@@ -2393,7 +2542,8 @@ static int empty_dir(struct inode *inode)
2393 set_buffer_verified(bh); 2542 set_buffer_verified(bh);
2394 de = (struct ext4_dir_entry_2 *) bh->b_data; 2543 de = (struct ext4_dir_entry_2 *) bh->b_data;
2395 } 2544 }
2396 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { 2545 if (ext4_check_dir_entry(inode, NULL, de, bh,
2546 bh->b_data, bh->b_size, offset)) {
2397 de = (struct ext4_dir_entry_2 *)(bh->b_data + 2547 de = (struct ext4_dir_entry_2 *)(bh->b_data +
2398 sb->s_blocksize); 2548 sb->s_blocksize);
2399 offset = (offset | (sb->s_blocksize - 1)) + 1; 2549 offset = (offset | (sb->s_blocksize - 1)) + 1;
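
Two threads run through this hunk and the series as a whole. First, ext4_check_dir_entry() now takes the containing buffer and its size explicitly (here bh->b_data/bh->b_size; for inline directories, the inline area) instead of assuming the entry lives in a block-sized bh. Second, the has_inline_data out-parameter pattern from empty_dir() above recurs at most call sites; its contract, as inferred from the callers:

	if (ext4_has_inline_data(inode)) {
		int has_inline_data = 1;

		err = empty_inline_dir(inode, &has_inline_data);
		/* the helper clears the flag if the directory turned
		 * out not to be inline after all; only then does the
		 * caller fall through to the block-based scan */
		if (has_inline_data)
			return err;
	}
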
@@ -2579,7 +2729,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2579 return PTR_ERR(handle); 2729 return PTR_ERR(handle);
2580 2730
2581 retval = -ENOENT; 2731 retval = -ENOENT;
2582 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2732 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2583 if (!bh) 2733 if (!bh)
2584 goto end_rmdir; 2734 goto end_rmdir;
2585 2735
@@ -2644,7 +2794,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2644 ext4_handle_sync(handle); 2794 ext4_handle_sync(handle);
2645 2795
2646 retval = -ENOENT; 2796 retval = -ENOENT;
2647 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2797 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2648 if (!bh) 2798 if (!bh)
2649 goto end_unlink; 2799 goto end_unlink;
2650 2800
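
ext4_find_entry() grows a fourth parameter reporting whether the match came from the inline area. Callers that only need the dirent, like ext4_rmdir() and ext4_unlink() here, pass NULL; ext4_rename() below passes &new_inlined so it can skip the block-oriented checksum/dirty path afterwards. The two call shapes, as used in this patch:

	/* caller does not care where the entry lives */
	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);

	/* caller needs to pick the right dirty/csum path later */
	int new_inlined = 0;
	bh = ext4_find_entry(new_dir, &new_dentry->d_name,
			     &new_de, &new_inlined);
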
@@ -2826,8 +2976,39 @@ retry:
2826 return err; 2976 return err;
2827} 2977}
2828 2978
2829#define PARENT_INO(buffer, size) \ 2979
2830 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) 2980/*
 2981 * Try to find the buffer head which contains the parent block.
 2982 * It is the inode block if the directory is stored inline, or the
 2983 * 1st block if it is a normal dir.
2984 */
2985static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
2986 struct inode *inode,
2987 int *retval,
2988 struct ext4_dir_entry_2 **parent_de,
2989 int *inlined)
2990{
2991 struct buffer_head *bh;
2992
2993 if (!ext4_has_inline_data(inode)) {
2994 if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) {
2995 if (!*retval) {
2996 *retval = -EIO;
2997 ext4_error(inode->i_sb,
2998 "Directory hole detected on inode %lu\n",
2999 inode->i_ino);
3000 }
3001 return NULL;
3002 }
3003 *parent_de = ext4_next_entry(
3004 (struct ext4_dir_entry_2 *)bh->b_data,
3005 inode->i_sb->s_blocksize);
3006 return bh;
3007 }
3008
3009 *inlined = 1;
3010 return ext4_get_first_inline_block(inode, parent_de, retval);
3011}
2831 3012
2832/* 3013/*
2833 * Anybody can rename anything with this: the permission checks are left to the 3014 * Anybody can rename anything with this: the permission checks are left to the
@@ -2841,6 +3022,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2841 struct buffer_head *old_bh, *new_bh, *dir_bh; 3022 struct buffer_head *old_bh, *new_bh, *dir_bh;
2842 struct ext4_dir_entry_2 *old_de, *new_de; 3023 struct ext4_dir_entry_2 *old_de, *new_de;
2843 int retval, force_da_alloc = 0; 3024 int retval, force_da_alloc = 0;
3025 int inlined = 0, new_inlined = 0;
3026 struct ext4_dir_entry_2 *parent_de;
2844 3027
2845 dquot_initialize(old_dir); 3028 dquot_initialize(old_dir);
2846 dquot_initialize(new_dir); 3029 dquot_initialize(new_dir);
@@ -2860,7 +3043,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2860 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 3043 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2861 ext4_handle_sync(handle); 3044 ext4_handle_sync(handle);
2862 3045
2863 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); 3046 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
2864 /* 3047 /*
2865 * Check for inode number is _not_ due to possible IO errors. 3048 * Check for inode number is _not_ due to possible IO errors.
2866 * We might rmdir the source, keep it as pwd of some process 3049 * We might rmdir the source, keep it as pwd of some process
@@ -2873,7 +3056,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2873 goto end_rename; 3056 goto end_rename;
2874 3057
2875 new_inode = new_dentry->d_inode; 3058 new_inode = new_dentry->d_inode;
2876 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); 3059 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name,
3060 &new_de, &new_inlined);
2877 if (new_bh) { 3061 if (new_bh) {
2878 if (!new_inode) { 3062 if (!new_inode) {
2879 brelse(new_bh); 3063 brelse(new_bh);
@@ -2887,22 +3071,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2887 goto end_rename; 3071 goto end_rename;
2888 } 3072 }
2889 retval = -EIO; 3073 retval = -EIO;
2890 if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { 3074 dir_bh = ext4_get_first_dir_block(handle, old_inode,
2891 if (!retval) { 3075 &retval, &parent_de,
2892 retval = -EIO; 3076 &inlined);
2893 ext4_error(old_inode->i_sb, 3077 if (!dir_bh)
2894 "Directory hole detected on inode %lu\n",
2895 old_inode->i_ino);
2896 }
2897 goto end_rename; 3078 goto end_rename;
2898 } 3079 if (!inlined && !buffer_verified(dir_bh) &&
2899 if (!buffer_verified(dir_bh) &&
2900 !ext4_dirent_csum_verify(old_inode, 3080 !ext4_dirent_csum_verify(old_inode,
2901 (struct ext4_dir_entry *)dir_bh->b_data)) 3081 (struct ext4_dir_entry *)dir_bh->b_data))
2902 goto end_rename; 3082 goto end_rename;
2903 set_buffer_verified(dir_bh); 3083 set_buffer_verified(dir_bh);
2904 if (le32_to_cpu(PARENT_INO(dir_bh->b_data, 3084 if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
2905 old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2906 goto end_rename; 3085 goto end_rename;
2907 retval = -EMLINK; 3086 retval = -EMLINK;
2908 if (!new_inode && new_dir != old_dir && 3087 if (!new_inode && new_dir != old_dir &&
@@ -2931,10 +3110,13 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2931 ext4_current_time(new_dir); 3110 ext4_current_time(new_dir);
2932 ext4_mark_inode_dirty(handle, new_dir); 3111 ext4_mark_inode_dirty(handle, new_dir);
2933 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 3112 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2934 retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); 3113 if (!new_inlined) {
2935 if (unlikely(retval)) { 3114 retval = ext4_handle_dirty_dirent_node(handle,
2936 ext4_std_error(new_dir->i_sb, retval); 3115 new_dir, new_bh);
2937 goto end_rename; 3116 if (unlikely(retval)) {
3117 ext4_std_error(new_dir->i_sb, retval);
3118 goto end_rename;
3119 }
2938 } 3120 }
2939 brelse(new_bh); 3121 brelse(new_bh);
2940 new_bh = NULL; 3122 new_bh = NULL;
@@ -2962,7 +3144,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2962 struct buffer_head *old_bh2; 3144 struct buffer_head *old_bh2;
2963 struct ext4_dir_entry_2 *old_de2; 3145 struct ext4_dir_entry_2 *old_de2;
2964 3146
2965 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); 3147 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
3148 &old_de2, NULL);
2966 if (old_bh2) { 3149 if (old_bh2) {
2967 retval = ext4_delete_entry(handle, old_dir, 3150 retval = ext4_delete_entry(handle, old_dir,
2968 old_de2, old_bh2); 3151 old_de2, old_bh2);
@@ -2982,17 +3165,19 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2982 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); 3165 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2983 ext4_update_dx_flag(old_dir); 3166 ext4_update_dx_flag(old_dir);
2984 if (dir_bh) { 3167 if (dir_bh) {
2985 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 3168 parent_de->inode = cpu_to_le32(new_dir->i_ino);
2986 cpu_to_le32(new_dir->i_ino);
2987 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 3169 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2988 if (is_dx(old_inode)) { 3170 if (!inlined) {
2989 retval = ext4_handle_dirty_dx_node(handle, 3171 if (is_dx(old_inode)) {
2990 old_inode, 3172 retval = ext4_handle_dirty_dx_node(handle,
2991 dir_bh); 3173 old_inode,
3174 dir_bh);
3175 } else {
3176 retval = ext4_handle_dirty_dirent_node(handle,
3177 old_inode, dir_bh);
3178 }
2992 } else { 3179 } else {
2993 retval = ext4_handle_dirty_dirent_node(handle, 3180 retval = ext4_mark_inode_dirty(handle, old_inode);
2994 old_inode,
2995 dir_bh);
2996 } 3181 }
2997 if (retval) { 3182 if (retval) {
2998 ext4_std_error(old_dir->i_sb, retval); 3183 ext4_std_error(old_dir->i_sb, retval);
@@ -3043,23 +3228,19 @@ const struct inode_operations ext4_dir_inode_operations = {
3043 .mknod = ext4_mknod, 3228 .mknod = ext4_mknod,
3044 .rename = ext4_rename, 3229 .rename = ext4_rename,
3045 .setattr = ext4_setattr, 3230 .setattr = ext4_setattr,
3046#ifdef CONFIG_EXT4_FS_XATTR
3047 .setxattr = generic_setxattr, 3231 .setxattr = generic_setxattr,
3048 .getxattr = generic_getxattr, 3232 .getxattr = generic_getxattr,
3049 .listxattr = ext4_listxattr, 3233 .listxattr = ext4_listxattr,
3050 .removexattr = generic_removexattr, 3234 .removexattr = generic_removexattr,
3051#endif
3052 .get_acl = ext4_get_acl, 3235 .get_acl = ext4_get_acl,
3053 .fiemap = ext4_fiemap, 3236 .fiemap = ext4_fiemap,
3054}; 3237};
3055 3238
3056const struct inode_operations ext4_special_inode_operations = { 3239const struct inode_operations ext4_special_inode_operations = {
3057 .setattr = ext4_setattr, 3240 .setattr = ext4_setattr,
3058#ifdef CONFIG_EXT4_FS_XATTR
3059 .setxattr = generic_setxattr, 3241 .setxattr = generic_setxattr,
3060 .getxattr = generic_getxattr, 3242 .getxattr = generic_getxattr,
3061 .listxattr = ext4_listxattr, 3243 .listxattr = ext4_listxattr,
3062 .removexattr = generic_removexattr, 3244 .removexattr = generic_removexattr,
3063#endif
3064 .get_acl = ext4_get_acl, 3245 .get_acl = ext4_get_acl,
3065}; 3246};
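
Taken together, the rename hunks swap the PARENT_INO() pointer arithmetic for a parent_de pointer handed back by ext4_get_first_dir_block(), which works whether ".." lives in block 0 or in the inode body. A condensed sketch of the resulting flow for the "moving a directory" case (error handling and the htree ext4_handle_dirty_dx_node() branch elided):

	struct ext4_dir_entry_2 *parent_de;
	int inlined = 0;

	dir_bh = ext4_get_first_dir_block(handle, old_inode, &retval,
					  &parent_de, &inlined);
	/* ... checks ... */
	parent_de->inode = cpu_to_le32(new_dir->i_ino);	/* repoint ".." */
	if (inlined)
		retval = ext4_mark_inode_dirty(handle, old_inode);
	else
		retval = ext4_handle_dirty_dirent_node(handle, old_inode,
						       dir_bh);
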
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 68e896e12a67..0016fbca2a40 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -27,7 +27,6 @@
27#include "ext4_jbd2.h" 27#include "ext4_jbd2.h"
28#include "xattr.h" 28#include "xattr.h"
29#include "acl.h" 29#include "acl.h"
30#include "ext4_extents.h"
31 30
32static struct kmem_cache *io_page_cachep, *io_end_cachep; 31static struct kmem_cache *io_page_cachep, *io_end_cachep;
33 32
@@ -111,7 +110,7 @@ static int ext4_end_io(ext4_io_end_t *io)
111 inode_dio_done(inode); 110 inode_dio_done(inode);
112 /* Wake up anyone waiting on unwritten extent conversion */ 111 /* Wake up anyone waiting on unwritten extent conversion */
113 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 112 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
114 wake_up_all(ext4_ioend_wq(io->inode)); 113 wake_up_all(ext4_ioend_wq(inode));
115 return ret; 114 return ret;
116} 115}
117 116
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 47bf06a2765d..d99387b89edd 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -783,7 +783,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
783 783
784 err = ext4_journal_get_write_access(handle, gdb_bh); 784 err = ext4_journal_get_write_access(handle, gdb_bh);
785 if (unlikely(err)) 785 if (unlikely(err))
786 goto exit_sbh; 786 goto exit_dind;
787 787
788 err = ext4_journal_get_write_access(handle, dind); 788 err = ext4_journal_get_write_access(handle, dind);
789 if (unlikely(err)) 789 if (unlikely(err))
@@ -792,7 +792,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
792 /* ext4_reserve_inode_write() gets a reference on the iloc */ 792 /* ext4_reserve_inode_write() gets a reference on the iloc */
793 err = ext4_reserve_inode_write(handle, inode, &iloc); 793 err = ext4_reserve_inode_write(handle, inode, &iloc);
794 if (unlikely(err)) 794 if (unlikely(err))
795 goto exit_dindj; 795 goto exit_dind;
796 796
797 n_group_desc = ext4_kvmalloc((gdb_num + 1) * 797 n_group_desc = ext4_kvmalloc((gdb_num + 1) *
798 sizeof(struct buffer_head *), 798 sizeof(struct buffer_head *),
@@ -846,12 +846,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
846 846
847exit_inode: 847exit_inode:
848 ext4_kvfree(n_group_desc); 848 ext4_kvfree(n_group_desc);
849 /* ext4_handle_release_buffer(handle, iloc.bh); */
850 brelse(iloc.bh); 849 brelse(iloc.bh);
851exit_dindj:
852 /* ext4_handle_release_buffer(handle, dind); */
853exit_sbh:
854 /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
855exit_dind: 850exit_dind:
856 brelse(dind); 851 brelse(dind);
857exit_bh: 852exit_bh:
@@ -969,14 +964,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
969 } 964 }
970 965
971 for (i = 0; i < reserved_gdb; i++) { 966 for (i = 0; i < reserved_gdb; i++) {
972 if ((err = ext4_journal_get_write_access(handle, primary[i]))) { 967 if ((err = ext4_journal_get_write_access(handle, primary[i])))
973 /*
974 int j;
975 for (j = 0; j < i; j++)
976 ext4_handle_release_buffer(handle, primary[j]);
977 */
978 goto exit_bh; 968 goto exit_bh;
979 }
980 } 969 }
981 970
982 if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) 971 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 80928f716850..3cdb0a2fc648 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,7 +45,7 @@
45#include <linux/freezer.h> 45#include <linux/freezer.h>
46 46
47#include "ext4.h" 47#include "ext4.h"
48#include "ext4_extents.h" 48#include "ext4_extents.h" /* Needed for trace points definition */
49#include "ext4_jbd2.h" 49#include "ext4_jbd2.h"
50#include "xattr.h" 50#include "xattr.h"
51#include "acl.h" 51#include "acl.h"
@@ -939,10 +939,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
939 return NULL; 939 return NULL;
940 940
941 ei->vfs_inode.i_version = 1; 941 ei->vfs_inode.i_version = 1;
942 ei->vfs_inode.i_data.writeback_index = 0;
943 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 942 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
944 INIT_LIST_HEAD(&ei->i_prealloc_list); 943 INIT_LIST_HEAD(&ei->i_prealloc_list);
945 spin_lock_init(&ei->i_prealloc_lock); 944 spin_lock_init(&ei->i_prealloc_lock);
945 ext4_es_init_tree(&ei->i_es_tree);
946 rwlock_init(&ei->i_es_lock);
946 ei->i_reserved_data_blocks = 0; 947 ei->i_reserved_data_blocks = 0;
947 ei->i_reserved_meta_blocks = 0; 948 ei->i_reserved_meta_blocks = 0;
948 ei->i_allocated_meta_blocks = 0; 949 ei->i_allocated_meta_blocks = 0;
@@ -996,9 +997,7 @@ static void init_once(void *foo)
996 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; 997 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
997 998
998 INIT_LIST_HEAD(&ei->i_orphan); 999 INIT_LIST_HEAD(&ei->i_orphan);
999#ifdef CONFIG_EXT4_FS_XATTR
1000 init_rwsem(&ei->xattr_sem); 1000 init_rwsem(&ei->xattr_sem);
1001#endif
1002 init_rwsem(&ei->i_data_sem); 1001 init_rwsem(&ei->i_data_sem);
1003 inode_init_once(&ei->vfs_inode); 1002 inode_init_once(&ei->vfs_inode);
1004} 1003}
@@ -1031,6 +1030,7 @@ void ext4_clear_inode(struct inode *inode)
1031 clear_inode(inode); 1030 clear_inode(inode);
1032 dquot_drop(inode); 1031 dquot_drop(inode);
1033 ext4_discard_preallocations(inode); 1032 ext4_discard_preallocations(inode);
1033 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
1034 if (EXT4_I(inode)->jinode) { 1034 if (EXT4_I(inode)->jinode) {
1035 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), 1035 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
1036 EXT4_I(inode)->jinode); 1036 EXT4_I(inode)->jinode);
@@ -1447,13 +1447,8 @@ static const struct mount_opts {
1447 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, 1447 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
1448 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, 1448 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
1449 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, 1449 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
1450#ifdef CONFIG_EXT4_FS_XATTR
1451 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, 1450 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1452 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, 1451 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1453#else
1454 {Opt_user_xattr, 0, MOPT_NOSUPPORT},
1455 {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
1456#endif
1457#ifdef CONFIG_EXT4_FS_POSIX_ACL 1452#ifdef CONFIG_EXT4_FS_POSIX_ACL
1458 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, 1453 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1459 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, 1454 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
@@ -3202,7 +3197,6 @@ int ext4_calculate_overhead(struct super_block *sb)
3202 ext4_fsblk_t overhead = 0; 3197 ext4_fsblk_t overhead = 0;
3203 char *buf = (char *) get_zeroed_page(GFP_KERNEL); 3198 char *buf = (char *) get_zeroed_page(GFP_KERNEL);
3204 3199
3205 memset(buf, 0, PAGE_SIZE);
3206 if (!buf) 3200 if (!buf)
3207 return -ENOMEM; 3201 return -ENOMEM;
3208 3202
@@ -3256,7 +3250,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3256 unsigned int i; 3250 unsigned int i;
3257 int needs_recovery, has_huge_files, has_bigalloc; 3251 int needs_recovery, has_huge_files, has_bigalloc;
3258 __u64 blocks_count; 3252 __u64 blocks_count;
3259 int err; 3253 int err = 0;
3260 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3254 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3261 ext4_group_t first_not_zeroed; 3255 ext4_group_t first_not_zeroed;
3262 3256
@@ -3272,9 +3266,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3272 } 3266 }
3273 sb->s_fs_info = sbi; 3267 sb->s_fs_info = sbi;
3274 sbi->s_sb = sb; 3268 sbi->s_sb = sb;
3275 sbi->s_mount_opt = 0;
3276 sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
3277 sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
3278 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 3269 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
3279 sbi->s_sb_block = sb_block; 3270 sbi->s_sb_block = sb_block;
3280 if (sb->s_bdev->bd_part) 3271 if (sb->s_bdev->bd_part)
@@ -3285,6 +3276,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3285 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3276 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
3286 *cp = '!'; 3277 *cp = '!';
3287 3278
3279 /* -EINVAL is default */
3288 ret = -EINVAL; 3280 ret = -EINVAL;
3289 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 3281 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
3290 if (!blocksize) { 3282 if (!blocksize) {
@@ -3369,9 +3361,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3369 if (def_mount_opts & EXT4_DEFM_UID16) 3361 if (def_mount_opts & EXT4_DEFM_UID16)
3370 set_opt(sb, NO_UID32); 3362 set_opt(sb, NO_UID32);
3371 /* xattr user namespace & acls are now defaulted on */ 3363 /* xattr user namespace & acls are now defaulted on */
3372#ifdef CONFIG_EXT4_FS_XATTR
3373 set_opt(sb, XATTR_USER); 3364 set_opt(sb, XATTR_USER);
3374#endif
3375#ifdef CONFIG_EXT4_FS_POSIX_ACL 3365#ifdef CONFIG_EXT4_FS_POSIX_ACL
3376 set_opt(sb, POSIX_ACL); 3366 set_opt(sb, POSIX_ACL);
3377#endif 3367#endif
@@ -3662,7 +3652,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3662 " too large to mount safely on this system"); 3652 " too large to mount safely on this system");
3663 if (sizeof(sector_t) < 8) 3653 if (sizeof(sector_t) < 8)
3664 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3654 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
3665 ret = err;
3666 goto failed_mount; 3655 goto failed_mount;
3667 } 3656 }
3668 3657
@@ -3770,7 +3759,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3770 } 3759 }
3771 if (err) { 3760 if (err) {
3772 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3761 ext4_msg(sb, KERN_ERR, "insufficient memory");
3773 ret = err;
3774 goto failed_mount3; 3762 goto failed_mount3;
3775 } 3763 }
3776 3764
@@ -3801,7 +3789,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3801 3789
3802 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3790 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3803 mutex_init(&sbi->s_orphan_lock); 3791 mutex_init(&sbi->s_orphan_lock);
3804 sbi->s_resize_flags = 0;
3805 3792
3806 sb->s_root = NULL; 3793 sb->s_root = NULL;
3807 3794
@@ -3897,8 +3884,8 @@ no_journal:
3897 if (es->s_overhead_clusters) 3884 if (es->s_overhead_clusters)
3898 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); 3885 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
3899 else { 3886 else {
3900 ret = ext4_calculate_overhead(sb); 3887 err = ext4_calculate_overhead(sb);
3901 if (ret) 3888 if (err)
3902 goto failed_mount_wq; 3889 goto failed_mount_wq;
3903 } 3890 }
3904 3891
@@ -3910,6 +3897,7 @@ no_journal:
3910 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 3897 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3911 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3898 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3912 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3899 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3900 ret = -ENOMEM;
3913 goto failed_mount_wq; 3901 goto failed_mount_wq;
3914 } 3902 }
3915 3903
@@ -4012,12 +4000,20 @@ no_journal:
4012 /* Enable quota usage during mount. */ 4000 /* Enable quota usage during mount. */
4013 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && 4001 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
4014 !(sb->s_flags & MS_RDONLY)) { 4002 !(sb->s_flags & MS_RDONLY)) {
4015 ret = ext4_enable_quotas(sb); 4003 err = ext4_enable_quotas(sb);
4016 if (ret) 4004 if (err)
4017 goto failed_mount7; 4005 goto failed_mount7;
4018 } 4006 }
4019#endif /* CONFIG_QUOTA */ 4007#endif /* CONFIG_QUOTA */
4020 4008
4009 if (test_opt(sb, DISCARD)) {
4010 struct request_queue *q = bdev_get_queue(sb->s_bdev);
4011 if (!blk_queue_discard(q))
4012 ext4_msg(sb, KERN_WARNING,
4013 "mounting with \"discard\" option, but "
4014 "the device does not support discard");
4015 }
4016
4021 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " 4017 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
4022 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, 4018 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
4023 *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 4019 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
@@ -4084,7 +4080,7 @@ out_fail:
4084 kfree(sbi); 4080 kfree(sbi);
4085out_free_orig: 4081out_free_orig:
4086 kfree(orig_data); 4082 kfree(orig_data);
4087 return ret; 4083 return err ? err : ret;
4088} 4084}
4089 4085
4090/* 4086/*
@@ -4790,7 +4786,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4790 4786
4791 buf->f_type = EXT4_SUPER_MAGIC; 4787 buf->f_type = EXT4_SUPER_MAGIC;
4792 buf->f_bsize = sb->s_blocksize; 4788 buf->f_bsize = sb->s_blocksize;
4793 buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead); 4789 buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
4794 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - 4790 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
4795 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); 4791 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
4796 /* prevent underflow in case that few free space is available */ 4792 /* prevent underflow in case that few free space is available */
@@ -5282,6 +5278,7 @@ static int __init ext4_init_fs(void)
5282 ext4_li_info = NULL; 5278 ext4_li_info = NULL;
5283 mutex_init(&ext4_li_mtx); 5279 mutex_init(&ext4_li_mtx);
5284 5280
5281 /* Build-time check for flags consistency */
5285 ext4_check_flag_values(); 5282 ext4_check_flag_values();
5286 5283
5287 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { 5284 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
@@ -5289,9 +5286,14 @@ static int __init ext4_init_fs(void)
5289 init_waitqueue_head(&ext4__ioend_wq[i]); 5286 init_waitqueue_head(&ext4__ioend_wq[i]);
5290 } 5287 }
5291 5288
5292 err = ext4_init_pageio(); 5289 err = ext4_init_es();
5293 if (err) 5290 if (err)
5294 return err; 5291 return err;
5292
5293 err = ext4_init_pageio();
5294 if (err)
5295 goto out7;
5296
5295 err = ext4_init_system_zone(); 5297 err = ext4_init_system_zone();
5296 if (err) 5298 if (err)
5297 goto out6; 5299 goto out6;
@@ -5341,6 +5343,9 @@ out5:
5341 ext4_exit_system_zone(); 5343 ext4_exit_system_zone();
5342out6: 5344out6:
5343 ext4_exit_pageio(); 5345 ext4_exit_pageio();
5346out7:
5347 ext4_exit_es();
5348
5344 return err; 5349 return err;
5345} 5350}
5346 5351
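
The init hunk threads the new extents-status (es) cache into the module's existing unwind chain: ext4_init_es() now runs first and gains an out7 teardown label. Reduced to its shape (a sketch of the ordering only, not the full function):

	err = ext4_init_es();
	if (err)
		return err;
	err = ext4_init_pageio();
	if (err)
		goto out7;		/* undo ext4_init_es() */
	err = ext4_init_system_zone();
	if (err)
		goto out6;
	/* ... further steps, unwinding in reverse order ... */
out6:
	ext4_exit_pageio();
out7:
	ext4_exit_es();
	return err;
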
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ed9354aff279..ff3711932018 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -35,22 +35,18 @@ const struct inode_operations ext4_symlink_inode_operations = {
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext4_setattr, 37 .setattr = ext4_setattr,
38#ifdef CONFIG_EXT4_FS_XATTR
39 .setxattr = generic_setxattr, 38 .setxattr = generic_setxattr,
40 .getxattr = generic_getxattr, 39 .getxattr = generic_getxattr,
41 .listxattr = ext4_listxattr, 40 .listxattr = ext4_listxattr,
42 .removexattr = generic_removexattr, 41 .removexattr = generic_removexattr,
43#endif
44}; 42};
45 43
46const struct inode_operations ext4_fast_symlink_inode_operations = { 44const struct inode_operations ext4_fast_symlink_inode_operations = {
47 .readlink = generic_readlink, 45 .readlink = generic_readlink,
48 .follow_link = ext4_follow_link, 46 .follow_link = ext4_follow_link,
49 .setattr = ext4_setattr, 47 .setattr = ext4_setattr,
50#ifdef CONFIG_EXT4_FS_XATTR
51 .setxattr = generic_setxattr, 48 .setxattr = generic_setxattr,
52 .getxattr = generic_getxattr, 49 .getxattr = generic_getxattr,
53 .listxattr = ext4_listxattr, 50 .listxattr = ext4_listxattr,
54 .removexattr = generic_removexattr, 51 .removexattr = generic_removexattr,
55#endif
56}; 52};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2cdb98d62980..3a91ebc2b66f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -61,11 +61,6 @@
61#include "xattr.h" 61#include "xattr.h"
62#include "acl.h" 62#include "acl.h"
63 63
64#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
65#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
66#define BFIRST(bh) ENTRY(BHDR(bh)+1)
67#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
68
69#ifdef EXT4_XATTR_DEBUG 64#ifdef EXT4_XATTR_DEBUG
70# define ea_idebug(inode, f...) do { \ 65# define ea_idebug(inode, f...) do { \
71 printk(KERN_DEBUG "inode %s:%lu: ", \ 66 printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -312,7 +307,7 @@ cleanup:
312 return error; 307 return error;
313} 308}
314 309
315static int 310int
316ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, 311ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
317 void *buffer, size_t buffer_size) 312 void *buffer, size_t buffer_size)
318{ 313{
@@ -581,21 +576,6 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
581 return (*min_offs - ((void *)last - base) - sizeof(__u32)); 576 return (*min_offs - ((void *)last - base) - sizeof(__u32));
582} 577}
583 578
584struct ext4_xattr_info {
585 int name_index;
586 const char *name;
587 const void *value;
588 size_t value_len;
589};
590
591struct ext4_xattr_search {
592 struct ext4_xattr_entry *first;
593 void *base;
594 void *end;
595 struct ext4_xattr_entry *here;
596 int not_found;
597};
598
599static int 579static int
600ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) 580ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
601{ 581{
@@ -648,9 +628,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
648 size. Just replace. */ 628 size. Just replace. */
649 s->here->e_value_size = 629 s->here->e_value_size =
650 cpu_to_le32(i->value_len); 630 cpu_to_le32(i->value_len);
651 memset(val + size - EXT4_XATTR_PAD, 0, 631 if (i->value == EXT4_ZERO_XATTR_VALUE) {
652 EXT4_XATTR_PAD); /* Clear pad bytes. */ 632 memset(val, 0, size);
653 memcpy(val, i->value, i->value_len); 633 } else {
634 /* Clear pad bytes first. */
635 memset(val + size - EXT4_XATTR_PAD, 0,
636 EXT4_XATTR_PAD);
637 memcpy(val, i->value, i->value_len);
638 }
654 return 0; 639 return 0;
655 } 640 }
656 641
@@ -689,9 +674,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
689 size_t size = EXT4_XATTR_SIZE(i->value_len); 674 size_t size = EXT4_XATTR_SIZE(i->value_len);
690 void *val = s->base + min_offs - size; 675 void *val = s->base + min_offs - size;
691 s->here->e_value_offs = cpu_to_le16(min_offs - size); 676 s->here->e_value_offs = cpu_to_le16(min_offs - size);
692 memset(val + size - EXT4_XATTR_PAD, 0, 677 if (i->value == EXT4_ZERO_XATTR_VALUE) {
693 EXT4_XATTR_PAD); /* Clear the pad bytes. */ 678 memset(val, 0, size);
694 memcpy(val, i->value, i->value_len); 679 } else {
680 /* Clear the pad bytes first. */
681 memset(val + size - EXT4_XATTR_PAD, 0,
682 EXT4_XATTR_PAD);
683 memcpy(val, i->value, i->value_len);
684 }
695 } 685 }
696 } 686 }
697 return 0; 687 return 0;
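
EXT4_ZERO_XATTR_VALUE (defined as (void *)-1 in xattr.h below) is a sentinel, not a buffer: a caller passes it to reserve value_len bytes of xattr space, and ext4_xattr_set_entry() zero-fills the slot instead of dereferencing the pointer. A hypothetical caller, for illustration only (the name and index here are placeholders, not taken from this patch):

	struct ext4_xattr_info i = {
		.name_index = EXT4_XATTR_INDEX_SYSTEM,
		.name = "data",			/* placeholder name */
		.value = EXT4_ZERO_XATTR_VALUE,	/* reserve, don't copy */
		.value_len = len,
	};
	/* ext4_xattr_set_entry(&i, s) will memset() the value area */
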
@@ -794,7 +784,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
794 int offset = (char *)s->here - bs->bh->b_data; 784 int offset = (char *)s->here - bs->bh->b_data;
795 785
796 unlock_buffer(bs->bh); 786 unlock_buffer(bs->bh);
797 ext4_handle_release_buffer(handle, bs->bh);
798 if (ce) { 787 if (ce) {
799 mb_cache_entry_release(ce); 788 mb_cache_entry_release(ce);
800 ce = NULL; 789 ce = NULL;
@@ -950,14 +939,8 @@ bad_block:
950#undef header 939#undef header
951} 940}
952 941
953struct ext4_xattr_ibody_find { 942int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
954 struct ext4_xattr_search s; 943 struct ext4_xattr_ibody_find *is)
955 struct ext4_iloc iloc;
956};
957
958static int
959ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
960 struct ext4_xattr_ibody_find *is)
961{ 944{
962 struct ext4_xattr_ibody_header *header; 945 struct ext4_xattr_ibody_header *header;
963 struct ext4_inode *raw_inode; 946 struct ext4_inode *raw_inode;
@@ -985,10 +968,47 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
985 return 0; 968 return 0;
986} 969}
987 970
988static int 971int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
989ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, 972 struct ext4_xattr_info *i,
990 struct ext4_xattr_info *i, 973 struct ext4_xattr_ibody_find *is)
991 struct ext4_xattr_ibody_find *is) 974{
975 struct ext4_xattr_ibody_header *header;
976 struct ext4_xattr_search *s = &is->s;
977 int error;
978
979 if (EXT4_I(inode)->i_extra_isize == 0)
980 return -ENOSPC;
981 error = ext4_xattr_set_entry(i, s);
982 if (error) {
983 if (error == -ENOSPC &&
984 ext4_has_inline_data(inode)) {
985 error = ext4_try_to_evict_inline_data(handle, inode,
986 EXT4_XATTR_LEN(strlen(i->name) +
987 EXT4_XATTR_SIZE(i->value_len)));
988 if (error)
989 return error;
990 error = ext4_xattr_ibody_find(inode, i, is);
991 if (error)
992 return error;
993 error = ext4_xattr_set_entry(i, s);
994 }
995 if (error)
996 return error;
997 }
998 header = IHDR(inode, ext4_raw_inode(&is->iloc));
999 if (!IS_LAST_ENTRY(s->first)) {
1000 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
1001 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
1002 } else {
1003 header->h_magic = cpu_to_le32(0);
1004 ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
1005 }
1006 return 0;
1007}
1008
1009static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
1010 struct ext4_xattr_info *i,
1011 struct ext4_xattr_ibody_find *is)
992{ 1012{
993 struct ext4_xattr_ibody_header *header; 1013 struct ext4_xattr_ibody_header *header;
994 struct ext4_xattr_search *s = &is->s; 1014 struct ext4_xattr_search *s = &is->s;
@@ -1144,9 +1164,17 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
1144{ 1164{
1145 handle_t *handle; 1165 handle_t *handle;
1146 int error, retries = 0; 1166 int error, retries = 0;
1167 int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
1147 1168
1148retry: 1169retry:
1149 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); 1170 /*
 1171 * In case of inline data, we may push the data out to a block,
 1172 * so reserve the journal space first.
1173 */
1174 if (ext4_has_inline_data(inode))
1175 credits += ext4_writepage_trans_blocks(inode) + 1;
1176
1177 handle = ext4_journal_start(inode, credits);
1150 if (IS_ERR(handle)) { 1178 if (IS_ERR(handle)) {
1151 error = PTR_ERR(handle); 1179 error = PTR_ERR(handle);
1152 } else { 1180 } else {
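
Because setting an xattr can now evict inline data into a real block, ext4_xattr_set() budgets the journal credits for that up front rather than failing mid-transaction. The arithmetic, as the hunk reads:

	int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);

	/* a possible inline-data eviction costs a full writepage
	 * transaction plus one extra block */
	if (ext4_has_inline_data(inode))
		credits += ext4_writepage_trans_blocks(inode) + 1;

	handle = ext4_journal_start(inode, credits);
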
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 91f31ca7d9af..69eda787a96a 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -21,6 +21,7 @@
21#define EXT4_XATTR_INDEX_TRUSTED 4 21#define EXT4_XATTR_INDEX_TRUSTED 4
22#define EXT4_XATTR_INDEX_LUSTRE 5 22#define EXT4_XATTR_INDEX_LUSTRE 5
23#define EXT4_XATTR_INDEX_SECURITY 6 23#define EXT4_XATTR_INDEX_SECURITY 6
24#define EXT4_XATTR_INDEX_SYSTEM 7
24 25
25struct ext4_xattr_header { 26struct ext4_xattr_header {
26 __le32 h_magic; /* magic number for identification */ 27 __le32 h_magic; /* magic number for identification */
@@ -65,7 +66,32 @@ struct ext4_xattr_entry {
65 EXT4_I(inode)->i_extra_isize)) 66 EXT4_I(inode)->i_extra_isize))
66#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 67#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
67 68
68# ifdef CONFIG_EXT4_FS_XATTR 69#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
70#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
71#define BFIRST(bh) ENTRY(BHDR(bh)+1)
72#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
73
74#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
75
76struct ext4_xattr_info {
77 int name_index;
78 const char *name;
79 const void *value;
80 size_t value_len;
81};
82
83struct ext4_xattr_search {
84 struct ext4_xattr_entry *first;
85 void *base;
86 void *end;
87 struct ext4_xattr_entry *here;
88 int not_found;
89};
90
91struct ext4_xattr_ibody_find {
92 struct ext4_xattr_search s;
93 struct ext4_iloc iloc;
94};
69 95
70extern const struct xattr_handler ext4_xattr_user_handler; 96extern const struct xattr_handler ext4_xattr_user_handler;
71extern const struct xattr_handler ext4_xattr_trusted_handler; 97extern const struct xattr_handler ext4_xattr_trusted_handler;
@@ -90,60 +116,82 @@ extern void ext4_exit_xattr(void);
90 116
91extern const struct xattr_handler *ext4_xattr_handlers[]; 117extern const struct xattr_handler *ext4_xattr_handlers[];
92 118
93# else /* CONFIG_EXT4_FS_XATTR */ 119extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
94 120 struct ext4_xattr_ibody_find *is);
95static inline int 121extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
96ext4_xattr_get(struct inode *inode, int name_index, const char *name, 122 const char *name,
97 void *buffer, size_t size, int flags) 123 void *buffer, size_t buffer_size);
98{ 124extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
99 return -EOPNOTSUPP; 125 struct ext4_xattr_info *i,
100} 126 struct ext4_xattr_ibody_find *is);
101 127
102static inline int 128extern int ext4_has_inline_data(struct inode *inode);
103ext4_xattr_set(struct inode *inode, int name_index, const char *name, 129extern int ext4_get_inline_size(struct inode *inode);
104 const void *value, size_t size, int flags) 130extern int ext4_get_max_inline_size(struct inode *inode);
105{ 131extern int ext4_find_inline_data_nolock(struct inode *inode);
106 return -EOPNOTSUPP; 132extern void ext4_write_inline_data(struct inode *inode,
107} 133 struct ext4_iloc *iloc,
108 134 void *buffer, loff_t pos,
109static inline int 135 unsigned int len);
110ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, 136extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
111 const char *name, const void *value, size_t size, int flags) 137 unsigned int len);
112{ 138extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
113 return -EOPNOTSUPP; 139 unsigned int len);
114} 140extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
115 141
116static inline void 142extern int ext4_readpage_inline(struct inode *inode, struct page *page);
117ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) 143extern int ext4_try_to_write_inline_data(struct address_space *mapping,
118{ 144 struct inode *inode,
119} 145 loff_t pos, unsigned len,
120 146 unsigned flags,
121static inline void 147 struct page **pagep);
122ext4_xattr_put_super(struct super_block *sb) 148extern int ext4_write_inline_data_end(struct inode *inode,
123{ 149 loff_t pos, unsigned len,
124} 150 unsigned copied,
125 151 struct page *page);
126static __init inline int 152extern struct buffer_head *
127ext4_init_xattr(void) 153ext4_journalled_write_inline_data(struct inode *inode,
128{ 154 unsigned len,
129 return 0; 155 struct page *page);
130} 156extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
131 157 struct inode *inode,
132static inline void 158 loff_t pos, unsigned len,
133ext4_exit_xattr(void) 159 unsigned flags,
134{ 160 struct page **pagep,
135} 161 void **fsdata);
136 162extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
137static inline int 163 unsigned len, unsigned copied,
138ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 164 struct page *page);
139 struct ext4_inode *raw_inode, handle_t *handle) 165extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
140{ 166 struct inode *inode);
141 return -EOPNOTSUPP; 167extern int ext4_try_create_inline_dir(handle_t *handle,
142} 168 struct inode *parent,
143 169 struct inode *inode);
144#define ext4_xattr_handlers NULL 170extern int ext4_read_inline_dir(struct file *filp,
145 171 void *dirent, filldir_t filldir,
146# endif /* CONFIG_EXT4_FS_XATTR */ 172 int *has_inline_data);
173extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
174 const struct qstr *d_name,
175 struct ext4_dir_entry_2 **res_dir,
176 int *has_inline_data);
177extern int ext4_delete_inline_entry(handle_t *handle,
178 struct inode *dir,
179 struct ext4_dir_entry_2 *de_del,
180 struct buffer_head *bh,
181 int *has_inline_data);
182extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
183extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
184 struct ext4_dir_entry_2 **parent_de,
185 int *retval);
186extern int ext4_inline_data_fiemap(struct inode *inode,
187 struct fiemap_extent_info *fieinfo,
188 int *has_inline);
189extern int ext4_try_to_evict_inline_data(handle_t *handle,
190 struct inode *inode,
191 int needed);
192extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
193
194extern int ext4_convert_inline_data(struct inode *inode);
147 195
148#ifdef CONFIG_EXT4_FS_SECURITY 196#ifdef CONFIG_EXT4_FS_SECURITY
149extern int ext4_init_security(handle_t *handle, struct inode *inode, 197extern int ext4_init_security(handle_t *handle, struct inode *inode,
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 623f36f0423b..12701a567752 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -29,6 +29,7 @@ struct fat_mount_options {
29 unsigned short fs_fmask; 29 unsigned short fs_fmask;
30 unsigned short fs_dmask; 30 unsigned short fs_dmask;
31 unsigned short codepage; /* Codepage for shortname conversions */ 31 unsigned short codepage; /* Codepage for shortname conversions */
32 int time_offset; /* Offset of timestamps from UTC (in minutes) */
32 char *iocharset; /* Charset used for filename input/display */ 33 char *iocharset; /* Charset used for filename input/display */
33 unsigned short shortname; /* flags for shortname display/create rule */ 34 unsigned short shortname; /* flags for shortname display/create rule */
34 unsigned char name_check; /* r = relaxed, n = normal, s = strict */ 35 unsigned char name_check; /* r = relaxed, n = normal, s = strict */
@@ -45,7 +46,7 @@ struct fat_mount_options {
45 flush:1, /* write things quickly */ 46 flush:1, /* write things quickly */
46 nocase:1, /* Does this need case conversion? 0=need case conversion*/ 47 nocase:1, /* Does this need case conversion? 0=need case conversion*/
47 usefree:1, /* Use free_clusters for FAT32 */ 48 usefree:1, /* Use free_clusters for FAT32 */
48 tz_utc:1, /* Filesystem timestamps are in UTC */ 49 tz_set:1, /* Filesystem timestamps' offset set */
49 rodir:1, /* allow ATTR_RO for directory */ 50 rodir:1, /* allow ATTR_RO for directory */
50 discard:1, /* Issue discard requests on deletions */ 51 discard:1, /* Issue discard requests on deletions */
51 nfs:1; /* Do extra work needed for NFS export */ 52 nfs:1; /* Do extra work needed for NFS export */
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5bafaad00530..35806813ea4e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -26,6 +26,7 @@
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/log2.h> 27#include <linux/log2.h>
28#include <linux/hash.h> 28#include <linux/hash.h>
29#include <linux/blkdev.h>
29#include <asm/unaligned.h> 30#include <asm/unaligned.h>
30#include "fat.h" 31#include "fat.h"
31 32
@@ -725,7 +726,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
725 if (opts->allow_utime) 726 if (opts->allow_utime)
726 seq_printf(m, ",allow_utime=%04o", opts->allow_utime); 727 seq_printf(m, ",allow_utime=%04o", opts->allow_utime);
727 if (sbi->nls_disk) 728 if (sbi->nls_disk)
728 seq_printf(m, ",codepage=%s", sbi->nls_disk->charset); 729 /* strip "cp" prefix from displayed option */
730 seq_printf(m, ",codepage=%s", &sbi->nls_disk->charset[2]);
729 if (isvfat) { 731 if (isvfat) {
730 if (sbi->nls_io) 732 if (sbi->nls_io)
731 seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); 733 seq_printf(m, ",iocharset=%s", sbi->nls_io->charset);
@@ -777,8 +779,12 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
777 } 779 }
778 if (opts->flush) 780 if (opts->flush)
779 seq_puts(m, ",flush"); 781 seq_puts(m, ",flush");
780 if (opts->tz_utc) 782 if (opts->tz_set) {
781 seq_puts(m, ",tz=UTC"); 783 if (opts->time_offset)
784 seq_printf(m, ",time_offset=%d", opts->time_offset);
785 else
786 seq_puts(m, ",tz=UTC");
787 }
782 if (opts->errors == FAT_ERRORS_CONT) 788 if (opts->errors == FAT_ERRORS_CONT)
783 seq_puts(m, ",errors=continue"); 789 seq_puts(m, ",errors=continue");
784 else if (opts->errors == FAT_ERRORS_PANIC) 790 else if (opts->errors == FAT_ERRORS_PANIC)
@@ -800,7 +806,8 @@ enum {
800 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, 806 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
801 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, 807 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
802 Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, 808 Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
803 Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_err, 809 Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
810 Opt_err,
804}; 811};
805 812
806static const match_table_t fat_tokens = { 813static const match_table_t fat_tokens = {
@@ -825,6 +832,7 @@ static const match_table_t fat_tokens = {
825 {Opt_immutable, "sys_immutable"}, 832 {Opt_immutable, "sys_immutable"},
826 {Opt_flush, "flush"}, 833 {Opt_flush, "flush"},
827 {Opt_tz_utc, "tz=UTC"}, 834 {Opt_tz_utc, "tz=UTC"},
835 {Opt_time_offset, "time_offset=%d"},
828 {Opt_err_cont, "errors=continue"}, 836 {Opt_err_cont, "errors=continue"},
829 {Opt_err_panic, "errors=panic"}, 837 {Opt_err_panic, "errors=panic"},
830 {Opt_err_ro, "errors=remount-ro"}, 838 {Opt_err_ro, "errors=remount-ro"},
@@ -909,7 +917,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
909 opts->utf8 = opts->unicode_xlate = 0; 917 opts->utf8 = opts->unicode_xlate = 0;
910 opts->numtail = 1; 918 opts->numtail = 1;
911 opts->usefree = opts->nocase = 0; 919 opts->usefree = opts->nocase = 0;
912 opts->tz_utc = 0; 920 opts->tz_set = 0;
913 opts->nfs = 0; 921 opts->nfs = 0;
914 opts->errors = FAT_ERRORS_RO; 922 opts->errors = FAT_ERRORS_RO;
915 *debug = 0; 923 *debug = 0;
@@ -965,48 +973,57 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
965 break; 973 break;
966 case Opt_uid: 974 case Opt_uid:
967 if (match_int(&args[0], &option)) 975 if (match_int(&args[0], &option))
968 return 0; 976 return -EINVAL;
969 opts->fs_uid = make_kuid(current_user_ns(), option); 977 opts->fs_uid = make_kuid(current_user_ns(), option);
970 if (!uid_valid(opts->fs_uid)) 978 if (!uid_valid(opts->fs_uid))
971 return 0; 979 return -EINVAL;
972 break; 980 break;
973 case Opt_gid: 981 case Opt_gid:
974 if (match_int(&args[0], &option)) 982 if (match_int(&args[0], &option))
975 return 0; 983 return -EINVAL;
976 opts->fs_gid = make_kgid(current_user_ns(), option); 984 opts->fs_gid = make_kgid(current_user_ns(), option);
977 if (!gid_valid(opts->fs_gid)) 985 if (!gid_valid(opts->fs_gid))
978 return 0; 986 return -EINVAL;
979 break; 987 break;
980 case Opt_umask: 988 case Opt_umask:
981 if (match_octal(&args[0], &option)) 989 if (match_octal(&args[0], &option))
982 return 0; 990 return -EINVAL;
983 opts->fs_fmask = opts->fs_dmask = option; 991 opts->fs_fmask = opts->fs_dmask = option;
984 break; 992 break;
985 case Opt_dmask: 993 case Opt_dmask:
986 if (match_octal(&args[0], &option)) 994 if (match_octal(&args[0], &option))
987 return 0; 995 return -EINVAL;
988 opts->fs_dmask = option; 996 opts->fs_dmask = option;
989 break; 997 break;
990 case Opt_fmask: 998 case Opt_fmask:
991 if (match_octal(&args[0], &option)) 999 if (match_octal(&args[0], &option))
992 return 0; 1000 return -EINVAL;
993 opts->fs_fmask = option; 1001 opts->fs_fmask = option;
994 break; 1002 break;
995 case Opt_allow_utime: 1003 case Opt_allow_utime:
996 if (match_octal(&args[0], &option)) 1004 if (match_octal(&args[0], &option))
997 return 0; 1005 return -EINVAL;
998 opts->allow_utime = option & (S_IWGRP | S_IWOTH); 1006 opts->allow_utime = option & (S_IWGRP | S_IWOTH);
999 break; 1007 break;
1000 case Opt_codepage: 1008 case Opt_codepage:
1001 if (match_int(&args[0], &option)) 1009 if (match_int(&args[0], &option))
1002 return 0; 1010 return -EINVAL;
1003 opts->codepage = option; 1011 opts->codepage = option;
1004 break; 1012 break;
1005 case Opt_flush: 1013 case Opt_flush:
1006 opts->flush = 1; 1014 opts->flush = 1;
1007 break; 1015 break;
1016 case Opt_time_offset:
1017 if (match_int(&args[0], &option))
1018 return -EINVAL;
1019 if (option < -12 * 60 || option > 12 * 60)
1020 return -EINVAL;
1021 opts->tz_set = 1;
1022 opts->time_offset = option;
1023 break;
1008 case Opt_tz_utc: 1024 case Opt_tz_utc:
1009 opts->tz_utc = 1; 1025 opts->tz_set = 1;
1026 opts->time_offset = 0;
1010 break; 1027 break;
1011 case Opt_err_cont: 1028 case Opt_err_cont:
1012 opts->errors = FAT_ERRORS_CONT; 1029 opts->errors = FAT_ERRORS_CONT;
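
Two things to note in the hunk above: parse_options() now reports a malformed or out-of-range value as -EINVAL rather than the bare 0 it returned before, and the new time_offset option is validated against a window of +/- 12 hours expressed in minutes. A minimal userspace sketch of the same range check; check_time_offset() is a hypothetical stand-in for the kernel's match_int() plus validation:

    /* Sketch: mirrors the time_offset range check above. */
    #include <stdio.h>
    #include <stdlib.h>

    static int check_time_offset(int minutes)
    {
        /* The patch accepts offsets within +/- 12 hours, in minutes. */
        if (minutes < -12 * 60 || minutes > 12 * 60)
            return -1; /* the kernel returns -EINVAL here */
        return 0;
    }

    int main(int argc, char **argv)
    {
        int off = argc > 1 ? atoi(argv[1]) : 0;
        printf("time_offset=%d -> %s\n", off,
               check_time_offset(off) ? "rejected" : "accepted");
        return 0;
    }
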
@@ -1431,6 +1448,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1431 goto out_fail; 1448 goto out_fail;
1432 } 1449 }
1433 1450
1451 if (sbi->options.discard) {
1452 struct request_queue *q = bdev_get_queue(sb->s_bdev);
1453 if (!blk_queue_discard(q))
1454 fat_msg(sb, KERN_WARNING,
1455 "mounting with \"discard\" option, but "
1456 "the device does not support discard");
1457 }
1458
1434 return 0; 1459 return 0;
1435 1460
1436out_invalid: 1461out_invalid:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 6d93360ca0cc..5eb600dc43a9 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -212,8 +212,10 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
212 + days_in_year[month] + day 212 + days_in_year[month] + day
213 + DAYS_DELTA) * SECS_PER_DAY; 213 + DAYS_DELTA) * SECS_PER_DAY;
214 214
215 if (!sbi->options.tz_utc) 215 if (!sbi->options.tz_set)
216 second += sys_tz.tz_minuteswest * SECS_PER_MIN; 216 second += sys_tz.tz_minuteswest * SECS_PER_MIN;
217 else
218 second -= sbi->options.time_offset * SECS_PER_MIN;
217 219
218 if (time_cs) { 220 if (time_cs) {
219 ts->tv_sec = second + (time_cs / 100); 221 ts->tv_sec = second + (time_cs / 100);
@@ -229,8 +231,9 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
229 __le16 *time, __le16 *date, u8 *time_cs) 231 __le16 *time, __le16 *date, u8 *time_cs)
230{ 232{
231 struct tm tm; 233 struct tm tm;
232 time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 : 234 time_to_tm(ts->tv_sec,
233 -sys_tz.tz_minuteswest * 60, &tm); 235 (sbi->options.tz_set ? sbi->options.time_offset :
236 -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm);
234 237
235 /* FAT can only support year between 1980 to 2107 */ 238 /* FAT can only support year between 1980 to 2107 */
236 if (tm.tm_year < 1980 - 1900) { 239 if (tm.tm_year < 1980 - 1900) {
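
The conversion above gives time_offset its meaning: when tz_set is true, on-disk FAT timestamps are treated as local time at UTC plus time_offset minutes, so fat_time_fat2unix() subtracts time_offset * SECS_PER_MIN to reach UTC, and fat_time_unix2fat() passes the same offset (in seconds east of UTC) to time_to_tm(). The old tz=UTC option survives as the time_offset=0 special case. A runnable sketch of the forward direction, assuming time_offset counts minutes east of UTC:

    /* Sketch, not kernel code: local FAT time -> Unix time. */
    #include <stdio.h>

    #define SECS_PER_MIN 60

    static long long fat_local_to_unix(long long fat_seconds, int time_offset)
    {
        /* Mirrors fat_time_fat2unix() with tz_set: local -> UTC. */
        return fat_seconds - (long long)time_offset * SECS_PER_MIN;
    }

    int main(void)
    {
        /* A timestamp recorded on a volume used at UTC+2 (time_offset=120): */
        printf("unix time: %lld\n", fat_local_to_unix(1000000000LL, 120));
        return 0;
    }

So a mount such as "mount -o time_offset=120 ..." would interpret on-disk timestamps as UTC+2 regardless of sys_tz.
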
diff --git a/fs/fhandle.c b/fs/fhandle.c
index f775bfdd6e4a..cccdc874bb55 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -22,7 +22,7 @@ static long do_sys_name_to_handle(struct path *path,
22 struct file_handle *handle = NULL; 22 struct file_handle *handle = NULL;
23 23
24 /* 24 /*
25 * We need t make sure wether the file system 25 * We need to make sure whether the file system
26 * supports decoding of the file handle 26 * supports decoding of the file handle
27 */ 27 */
28 if (!path->dentry->d_sb->s_export_op || 28 if (!path->dentry->d_sb->s_export_op ||
@@ -40,7 +40,7 @@ static long do_sys_name_to_handle(struct path *path,
40 if (!handle) 40 if (!handle)
41 return -ENOMEM; 41 return -ENOMEM;
42 42
43 /* convert handle size to multiple of sizeof(u32) */ 43 /* convert handle size to multiple of sizeof(u32) */
44 handle_dwords = f_handle.handle_bytes >> 2; 44 handle_dwords = f_handle.handle_bytes >> 2;
45 45
46 /* we ask for a non connected handle */ 46 /* we ask for a non connected handle */
diff --git a/fs/file.c b/fs/file.c
index 708d997a7748..15cb8618e95d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -519,12 +519,6 @@ struct files_struct init_files = {
519 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), 519 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
520}; 520};
521 521
522void daemonize_descriptors(void)
523{
524 atomic_inc(&init_files.count);
525 reset_files_struct(&init_files);
526}
527
528/* 522/*
529 * allocate a file descriptor, mark it busy. 523 * allocate a file descriptor, mark it busy.
530 */ 524 */
@@ -685,7 +679,6 @@ void do_close_on_exec(struct files_struct *files)
685 struct fdtable *fdt; 679 struct fdtable *fdt;
686 680
687 /* exec unshares first */ 681 /* exec unshares first */
688 BUG_ON(atomic_read(&files->count) != 1);
689 spin_lock(&files->file_lock); 682 spin_lock(&files->file_lock);
690 for (i = 0; ; i++) { 683 for (i = 0; ; i++) {
691 unsigned long set; 684 unsigned long set;
@@ -995,16 +988,18 @@ int iterate_fd(struct files_struct *files, unsigned n,
995 const void *p) 988 const void *p)
996{ 989{
997 struct fdtable *fdt; 990 struct fdtable *fdt;
998 struct file *file;
999 int res = 0; 991 int res = 0;
1000 if (!files) 992 if (!files)
1001 return 0; 993 return 0;
1002 spin_lock(&files->file_lock); 994 spin_lock(&files->file_lock);
1003 fdt = files_fdtable(files); 995 for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
1004 while (!res && n < fdt->max_fds) { 996 struct file *file;
1005 file = rcu_dereference_check_fdtable(files, fdt->fd[n++]); 997 file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
1006 if (file) 998 if (!file)
1007 res = f(p, file, n); 999 continue;
1000 res = f(p, file, n);
1001 if (res)
1002 break;
1008 } 1003 }
1009 spin_unlock(&files->file_lock); 1004 spin_unlock(&files->file_lock);
1010 return res; 1005 return res;
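
Besides dropping the function-wide file pointer, the rewritten loop fixes a subtle interface wart: the callback now receives the descriptor number of the file it is handed (previously n had already been incremented), and the walk stops on the first nonzero return instead of re-testing res at the top of the loop. A hedged sketch of a caller; count_fd() and count_open_files() are hypothetical:

    /* Sketch of an iterate_fd() user. */
    #include <linux/fdtable.h>

    static int count_fd(const void *p, struct file *file, unsigned fd)
    {
        unsigned *count = (unsigned *)p; /* owned by the caller */
        (*count)++;
        return 0; /* a nonzero return would stop the walk */
    }

    static unsigned count_open_files(struct files_struct *files)
    {
        unsigned count = 0;
        iterate_fd(files, 0, count_fd, &count);
        return count;
    }
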
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 51ea267d444c..310972b72a66 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -228,6 +228,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
228static void inode_sync_complete(struct inode *inode) 228static void inode_sync_complete(struct inode *inode)
229{ 229{
230 inode->i_state &= ~I_SYNC; 230 inode->i_state &= ~I_SYNC;
231 /* If inode is clean and unused, put it into LRU now... */
232 inode_add_lru(inode);
231 /* Waiters must see I_SYNC cleared before being woken up */ 233 /* Waiters must see I_SYNC cleared before being woken up */
232 smp_mb(); 234 smp_mb();
233 wake_up_bit(&inode->i_state, __I_SYNC); 235 wake_up_bit(&inode->i_state, __I_SYNC);
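
The new inode_add_lru() call returns a clean, unused inode to the LRU as soon as writeback finishes, instead of leaving it off-list until the next iput(). Note how it slots in before the existing smp_mb(), which still orders the ~I_SYNC store ahead of the wake-up so a waiter cannot wake and observe the bit still set. The pairing, in sketch form (inode_wait as the bit-wait action callback is an assumption here):

    /* Waker side (as above): */
    inode->i_state &= ~I_SYNC;
    smp_mb();                               /* clear visible before wake */
    wake_up_bit(&inode->i_state, __I_SYNC);

    /* Waiter side (elsewhere in fs-writeback.c): */
    wait_on_bit(&inode->i_state, __I_SYNC, inode_wait, TASK_UNINTERRUPTIBLE);
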
@@ -1032,7 +1034,7 @@ int bdi_writeback_thread(void *data)
1032 while (!kthread_freezable_should_stop(NULL)) { 1034 while (!kthread_freezable_should_stop(NULL)) {
1033 /* 1035 /*
1034 * Remove own delayed wake-up timer, since we are already awake 1036 * Remove own delayed wake-up timer, since we are already awake
1035 * and we'll take care of the preriodic write-back. 1037 * and we'll take care of the periodic write-back.
1036 */ 1038 */
1037 del_timer(&wb->wakeup_timer); 1039 del_timer(&wb->wakeup_timer);
1038 1040
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 5df4775fea03..fe6ca583bbc0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -164,27 +164,3 @@ struct fs_struct init_fs = {
164 .seq = SEQCNT_ZERO, 164 .seq = SEQCNT_ZERO,
165 .umask = 0022, 165 .umask = 0022,
166}; 166};
167
168void daemonize_fs_struct(void)
169{
170 struct fs_struct *fs = current->fs;
171
172 if (fs) {
173 int kill;
174
175 task_lock(current);
176
177 spin_lock(&init_fs.lock);
178 init_fs.users++;
179 spin_unlock(&init_fs.lock);
180
181 spin_lock(&fs->lock);
182 current->fs = &init_fs;
183 kill = !--fs->users;
184 spin_unlock(&fs->lock);
185
186 task_unlock(current);
187 if (kill)
188 free_fs_struct(fs);
189 }
190}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8c23fa7a91e6..c16335315e5d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -92,8 +92,8 @@ static void __fuse_put_request(struct fuse_req *req)
92 92
93static void fuse_req_init_context(struct fuse_req *req) 93static void fuse_req_init_context(struct fuse_req *req)
94{ 94{
95 req->in.h.uid = current_fsuid(); 95 req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
96 req->in.h.gid = current_fsgid(); 96 req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
97 req->in.h.pid = current->pid; 97 req->in.h.pid = current->pid;
98} 98}
99 99
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 324bc0850534..b7c09f9eb40c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -818,8 +818,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
818 stat->ino = attr->ino; 818 stat->ino = attr->ino;
819 stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 819 stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
820 stat->nlink = attr->nlink; 820 stat->nlink = attr->nlink;
821 stat->uid = attr->uid; 821 stat->uid = make_kuid(&init_user_ns, attr->uid);
822 stat->gid = attr->gid; 822 stat->gid = make_kgid(&init_user_ns, attr->gid);
823 stat->rdev = inode->i_rdev; 823 stat->rdev = inode->i_rdev;
824 stat->atime.tv_sec = attr->atime; 824 stat->atime.tv_sec = attr->atime;
825 stat->atime.tv_nsec = attr->atimensec; 825 stat->atime.tv_nsec = attr->atimensec;
@@ -1007,12 +1007,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
1007 rcu_read_lock(); 1007 rcu_read_lock();
1008 ret = 0; 1008 ret = 0;
1009 cred = __task_cred(task); 1009 cred = __task_cred(task);
1010 if (cred->euid == fc->user_id && 1010 if (uid_eq(cred->euid, fc->user_id) &&
1011 cred->suid == fc->user_id && 1011 uid_eq(cred->suid, fc->user_id) &&
1012 cred->uid == fc->user_id && 1012 uid_eq(cred->uid, fc->user_id) &&
1013 cred->egid == fc->group_id && 1013 gid_eq(cred->egid, fc->group_id) &&
1014 cred->sgid == fc->group_id && 1014 gid_eq(cred->sgid, fc->group_id) &&
1015 cred->gid == fc->group_id) 1015 gid_eq(cred->gid, fc->group_id))
1016 ret = 1; 1016 ret = 1;
1017 rcu_read_unlock(); 1017 rcu_read_unlock();
1018 1018
@@ -1306,9 +1306,9 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
1306 if (ivalid & ATTR_MODE) 1306 if (ivalid & ATTR_MODE)
1307 arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; 1307 arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
1308 if (ivalid & ATTR_UID) 1308 if (ivalid & ATTR_UID)
1309 arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; 1309 arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
1310 if (ivalid & ATTR_GID) 1310 if (ivalid & ATTR_GID)
1311 arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; 1311 arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
1312 if (ivalid & ATTR_SIZE) 1312 if (ivalid & ATTR_SIZE)
1313 arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; 1313 arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
1314 if (ivalid & ATTR_ATIME) { 1314 if (ivalid & ATTR_ATIME) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 78d2837bc940..e21d4d8f87e3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1599,19 +1599,19 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1599 return err ? 0 : outarg.block; 1599 return err ? 0 : outarg.block;
1600} 1600}
1601 1601
1602static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) 1602static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
1603{ 1603{
1604 loff_t retval; 1604 loff_t retval;
1605 struct inode *inode = file->f_path.dentry->d_inode; 1605 struct inode *inode = file->f_path.dentry->d_inode;
1606 1606
1607 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ 1607 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
1608 if (origin == SEEK_CUR || origin == SEEK_SET) 1608 if (whence == SEEK_CUR || whence == SEEK_SET)
1609 return generic_file_llseek(file, offset, origin); 1609 return generic_file_llseek(file, offset, whence);
1610 1610
1611 mutex_lock(&inode->i_mutex); 1611 mutex_lock(&inode->i_mutex);
1612 retval = fuse_update_attributes(inode, NULL, file, NULL); 1612 retval = fuse_update_attributes(inode, NULL, file, NULL);
1613 if (!retval) 1613 if (!retval)
1614 retval = generic_file_llseek(file, offset, origin); 1614 retval = generic_file_llseek(file, offset, whence);
1615 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1616 1616
1617 return retval; 1617 return retval;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e24dd74e3068..e105a53fc72d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -333,10 +333,10 @@ struct fuse_conn {
333 atomic_t count; 333 atomic_t count;
334 334
335 /** The user id for this mount */ 335 /** The user id for this mount */
336 uid_t user_id; 336 kuid_t user_id;
337 337
338 /** The group id for this mount */ 338 /** The group id for this mount */
339 gid_t group_id; 339 kgid_t group_id;
340 340
341 /** The fuse mount flags for this mount */ 341 /** The fuse mount flags for this mount */
342 unsigned flags; 342 unsigned flags;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f0eda124cffb..73ca6b72beaf 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -60,8 +60,8 @@ MODULE_PARM_DESC(max_user_congthresh,
60struct fuse_mount_data { 60struct fuse_mount_data {
61 int fd; 61 int fd;
62 unsigned rootmode; 62 unsigned rootmode;
63 unsigned user_id; 63 kuid_t user_id;
64 unsigned group_id; 64 kgid_t group_id;
65 unsigned fd_present:1; 65 unsigned fd_present:1;
66 unsigned rootmode_present:1; 66 unsigned rootmode_present:1;
67 unsigned user_id_present:1; 67 unsigned user_id_present:1;
@@ -164,8 +164,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
164 inode->i_ino = fuse_squash_ino(attr->ino); 164 inode->i_ino = fuse_squash_ino(attr->ino);
165 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 165 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
166 set_nlink(inode, attr->nlink); 166 set_nlink(inode, attr->nlink);
167 inode->i_uid = attr->uid; 167 inode->i_uid = make_kuid(&init_user_ns, attr->uid);
168 inode->i_gid = attr->gid; 168 inode->i_gid = make_kgid(&init_user_ns, attr->gid);
169 inode->i_blocks = attr->blocks; 169 inode->i_blocks = attr->blocks;
170 inode->i_atime.tv_sec = attr->atime; 170 inode->i_atime.tv_sec = attr->atime;
171 inode->i_atime.tv_nsec = attr->atimensec; 171 inode->i_atime.tv_nsec = attr->atimensec;
@@ -492,14 +492,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
492 case OPT_USER_ID: 492 case OPT_USER_ID:
493 if (match_int(&args[0], &value)) 493 if (match_int(&args[0], &value))
494 return 0; 494 return 0;
495 d->user_id = value; 495 d->user_id = make_kuid(current_user_ns(), value);
496 if (!uid_valid(d->user_id))
497 return 0;
496 d->user_id_present = 1; 498 d->user_id_present = 1;
497 break; 499 break;
498 500
499 case OPT_GROUP_ID: 501 case OPT_GROUP_ID:
500 if (match_int(&args[0], &value)) 502 if (match_int(&args[0], &value))
501 return 0; 503 return 0;
502 d->group_id = value; 504 d->group_id = make_kgid(current_user_ns(), value);
505 if (!gid_valid(d->group_id))
506 return 0;
503 d->group_id_present = 1; 507 d->group_id_present = 1;
504 break; 508 break;
505 509
@@ -540,8 +544,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
540 struct super_block *sb = root->d_sb; 544 struct super_block *sb = root->d_sb;
541 struct fuse_conn *fc = get_fuse_conn_super(sb); 545 struct fuse_conn *fc = get_fuse_conn_super(sb);
542 546
543 seq_printf(m, ",user_id=%u", fc->user_id); 547 seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
544 seq_printf(m, ",group_id=%u", fc->group_id); 548 seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
545 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) 549 if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
546 seq_puts(m, ",default_permissions"); 550 seq_puts(m, ",default_permissions");
547 if (fc->flags & FUSE_ALLOW_OTHER) 551 if (fc->flags & FUSE_ALLOW_OTHER)
@@ -989,7 +993,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
989 if (!file) 993 if (!file)
990 goto err; 994 goto err;
991 995
992 if (file->f_op != &fuse_dev_operations) 996 if ((file->f_op != &fuse_dev_operations) ||
997 (file->f_cred->user_ns != &init_user_ns))
993 goto err_fput; 998 goto err_fput;
994 999
995 fc = kmalloc(sizeof(*fc), GFP_KERNEL); 1000 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 01c4975da4bc..30de4f2a2ea9 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -643,7 +643,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
643 goto out_unlock; 643 goto out_unlock;
644 644
645 requested = data_blocks + ind_blocks; 645 requested = data_blocks + ind_blocks;
646 error = gfs2_inplace_reserve(ip, requested); 646 error = gfs2_inplace_reserve(ip, requested, 0);
647 if (error) 647 if (error)
648 goto out_qunlock; 648 goto out_qunlock;
649 } 649 }
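
The trailing 0 here reflects a signature change that runs through the rest of this series: gfs2_inplace_reserve() gains a third allocation-flags argument, with existing callers passing 0 for the default block-placement policy and the inode-creation path below passing GFS2_AF_ORLOV to spread new top-level directories across resource groups. The prototype implied by the call sites (an assumption; the rgrp.h hunk is not shown here):

    /* Assumed from the callers in this diff: */
    extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested,
                                    u32 aflags);
    /* aflags: 0 = default placement, GFS2_AF_ORLOV = Orlov-style spreading. */
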
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1fd3ae237bdd..a68e91bcef3d 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -991,6 +991,41 @@ unlock:
991 return err; 991 return err;
992} 992}
993 993
994/**
995 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
996 * @inode: The inode being truncated
997 * @oldsize: The original (larger) size
998 * @newsize: The new smaller size
999 *
1000 * With jdata files, we have to journal a revoke for each block which is
1001 * truncated. As a result, we need to split this into separate transactions
1002 * if the number of pages being truncated gets too large.
1003 */
1004
1005#define GFS2_JTRUNC_REVOKES 8192
1006
1007static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1008{
1009 struct gfs2_sbd *sdp = GFS2_SB(inode);
1010 u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1011 u64 chunk;
1012 int error;
1013
1014 while (oldsize != newsize) {
1015 chunk = oldsize - newsize;
1016 if (chunk > max_chunk)
1017 chunk = max_chunk;
1018 truncate_pagecache(inode, oldsize, oldsize - chunk);
1019 oldsize -= chunk;
1020 gfs2_trans_end(sdp);
1021 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1022 if (error)
1023 return error;
1024 }
1025
1026 return 0;
1027}
1028
994static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) 1029static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
995{ 1030{
996 struct gfs2_inode *ip = GFS2_I(inode); 1031 struct gfs2_inode *ip = GFS2_I(inode);
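
Worked numbers for the chunking above: with the common 4 KiB block size, max_chunk = 8192 revokes * 4096 bytes = 32 MiB, so truncating 1 GiB of jdata proceeds as 32 bounded transactions rather than one transaction with an unbounded revoke count. A quick check (plain userspace sketch):

    #include <stdio.h>

    int main(void)
    {
        unsigned long long revokes = 8192; /* GFS2_JTRUNC_REVOKES */
        unsigned long long bsize = 4096;   /* assumed block size */
        unsigned long long max_chunk = revokes * bsize;
        printf("max_chunk = %llu bytes (%llu MiB)\n",
               max_chunk, max_chunk >> 20);
        return 0;
    }
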
@@ -1000,8 +1035,10 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1000 int journaled = gfs2_is_jdata(ip); 1035 int journaled = gfs2_is_jdata(ip);
1001 int error; 1036 int error;
1002 1037
1003 error = gfs2_trans_begin(sdp, 1038 if (journaled)
1004 RES_DINODE + (journaled ? RES_JDATA : 0), 0); 1039 error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1040 else
1041 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1005 if (error) 1042 if (error)
1006 return error; 1043 return error;
1007 1044
@@ -1026,7 +1063,16 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1026 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1063 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1027 gfs2_dinode_out(ip, dibh->b_data); 1064 gfs2_dinode_out(ip, dibh->b_data);
1028 1065
1029 truncate_pagecache(inode, oldsize, newsize); 1066 if (journaled)
1067 error = gfs2_journaled_truncate(inode, oldsize, newsize);
1068 else
1069 truncate_pagecache(inode, oldsize, newsize);
1070
1071 if (error) {
1072 brelse(dibh);
1073 return error;
1074 }
1075
1030out_brelse: 1076out_brelse:
1031 brelse(dibh); 1077 brelse(dibh);
1032out: 1078out:
@@ -1178,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size)
1178 if (error) 1224 if (error)
1179 return error; 1225 return error;
1180 1226
1181 error = gfs2_inplace_reserve(ip, 1); 1227 error = gfs2_inplace_reserve(ip, 1, 0);
1182 if (error) 1228 if (error)
1183 goto do_grow_qunlock; 1229 goto do_grow_qunlock;
1184 unstuff = 1; 1230 unstuff = 1;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 259b088cfc4c..9a35670fdc38 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1676,16 +1676,11 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1676 be16_add_cpu(&leaf->lf_entries, 1); 1676 be16_add_cpu(&leaf->lf_entries, 1);
1677 } 1677 }
1678 brelse(bh); 1678 brelse(bh);
1679 error = gfs2_meta_inode_buffer(ip, &bh);
1680 if (error)
1681 break;
1682 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1683 ip->i_entries++; 1679 ip->i_entries++;
1684 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1680 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1685 if (S_ISDIR(nip->i_inode.i_mode)) 1681 if (S_ISDIR(nip->i_inode.i_mode))
1686 inc_nlink(&ip->i_inode); 1682 inc_nlink(&ip->i_inode);
1687 gfs2_dinode_out(ip, bh->b_data); 1683 mark_inode_dirty(inode);
1688 brelse(bh);
1689 error = 0; 1684 error = 0;
1690 break; 1685 break;
1691 } 1686 }
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 0def0504afc1..991ab2d484dd 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -44,7 +44,7 @@
44 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
45 * @file: the file 45 * @file: the file
46 * @offset: the offset 46 * @offset: the offset
47 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) 47 * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
48 * 48 *
49 * SEEK_END requires the glock for the file because it references the 49 * SEEK_END requires the glock for the file because it references the
50 * file's size. 50 * file's size.
@@ -52,26 +52,26 @@
52 * Returns: The new offset, or errno 52 * Returns: The new offset, or errno
53 */ 53 */
54 54
55static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) 55static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
56{ 56{
57 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 57 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
58 struct gfs2_holder i_gh; 58 struct gfs2_holder i_gh;
59 loff_t error; 59 loff_t error;
60 60
61 switch (origin) { 61 switch (whence) {
62 case SEEK_END: /* These reference inode->i_size */ 62 case SEEK_END: /* These reference inode->i_size */
63 case SEEK_DATA: 63 case SEEK_DATA:
64 case SEEK_HOLE: 64 case SEEK_HOLE:
65 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 65 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
66 &i_gh); 66 &i_gh);
67 if (!error) { 67 if (!error) {
68 error = generic_file_llseek(file, offset, origin); 68 error = generic_file_llseek(file, offset, whence);
69 gfs2_glock_dq_uninit(&i_gh); 69 gfs2_glock_dq_uninit(&i_gh);
70 } 70 }
71 break; 71 break;
72 case SEEK_CUR: 72 case SEEK_CUR:
73 case SEEK_SET: 73 case SEEK_SET:
74 error = generic_file_llseek(file, offset, origin); 74 error = generic_file_llseek(file, offset, whence);
75 break; 75 break;
76 default: 76 default:
77 error = -EINVAL; 77 error = -EINVAL;
@@ -432,7 +432,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
432 if (ret) 432 if (ret)
433 goto out_unlock; 433 goto out_unlock;
434 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); 434 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
435 ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); 435 ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
436 if (ret) 436 if (ret)
437 goto out_quota_unlock; 437 goto out_quota_unlock;
438 438
@@ -516,15 +516,13 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
516 struct gfs2_holder i_gh; 516 struct gfs2_holder i_gh;
517 int error; 517 int error;
518 518
519 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 519 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
520 error = gfs2_glock_nq(&i_gh); 520 &i_gh);
521 if (error == 0) {
522 file_accessed(file);
523 gfs2_glock_dq(&i_gh);
524 }
525 gfs2_holder_uninit(&i_gh);
526 if (error) 521 if (error)
527 return error; 522 return error;
523 /* grab lock to update inode */
524 gfs2_glock_dq_uninit(&i_gh);
525 file_accessed(file);
528 } 526 }
529 vma->vm_ops = &gfs2_vm_ops; 527 vma->vm_ops = &gfs2_vm_ops;
530 528
@@ -677,10 +675,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
677 size_t writesize = iov_length(iov, nr_segs); 675 size_t writesize = iov_length(iov, nr_segs);
678 struct dentry *dentry = file->f_dentry; 676 struct dentry *dentry = file->f_dentry;
679 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 677 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
680 struct gfs2_sbd *sdp;
681 int ret; 678 int ret;
682 679
683 sdp = GFS2_SB(file->f_mapping->host);
684 ret = gfs2_rs_alloc(ip); 680 ret = gfs2_rs_alloc(ip);
685 if (ret) 681 if (ret)
686 return ret; 682 return ret;
@@ -829,7 +825,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
829retry: 825retry:
830 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); 826 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
831 827
832 error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); 828 error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
833 if (error) { 829 if (error) {
834 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { 830 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
835 bytes >>= 1; 831 bytes >>= 1;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e6c2fd53cab2..992c5c0cb504 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -55,8 +55,6 @@ struct gfs2_glock_iter {
55 55
56typedef void (*glock_examiner) (struct gfs2_glock * gl); 56typedef void (*glock_examiner) (struct gfs2_glock * gl);
57 57
58static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 58static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
61 59
62static struct dentry *gfs2_root; 60static struct dentry *gfs2_root;
@@ -107,10 +105,12 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
107{ 105{
108 struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); 106 struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
109 107
110 if (gl->gl_ops->go_flags & GLOF_ASPACE) 108 if (gl->gl_ops->go_flags & GLOF_ASPACE) {
111 kmem_cache_free(gfs2_glock_aspace_cachep, gl); 109 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
112 else 110 } else {
111 kfree(gl->gl_lksb.sb_lvbptr);
113 kmem_cache_free(gfs2_glock_cachep, gl); 112 kmem_cache_free(gfs2_glock_cachep, gl);
113 }
114} 114}
115 115
116void gfs2_glock_free(struct gfs2_glock *gl) 116void gfs2_glock_free(struct gfs2_glock *gl)
@@ -537,8 +537,8 @@ __acquires(&gl->gl_spin)
537 (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) 537 (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
538 clear_bit(GLF_BLOCKING, &gl->gl_flags); 538 clear_bit(GLF_BLOCKING, &gl->gl_flags);
539 spin_unlock(&gl->gl_spin); 539 spin_unlock(&gl->gl_spin);
540 if (glops->go_xmote_th) 540 if (glops->go_sync)
541 glops->go_xmote_th(gl); 541 glops->go_sync(gl);
542 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) 542 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
543 glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); 543 glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
544 clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); 544 clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
@@ -547,7 +547,10 @@ __acquires(&gl->gl_spin)
547 if (sdp->sd_lockstruct.ls_ops->lm_lock) { 547 if (sdp->sd_lockstruct.ls_ops->lm_lock) {
548 /* lock_dlm */ 548 /* lock_dlm */
549 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); 549 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
550 GLOCK_BUG_ON(gl, ret); 550 if (ret) {
551 printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret);
552 GLOCK_BUG_ON(gl, 1);
553 }
551 } else { /* lock_nolock */ 554 } else { /* lock_nolock */
552 finish_xmote(gl, target); 555 finish_xmote(gl, target);
553 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 556 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -736,6 +739,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
736 if (!gl) 739 if (!gl)
737 return -ENOMEM; 740 return -ENOMEM;
738 741
742 memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
743
744 if (glops->go_flags & GLOF_LVB) {
745 gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL);
746 if (!gl->gl_lksb.sb_lvbptr) {
747 kmem_cache_free(cachep, gl);
748 return -ENOMEM;
749 }
750 }
751
739 atomic_inc(&sdp->sd_glock_disposal); 752 atomic_inc(&sdp->sd_glock_disposal);
740 gl->gl_sbd = sdp; 753 gl->gl_sbd = sdp;
741 gl->gl_flags = 0; 754 gl->gl_flags = 0;
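
With the fixed 32-byte gl_lvb[] array removed from struct gfs2_glock (see the incore.h hunk below), the lock value block is now heap-allocated only for glock types flagged GLOF_LVB, and every path that frees a glock must free the LVB first, as the kfree() calls added in this diff do. The pairing, in sketch form:

    /* Allocation (in gfs2_glock_get(), as above): */
    if (glops->go_flags & GLOF_LVB) {
        gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL);
        if (!gl->gl_lksb.sb_lvbptr)
            return -ENOMEM; /* after freeing gl itself */
    }

    /* Every teardown path mirrors it; kfree(NULL) is a no-op, so this
     * is safe for glocks that never had an LVB: */
    kfree(gl->gl_lksb.sb_lvbptr);
    kmem_cache_free(gfs2_glock_cachep, gl);
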
@@ -753,9 +766,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
753 preempt_enable(); 766 preempt_enable();
754 gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; 767 gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0;
755 gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; 768 gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0;
756 memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
757 memset(gl->gl_lvb, 0, 32 * sizeof(char));
758 gl->gl_lksb.sb_lvbptr = gl->gl_lvb;
759 gl->gl_tchange = jiffies; 769 gl->gl_tchange = jiffies;
760 gl->gl_object = NULL; 770 gl->gl_object = NULL;
761 gl->gl_hold_time = GL_GLOCK_DFT_HOLD; 771 gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
@@ -768,7 +778,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
768 mapping->host = s->s_bdev->bd_inode; 778 mapping->host = s->s_bdev->bd_inode;
769 mapping->flags = 0; 779 mapping->flags = 0;
770 mapping_set_gfp_mask(mapping, GFP_NOFS); 780 mapping_set_gfp_mask(mapping, GFP_NOFS);
771 mapping->assoc_mapping = NULL; 781 mapping->private_data = NULL;
772 mapping->backing_dev_info = s->s_bdi; 782 mapping->backing_dev_info = s->s_bdi;
773 mapping->writeback_index = 0; 783 mapping->writeback_index = 0;
774 } 784 }
@@ -777,6 +787,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
777 tmp = search_bucket(hash, sdp, &name); 787 tmp = search_bucket(hash, sdp, &name);
778 if (tmp) { 788 if (tmp) {
779 spin_unlock_bucket(hash); 789 spin_unlock_bucket(hash);
790 kfree(gl->gl_lksb.sb_lvbptr);
780 kmem_cache_free(cachep, gl); 791 kmem_cache_free(cachep, gl);
781 atomic_dec(&sdp->sd_glock_disposal); 792 atomic_dec(&sdp->sd_glock_disposal);
782 gl = tmp; 793 gl = tmp;
@@ -1013,7 +1024,7 @@ trap_recursive:
1013 printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); 1024 printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
1014 printk(KERN_ERR "lock type: %d req lock state : %d\n", 1025 printk(KERN_ERR "lock type: %d req lock state : %d\n",
1015 gh->gh_gl->gl_name.ln_type, gh->gh_state); 1026 gh->gh_gl->gl_name.ln_type, gh->gh_state);
1016 __dump_glock(NULL, gl); 1027 gfs2_dump_glock(NULL, gl);
1017 BUG(); 1028 BUG();
1018} 1029}
1019 1030
@@ -1508,7 +1519,7 @@ static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1508{ 1519{
1509 int ret; 1520 int ret;
1510 spin_lock(&gl->gl_spin); 1521 spin_lock(&gl->gl_spin);
1511 ret = __dump_glock(seq, gl); 1522 ret = gfs2_dump_glock(seq, gl);
1512 spin_unlock(&gl->gl_spin); 1523 spin_unlock(&gl->gl_spin);
1513 return ret; 1524 return ret;
1514} 1525}
@@ -1528,6 +1539,7 @@ static void dump_glock_func(struct gfs2_glock *gl)
1528 1539
1529void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1540void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1530{ 1541{
1542 set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
1531 glock_hash_walk(clear_glock, sdp); 1543 glock_hash_walk(clear_glock, sdp);
1532 flush_workqueue(glock_workqueue); 1544 flush_workqueue(glock_workqueue);
1533 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); 1545 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
@@ -1655,7 +1667,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
1655} 1667}
1656 1668
1657/** 1669/**
1658 * __dump_glock - print information about a glock 1670 * gfs2_dump_glock - print information about a glock
1659 * @seq: The seq_file struct 1671 * @seq: The seq_file struct
1660 * @gl: the glock 1672 * @gl: the glock
1661 * 1673 *
@@ -1672,7 +1684,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
1672 * Returns: 0 on success, -ENOBUFS when we run out of space 1684 * Returns: 0 on success, -ENOBUFS when we run out of space
1673 */ 1685 */
1674 1686
1675static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) 1687int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1676{ 1688{
1677 const struct gfs2_glock_operations *glops = gl->gl_ops; 1689 const struct gfs2_glock_operations *glops = gl->gl_ops;
1678 unsigned long long dtime; 1690 unsigned long long dtime;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 307ac31df781..fd580b7861d5 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -178,33 +178,33 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
178 return NULL; 178 return NULL;
179} 179}
180 180
181int gfs2_glock_get(struct gfs2_sbd *sdp, 181extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
182 u64 number, const struct gfs2_glock_operations *glops, 182 const struct gfs2_glock_operations *glops,
183 int create, struct gfs2_glock **glp); 183 int create, struct gfs2_glock **glp);
184void gfs2_glock_hold(struct gfs2_glock *gl); 184extern void gfs2_glock_hold(struct gfs2_glock *gl);
185void gfs2_glock_put_nolock(struct gfs2_glock *gl); 185extern void gfs2_glock_put_nolock(struct gfs2_glock *gl);
186void gfs2_glock_put(struct gfs2_glock *gl); 186extern void gfs2_glock_put(struct gfs2_glock *gl);
187void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 187extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
188 struct gfs2_holder *gh); 188 unsigned flags, struct gfs2_holder *gh);
189void gfs2_holder_reinit(unsigned int state, unsigned flags, 189extern void gfs2_holder_reinit(unsigned int state, unsigned flags,
190 struct gfs2_holder *gh); 190 struct gfs2_holder *gh);
191void gfs2_holder_uninit(struct gfs2_holder *gh); 191extern void gfs2_holder_uninit(struct gfs2_holder *gh);
192int gfs2_glock_nq(struct gfs2_holder *gh); 192extern int gfs2_glock_nq(struct gfs2_holder *gh);
193int gfs2_glock_poll(struct gfs2_holder *gh); 193extern int gfs2_glock_poll(struct gfs2_holder *gh);
194int gfs2_glock_wait(struct gfs2_holder *gh); 194extern int gfs2_glock_wait(struct gfs2_holder *gh);
195void gfs2_glock_dq(struct gfs2_holder *gh); 195extern void gfs2_glock_dq(struct gfs2_holder *gh);
196void gfs2_glock_dq_wait(struct gfs2_holder *gh); 196extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
197 197extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
198void gfs2_glock_dq_uninit(struct gfs2_holder *gh); 198extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
199int gfs2_glock_nq_num(struct gfs2_sbd *sdp, 199 const struct gfs2_glock_operations *glops,
200 u64 number, const struct gfs2_glock_operations *glops, 200 unsigned int state, int flags,
201 unsigned int state, int flags, struct gfs2_holder *gh); 201 struct gfs2_holder *gh);
202 202extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
203int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 203extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
204void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 204extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
205void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 205extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
206 206#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
207__printf(2, 3) 207extern __printf(2, 3)
208void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 208void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
209 209
210/** 210/**
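
GLOCK_BUG_ON() moves into the header alongside the now-exported gfs2_dump_glock(), so glops.c can use it too (see below). Its do { ... } while(0) wrapper is the standard idiom for multi-statement macros: the expansion is a single statement, so a trailing semicolon composes correctly with if/else. Illustration with hypothetical names:

    #define TWO_STEPS(x) do { step_one(x); step_two(x); } while (0)

    if (cond)
        TWO_STEPS(v);   /* still one statement; the else below still binds */
    else
        other_path();
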
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 32cc4fde975c..78d4184ffc7d 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -74,7 +74,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
74 74
75 gfs2_trans_add_revoke(sdp, bd); 75 gfs2_trans_add_revoke(sdp, bd);
76 } 76 }
77 BUG_ON(!fsync && atomic_read(&gl->gl_ail_count)); 77 GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
78 spin_unlock(&sdp->sd_ail_lock); 78 spin_unlock(&sdp->sd_ail_lock);
79 gfs2_log_unlock(sdp); 79 gfs2_log_unlock(sdp);
80} 80}
@@ -96,7 +96,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
96 tr.tr_ip = (unsigned long)__builtin_return_address(0); 96 tr.tr_ip = (unsigned long)__builtin_return_address(0);
97 sb_start_intwrite(sdp->sd_vfs); 97 sb_start_intwrite(sdp->sd_vfs);
98 gfs2_log_reserve(sdp, tr.tr_reserved); 98 gfs2_log_reserve(sdp, tr.tr_reserved);
99 BUG_ON(current->journal_info); 99 WARN_ON_ONCE(current->journal_info);
100 current->journal_info = &tr; 100 current->journal_info = &tr;
101 101
102 __gfs2_ail_flush(gl, 0); 102 __gfs2_ail_flush(gl, 0);
@@ -139,7 +139,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
139 139
140 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 140 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
141 return; 141 return;
142 BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); 142 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
143 143
144 gfs2_log_flush(gl->gl_sbd, gl); 144 gfs2_log_flush(gl->gl_sbd, gl);
145 filemap_fdatawrite(metamapping); 145 filemap_fdatawrite(metamapping);
@@ -168,7 +168,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
168{ 168{
169 struct address_space *mapping = gfs2_glock2aspace(gl); 169 struct address_space *mapping = gfs2_glock2aspace(gl);
170 170
171 BUG_ON(!(flags & DIO_METADATA)); 171 WARN_ON_ONCE(!(flags & DIO_METADATA));
172 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 172 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
173 truncate_inode_pages(mapping, 0); 173 truncate_inode_pages(mapping, 0);
174 174
@@ -197,7 +197,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
197 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 197 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
198 return; 198 return;
199 199
200 BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); 200 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
201 201
202 gfs2_log_flush(gl->gl_sbd, gl); 202 gfs2_log_flush(gl->gl_sbd, gl);
203 filemap_fdatawrite(metamapping); 203 filemap_fdatawrite(metamapping);
@@ -536,7 +536,7 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
536}; 536};
537 537
538const struct gfs2_glock_operations gfs2_inode_glops = { 538const struct gfs2_glock_operations gfs2_inode_glops = {
539 .go_xmote_th = inode_go_sync, 539 .go_sync = inode_go_sync,
540 .go_inval = inode_go_inval, 540 .go_inval = inode_go_inval,
541 .go_demote_ok = inode_go_demote_ok, 541 .go_demote_ok = inode_go_demote_ok,
542 .go_lock = inode_go_lock, 542 .go_lock = inode_go_lock,
@@ -546,17 +546,17 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
546}; 546};
547 547
548const struct gfs2_glock_operations gfs2_rgrp_glops = { 548const struct gfs2_glock_operations gfs2_rgrp_glops = {
549 .go_xmote_th = rgrp_go_sync, 549 .go_sync = rgrp_go_sync,
550 .go_inval = rgrp_go_inval, 550 .go_inval = rgrp_go_inval,
551 .go_lock = gfs2_rgrp_go_lock, 551 .go_lock = gfs2_rgrp_go_lock,
552 .go_unlock = gfs2_rgrp_go_unlock, 552 .go_unlock = gfs2_rgrp_go_unlock,
553 .go_dump = gfs2_rgrp_dump, 553 .go_dump = gfs2_rgrp_dump,
554 .go_type = LM_TYPE_RGRP, 554 .go_type = LM_TYPE_RGRP,
555 .go_flags = GLOF_ASPACE, 555 .go_flags = GLOF_ASPACE | GLOF_LVB,
556}; 556};
557 557
558const struct gfs2_glock_operations gfs2_trans_glops = { 558const struct gfs2_glock_operations gfs2_trans_glops = {
559 .go_xmote_th = trans_go_sync, 559 .go_sync = trans_go_sync,
560 .go_xmote_bh = trans_go_xmote_bh, 560 .go_xmote_bh = trans_go_xmote_bh,
561 .go_demote_ok = trans_go_demote_ok, 561 .go_demote_ok = trans_go_demote_ok,
562 .go_type = LM_TYPE_NONDISK, 562 .go_type = LM_TYPE_NONDISK,
@@ -577,6 +577,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
577 577
578const struct gfs2_glock_operations gfs2_quota_glops = { 578const struct gfs2_glock_operations gfs2_quota_glops = {
579 .go_type = LM_TYPE_QUOTA, 579 .go_type = LM_TYPE_QUOTA,
580 .go_flags = GLOF_LVB,
580}; 581};
581 582
582const struct gfs2_glock_operations gfs2_journal_glops = { 583const struct gfs2_glock_operations gfs2_journal_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 3d469d37345e..c373a24fedd9 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -205,7 +205,7 @@ struct lm_lockname {
205 205
206 206
207struct gfs2_glock_operations { 207struct gfs2_glock_operations {
208 void (*go_xmote_th) (struct gfs2_glock *gl); 208 void (*go_sync) (struct gfs2_glock *gl);
209 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); 209 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
210 void (*go_inval) (struct gfs2_glock *gl, int flags); 210 void (*go_inval) (struct gfs2_glock *gl, int flags);
211 int (*go_demote_ok) (const struct gfs2_glock *gl); 211 int (*go_demote_ok) (const struct gfs2_glock *gl);
@@ -216,6 +216,7 @@ struct gfs2_glock_operations {
216 const int go_type; 216 const int go_type;
217 const unsigned long go_flags; 217 const unsigned long go_flags;
218#define GLOF_ASPACE 1 218#define GLOF_ASPACE 1
219#define GLOF_LVB 2
219}; 220};
220 221
221enum { 222enum {
@@ -321,7 +322,6 @@ struct gfs2_glock {
321 ktime_t gl_dstamp; 322 ktime_t gl_dstamp;
322 struct gfs2_lkstats gl_stats; 323 struct gfs2_lkstats gl_stats;
323 struct dlm_lksb gl_lksb; 324 struct dlm_lksb gl_lksb;
324 char gl_lvb[32];
325 unsigned long gl_tchange; 325 unsigned long gl_tchange;
326 void *gl_object; 326 void *gl_object;
327 327
@@ -539,6 +539,7 @@ enum {
539 SDF_DEMOTE = 5, 539 SDF_DEMOTE = 5,
540 SDF_NOJOURNALID = 6, 540 SDF_NOJOURNALID = 6,
541 SDF_RORECOVERY = 7, /* read only recovery */ 541 SDF_RORECOVERY = 7, /* read only recovery */
542 SDF_SKIP_DLM_UNLOCK = 8,
542}; 543};
543 544
544#define GFS2_FSNAME_LEN 256 545#define GFS2_FSNAME_LEN 256
@@ -621,6 +622,7 @@ struct gfs2_sbd {
621 u32 sd_hash_bsize_shift; 622 u32 sd_hash_bsize_shift;
622 u32 sd_hash_ptrs; /* Number of pointers in a hash block */ 623 u32 sd_hash_ptrs; /* Number of pointers in a hash block */
623 u32 sd_qc_per_block; 624 u32 sd_qc_per_block;
625 u32 sd_blocks_per_bitmap;
624 u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ 626 u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
625 u32 sd_max_height; /* Max height of a file's metadata tree */ 627 u32 sd_max_height; /* Max height of a file's metadata tree */
626 u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; 628 u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 381893ceefa4..2b6f5698ef18 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -364,34 +364,34 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
364 return 0; 364 return 0;
365} 365}
366 366
367static void munge_mode_uid_gid(struct gfs2_inode *dip, umode_t *mode, 367static void munge_mode_uid_gid(const struct gfs2_inode *dip,
368 unsigned int *uid, unsigned int *gid) 368 struct inode *inode)
369{ 369{
370 if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && 370 if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
371 (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { 371 (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) {
372 if (S_ISDIR(*mode)) 372 if (S_ISDIR(inode->i_mode))
373 *mode |= S_ISUID; 373 inode->i_mode |= S_ISUID;
374 else if (dip->i_inode.i_uid != current_fsuid()) 374 else if (dip->i_inode.i_uid != current_fsuid())
375 *mode &= ~07111; 375 inode->i_mode &= ~07111;
376 *uid = dip->i_inode.i_uid; 376 inode->i_uid = dip->i_inode.i_uid;
377 } else 377 } else
378 *uid = current_fsuid(); 378 inode->i_uid = current_fsuid();
379 379
380 if (dip->i_inode.i_mode & S_ISGID) { 380 if (dip->i_inode.i_mode & S_ISGID) {
381 if (S_ISDIR(*mode)) 381 if (S_ISDIR(inode->i_mode))
382 *mode |= S_ISGID; 382 inode->i_mode |= S_ISGID;
383 *gid = dip->i_inode.i_gid; 383 inode->i_gid = dip->i_inode.i_gid;
384 } else 384 } else
385 *gid = current_fsgid(); 385 inode->i_gid = current_fsgid();
386} 386}
387 387
388static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) 388static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
389{ 389{
390 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 390 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
391 int error; 391 int error;
392 int dblocks = 1; 392 int dblocks = 1;
393 393
394 error = gfs2_inplace_reserve(dip, RES_DINODE); 394 error = gfs2_inplace_reserve(ip, RES_DINODE, flags);
395 if (error) 395 if (error)
396 goto out; 396 goto out;
397 397
@@ -399,12 +399,15 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
399 if (error) 399 if (error)
400 goto out_ipreserv; 400 goto out_ipreserv;
401 401
402 error = gfs2_alloc_blocks(dip, no_addr, &dblocks, 1, generation); 402 error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation);
403 ip->i_no_formal_ino = ip->i_generation;
404 ip->i_inode.i_ino = ip->i_no_addr;
405 ip->i_goal = ip->i_no_addr;
403 406
404 gfs2_trans_end(sdp); 407 gfs2_trans_end(sdp);
405 408
406out_ipreserv: 409out_ipreserv:
407 gfs2_inplace_release(dip); 410 gfs2_inplace_release(ip);
408out: 411out:
409 return error; 412 return error;
410} 413}
@@ -429,52 +432,42 @@ static void gfs2_init_dir(struct buffer_head *dibh,
429/** 432/**
430 * init_dinode - Fill in a new dinode structure 433 * init_dinode - Fill in a new dinode structure
431 * @dip: The directory this inode is being created in 434 * @dip: The directory this inode is being created in
432 * @gl: The glock covering the new inode 435 * @ip: The inode
433 * @inum: The inode number
434 * @mode: The file permissions
435 * @uid: The uid of the new inode
436 * @gid: The gid of the new inode
437 * @generation: The generation number of the new inode
438 * @dev: The device number (if a device node)
439 * @symname: The symlink destination (if a symlink) 436 * @symname: The symlink destination (if a symlink)
440 * @size: The inode size (ignored for directories)
441 * @bhp: The buffer head (returned to caller) 437 * @bhp: The buffer head (returned to caller)
442 * 438 *
443 */ 439 */
444 440
445static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, 441static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
446 const struct gfs2_inum_host *inum, umode_t mode, 442 const char *symname, struct buffer_head **bhp)
447 unsigned int uid, unsigned int gid,
448 const u64 *generation, dev_t dev, const char *symname,
449 unsigned size, struct buffer_head **bhp)
450{ 443{
451 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 444 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
452 struct gfs2_dinode *di; 445 struct gfs2_dinode *di;
453 struct buffer_head *dibh; 446 struct buffer_head *dibh;
454 struct timespec tv = CURRENT_TIME; 447 struct timespec tv = CURRENT_TIME;
455 448
456 dibh = gfs2_meta_new(gl, inum->no_addr); 449 dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr);
457 gfs2_trans_add_bh(gl, dibh, 1); 450 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
458 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); 451 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
459 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 452 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
460 di = (struct gfs2_dinode *)dibh->b_data; 453 di = (struct gfs2_dinode *)dibh->b_data;
461 454
462 di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino); 455 di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
463 di->di_num.no_addr = cpu_to_be64(inum->no_addr); 456 di->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
464 di->di_mode = cpu_to_be32(mode); 457 di->di_mode = cpu_to_be32(ip->i_inode.i_mode);
465 di->di_uid = cpu_to_be32(uid); 458 di->di_uid = cpu_to_be32(ip->i_inode.i_uid);
466 di->di_gid = cpu_to_be32(gid); 459 di->di_gid = cpu_to_be32(ip->i_inode.i_gid);
467 di->di_nlink = 0; 460 di->di_nlink = 0;
468 di->di_size = cpu_to_be64(size); 461 di->di_size = cpu_to_be64(ip->i_inode.i_size);
469 di->di_blocks = cpu_to_be64(1); 462 di->di_blocks = cpu_to_be64(1);
470 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); 463 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec);
471 di->di_major = cpu_to_be32(MAJOR(dev)); 464 di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev));
472 di->di_minor = cpu_to_be32(MINOR(dev)); 465 di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev));
473 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); 466 di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr);
474 di->di_generation = cpu_to_be64(*generation); 467 di->di_generation = cpu_to_be64(ip->i_generation);
475 di->di_flags = 0; 468 di->di_flags = 0;
476 di->__pad1 = 0; 469 di->__pad1 = 0;
477 di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0); 470 di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? GFS2_FORMAT_DE : 0);
478 di->di_height = 0; 471 di->di_height = 0;
479 di->__pad2 = 0; 472 di->__pad2 = 0;
480 di->__pad3 = 0; 473 di->__pad3 = 0;
@@ -487,7 +480,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
487 di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); 480 di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
488 memset(&di->di_reserved, 0, sizeof(di->di_reserved)); 481 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
489 482
490 switch(mode & S_IFMT) { 483 switch(ip->i_inode.i_mode & S_IFMT) {
491 case S_IFREG: 484 case S_IFREG:
492 if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || 485 if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
493 gfs2_tune_get(sdp, gt_new_files_jdata)) 486 gfs2_tune_get(sdp, gt_new_files_jdata))
@@ -502,7 +495,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
502 gfs2_init_dir(dibh, dip); 495 gfs2_init_dir(dibh, dip);
503 break; 496 break;
504 case S_IFLNK: 497 case S_IFLNK:
505 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size); 498 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, ip->i_inode.i_size);
506 break; 499 break;
507 } 500 }
508 501
@@ -511,25 +504,22 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
511 *bhp = dibh; 504 *bhp = dibh;
512} 505}
513 506
514static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, 507static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
515 umode_t mode, const struct gfs2_inum_host *inum, 508 const char *symname, struct buffer_head **bhp)
516 const u64 *generation, dev_t dev, const char *symname,
517 unsigned int size, struct buffer_head **bhp)
518{ 509{
510 struct inode *inode = &ip->i_inode;
519 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 511 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
520 unsigned int uid, gid;
521 int error; 512 int error;
522 513
523 munge_mode_uid_gid(dip, &mode, &uid, &gid);
524 error = gfs2_rindex_update(sdp); 514 error = gfs2_rindex_update(sdp);
525 if (error) 515 if (error)
526 return error; 516 return error;
527 517
528 error = gfs2_quota_lock(dip, uid, gid); 518 error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid);
529 if (error) 519 if (error)
530 return error; 520 return error;
531 521
532 error = gfs2_quota_check(dip, uid, gid); 522 error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid);
533 if (error) 523 if (error)
534 goto out_quota; 524 goto out_quota;
535 525
@@ -537,8 +527,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
537 if (error) 527 if (error)
538 goto out_quota; 528 goto out_quota;
539 529
540 init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp); 530 init_dinode(dip, ip, symname, bhp);
541 gfs2_quota_change(dip, +1, uid, gid); 531 gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid);
542 gfs2_trans_end(sdp); 532 gfs2_trans_end(sdp);
543 533
544out_quota: 534out_quota:
@@ -570,7 +560,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
570 if (error) 560 if (error)
571 goto fail_quota_locks; 561 goto fail_quota_locks;
572 562
573 error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); 563 error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
574 if (error) 564 if (error)
575 goto fail_quota_locks; 565 goto fail_quota_locks;
576 566
@@ -657,19 +647,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
657 struct inode *inode = NULL; 647 struct inode *inode = NULL;
658 struct gfs2_inode *dip = GFS2_I(dir), *ip; 648 struct gfs2_inode *dip = GFS2_I(dir), *ip;
659 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 649 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
660 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; 650 struct gfs2_glock *io_gl;
661 int error; 651 int error;
662 u64 generation;
663 struct buffer_head *bh = NULL; 652 struct buffer_head *bh = NULL;
653 u32 aflags = 0;
664 654
665 if (!name->len || name->len > GFS2_FNAMESIZE) 655 if (!name->len || name->len > GFS2_FNAMESIZE)
666 return -ENAMETOOLONG; 656 return -ENAMETOOLONG;
667 657
668 /* We need a reservation to allocate the new dinode block. The
669 directory ip temporarily points to the reservation, but this is
670 being done to get a set of contiguous blocks for the new dinode.
671 Since this is a create, we don't have a sizehint yet, so it will
672 have to use the minimum reservation size. */
673 error = gfs2_rs_alloc(dip); 658 error = gfs2_rs_alloc(dip);
674 if (error) 659 if (error)
675 return error; 660 return error;
@@ -688,45 +673,72 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
688 if (error) 673 if (error)
689 goto fail_gunlock; 674 goto fail_gunlock;
690 675
691 error = alloc_dinode(dip, &inum.no_addr, &generation); 676 inode = new_inode(sdp->sd_vfs);
677 if (!inode) {
678 gfs2_glock_dq_uninit(ghs);
679 return -ENOMEM;
680 }
681 ip = GFS2_I(inode);
682 error = gfs2_rs_alloc(ip);
692 if (error) 683 if (error)
693 goto fail_gunlock; 684 goto fail_free_inode;
694 inum.no_formal_ino = generation; 685
686 set_bit(GIF_INVALID, &ip->i_flags);
687 inode->i_mode = mode;
688 inode->i_rdev = dev;
689 inode->i_size = size;
690 munge_mode_uid_gid(dip, inode);
691 ip->i_goal = dip->i_goal;
695 692
696 error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, 693 if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) ||
697 LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); 694 (dip->i_diskflags & GFS2_DIF_TOPDIR))
695 aflags |= GFS2_AF_ORLOV;
696
697 error = alloc_dinode(ip, aflags);
698 if (error) 698 if (error)
699 goto fail_gunlock; 699 goto fail_free_inode;
700 700
701 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh); 701 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
702 if (error) 702 if (error)
703 goto fail_gunlock2; 703 goto fail_free_inode;
704 704
705 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, 705 ip->i_gl->gl_object = ip;
706 inum.no_formal_ino, 0); 706 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
707 if (IS_ERR(inode)) 707 if (error)
708 goto fail_free_inode;
709
710 error = make_dinode(dip, ip, symname, &bh);
711 if (error)
708 goto fail_gunlock2; 712 goto fail_gunlock2;
709 713
710 ip = GFS2_I(inode); 714 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
711 error = gfs2_inode_refresh(ip);
712 if (error) 715 if (error)
713 goto fail_gunlock2; 716 goto fail_gunlock2;
714 717
715 error = gfs2_rs_alloc(ip); 718 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
716 if (error) 719 if (error)
717 goto fail_gunlock2; 720 goto fail_gunlock2;
718 721
722 ip->i_iopen_gh.gh_gl->gl_object = ip;
723 gfs2_glock_put(io_gl);
724 gfs2_set_iop(inode);
725 insert_inode_hash(inode);
726
727 error = gfs2_inode_refresh(ip);
728 if (error)
729 goto fail_gunlock3;
730
719 error = gfs2_acl_create(dip, inode); 731 error = gfs2_acl_create(dip, inode);
720 if (error) 732 if (error)
721 goto fail_gunlock2; 733 goto fail_gunlock3;
722 734
723 error = gfs2_security_init(dip, ip, name); 735 error = gfs2_security_init(dip, ip, name);
724 if (error) 736 if (error)
725 goto fail_gunlock2; 737 goto fail_gunlock3;
726 738
727 error = link_dinode(dip, name, ip); 739 error = link_dinode(dip, name, ip);
728 if (error) 740 if (error)
729 goto fail_gunlock2; 741 goto fail_gunlock3;
730 742
731 if (bh) 743 if (bh)
732 brelse(bh); 744 brelse(bh);
@@ -739,8 +751,20 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
739 d_instantiate(dentry, inode); 751 d_instantiate(dentry, inode);
740 return 0; 752 return 0;
741 753
754fail_gunlock3:
755 gfs2_glock_dq_uninit(ghs + 1);
756 if (ip->i_gl)
757 gfs2_glock_put(ip->i_gl);
758 goto fail_gunlock;
759
742fail_gunlock2: 760fail_gunlock2:
743 gfs2_glock_dq_uninit(ghs + 1); 761 gfs2_glock_dq_uninit(ghs + 1);
762fail_free_inode:
763 if (ip->i_gl)
764 gfs2_glock_put(ip->i_gl);
765 gfs2_rs_delete(ip);
766 free_inode_nonrcu(inode);
767 inode = NULL;
744fail_gunlock: 768fail_gunlock:
745 gfs2_glock_dq_uninit(ghs); 769 gfs2_glock_dq_uninit(ghs);
746 if (inode && !IS_ERR(inode)) { 770 if (inode && !IS_ERR(inode)) {
@@ -748,7 +772,6 @@ fail_gunlock:
748 iput(inode); 772 iput(inode);
749 } 773 }
750fail: 774fail:
751 gfs2_rs_delete(dip);
752 if (bh) 775 if (bh)
753 brelse(bh); 776 brelse(bh);
754 return error; 777 return error;
@@ -884,7 +907,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
884 if (error) 907 if (error)
885 goto out_gunlock; 908 goto out_gunlock;
886 909
887 error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); 910 error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
888 if (error) 911 if (error)
889 goto out_gunlock_q; 912 goto out_gunlock_q;
890 913
@@ -977,7 +1000,6 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
977 * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it 1000 * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it
978 * @dip: The parent directory 1001 * @dip: The parent directory
979 * @name: The name of the entry in the parent directory 1002 * @name: The name of the entry in the parent directory
980 * @bh: The inode buffer for the inode to be removed
981 * @inode: The inode to be removed 1003 * @inode: The inode to be removed
982 * 1004 *
983 * Called with all the locks and in a transaction. This will only be 1005 * Called with all the locks and in a transaction. This will only be
@@ -987,8 +1009,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
987 */ 1009 */
988 1010
989static int gfs2_unlink_inode(struct gfs2_inode *dip, 1011static int gfs2_unlink_inode(struct gfs2_inode *dip,
990 const struct dentry *dentry, 1012 const struct dentry *dentry)
991 struct buffer_head *bh)
992{ 1013{
993 struct inode *inode = dentry->d_inode; 1014 struct inode *inode = dentry->d_inode;
994 struct gfs2_inode *ip = GFS2_I(inode); 1015 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1028,7 +1049,6 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
1028 struct gfs2_sbd *sdp = GFS2_SB(dir); 1049 struct gfs2_sbd *sdp = GFS2_SB(dir);
1029 struct inode *inode = dentry->d_inode; 1050 struct inode *inode = dentry->d_inode;
1030 struct gfs2_inode *ip = GFS2_I(inode); 1051 struct gfs2_inode *ip = GFS2_I(inode);
1031 struct buffer_head *bh;
1032 struct gfs2_holder ghs[3]; 1052 struct gfs2_holder ghs[3];
1033 struct gfs2_rgrpd *rgd; 1053 struct gfs2_rgrpd *rgd;
1034 int error; 1054 int error;
@@ -1077,14 +1097,9 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
1077 1097
1078 error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); 1098 error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0);
1079 if (error) 1099 if (error)
1080 goto out_gunlock;
1081
1082 error = gfs2_meta_inode_buffer(ip, &bh);
1083 if (error)
1084 goto out_end_trans; 1100 goto out_end_trans;
1085 1101
1086 error = gfs2_unlink_inode(dip, dentry, bh); 1102 error = gfs2_unlink_inode(dip, dentry);
1087 brelse(bh);
1088 1103
1089out_end_trans: 1104out_end_trans:
1090 gfs2_trans_end(sdp); 1105 gfs2_trans_end(sdp);
@@ -1365,7 +1380,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1365 if (error) 1380 if (error)
1366 goto out_gunlock; 1381 goto out_gunlock;
1367 1382
1368 error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres); 1383 error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0);
1369 if (error) 1384 if (error)
1370 goto out_gunlock_q; 1385 goto out_gunlock_q;
1371 1386
@@ -1384,14 +1399,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1384 1399
1385 /* Remove the target file, if it exists */ 1400 /* Remove the target file, if it exists */
1386 1401
1387 if (nip) { 1402 if (nip)
1388 struct buffer_head *bh; 1403 error = gfs2_unlink_inode(ndip, ndentry);
1389 error = gfs2_meta_inode_buffer(nip, &bh);
1390 if (error)
1391 goto out_end_trans;
1392 error = gfs2_unlink_inode(ndip, ndentry, bh);
1393 brelse(bh);
1394 }
1395 1404
1396 if (dir_rename) { 1405 if (dir_rename) {
1397 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); 1406 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0fb6539b0c8c..8dad6b093716 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -120,8 +120,8 @@ static void gdlm_ast(void *arg)
120 gfs2_update_reply_times(gl); 120 gfs2_update_reply_times(gl);
121 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 121 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
122 122
123 if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) 123 if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr)
124 memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); 124 memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
125 125
126 switch (gl->gl_lksb.sb_status) { 126 switch (gl->gl_lksb.sb_status) {
127 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 127 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
@@ -203,8 +203,10 @@ static int make_mode(const unsigned int lmstate)
203static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, 203static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
204 const int req) 204 const int req)
205{ 205{
206 u32 lkf = DLM_LKF_VALBLK; 206 u32 lkf = 0;
207 u32 lkid = gl->gl_lksb.sb_lkid; 207
208 if (gl->gl_lksb.sb_lvbptr)
209 lkf |= DLM_LKF_VALBLK;
208 210
209 if (gfs_flags & LM_FLAG_TRY) 211 if (gfs_flags & LM_FLAG_TRY)
210 lkf |= DLM_LKF_NOQUEUE; 212 lkf |= DLM_LKF_NOQUEUE;
@@ -228,7 +230,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
228 BUG(); 230 BUG();
229 } 231 }
230 232
231 if (lkid != 0) { 233 if (gl->gl_lksb.sb_lkid != 0) {
232 lkf |= DLM_LKF_CONVERT; 234 lkf |= DLM_LKF_CONVERT;
233 if (test_bit(GLF_BLOCKING, &gl->gl_flags)) 235 if (test_bit(GLF_BLOCKING, &gl->gl_flags))
234 lkf |= DLM_LKF_QUECVT; 236 lkf |= DLM_LKF_QUECVT;
@@ -289,6 +291,14 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
289 gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); 291 gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
290 gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); 292 gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
291 gfs2_update_request_times(gl); 293 gfs2_update_request_times(gl);
294
295 /* don't want to skip dlm_unlock writing the lvb when lock is ex */
296 if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
297 gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) {
298 gfs2_glock_free(gl);
299 return;
300 }
301
292 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, 302 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
293 NULL, gl); 303 NULL, gl);
294 if (error) { 304 if (error) {
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 8ff95a2d54ee..9ceccb1595a3 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -393,12 +393,10 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
393 struct gfs2_meta_header *mh; 393 struct gfs2_meta_header *mh;
394 struct gfs2_trans *tr; 394 struct gfs2_trans *tr;
395 395
396 lock_buffer(bd->bd_bh);
397 gfs2_log_lock(sdp);
398 tr = current->journal_info; 396 tr = current->journal_info;
399 tr->tr_touched = 1; 397 tr->tr_touched = 1;
400 if (!list_empty(&bd->bd_list)) 398 if (!list_empty(&bd->bd_list))
401 goto out; 399 return;
402 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 400 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
403 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 401 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
404 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; 402 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
@@ -414,9 +412,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
414 sdp->sd_log_num_buf++; 412 sdp->sd_log_num_buf++;
415 list_add(&bd->bd_list, &sdp->sd_log_le_buf); 413 list_add(&bd->bd_list, &sdp->sd_log_le_buf);
416 tr->tr_num_buf_new++; 414 tr->tr_num_buf_new++;
417out:
418 gfs2_log_unlock(sdp);
419 unlock_buffer(bd->bd_bh);
420} 415}
421 416
422static void gfs2_check_magic(struct buffer_head *bh) 417static void gfs2_check_magic(struct buffer_head *bh)
@@ -621,7 +616,6 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
621 616
622static void revoke_lo_before_commit(struct gfs2_sbd *sdp) 617static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
623{ 618{
624 struct gfs2_log_descriptor *ld;
625 struct gfs2_meta_header *mh; 619 struct gfs2_meta_header *mh;
626 unsigned int offset; 620 unsigned int offset;
627 struct list_head *head = &sdp->sd_log_le_revoke; 621 struct list_head *head = &sdp->sd_log_le_revoke;
@@ -634,7 +628,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
634 628
635 length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); 629 length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64));
636 page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); 630 page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke);
637 ld = page_address(page);
638 offset = sizeof(struct gfs2_log_descriptor); 631 offset = sizeof(struct gfs2_log_descriptor);
639 632
640 list_for_each_entry(bd, head, bd_list) { 633 list_for_each_entry(bd, head, bd_list) {
@@ -777,12 +770,10 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
777 struct address_space *mapping = bd->bd_bh->b_page->mapping; 770 struct address_space *mapping = bd->bd_bh->b_page->mapping;
778 struct gfs2_inode *ip = GFS2_I(mapping->host); 771 struct gfs2_inode *ip = GFS2_I(mapping->host);
779 772
780 lock_buffer(bd->bd_bh);
781 gfs2_log_lock(sdp);
782 if (tr) 773 if (tr)
783 tr->tr_touched = 1; 774 tr->tr_touched = 1;
784 if (!list_empty(&bd->bd_list)) 775 if (!list_empty(&bd->bd_list))
785 goto out; 776 return;
786 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 777 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
787 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 778 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
788 if (gfs2_is_jdata(ip)) { 779 if (gfs2_is_jdata(ip)) {
@@ -793,9 +784,6 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
793 } else { 784 } else {
794 list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); 785 list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
795 } 786 }
796out:
797 gfs2_log_unlock(sdp);
798 unlock_buffer(bd->bd_bh);
799} 787}
800 788
801/** 789/**
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e443966c8106..0e3554edb8f2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -278,6 +278,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
278 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - 278 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
279 sizeof(struct gfs2_meta_header)) / 279 sizeof(struct gfs2_meta_header)) /
280 sizeof(struct gfs2_quota_change); 280 sizeof(struct gfs2_quota_change);
281 sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize -
282 sizeof(struct gfs2_meta_header))
283 * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */
281 284
282 /* Compute maximum reservation required to add a entry to a directory */ 285 /* Compute maximum reservation required to add a entry to a directory */
283 286
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 40c4b0d42fa8..ae55e248c3b7 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -497,8 +497,11 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
497 struct gfs2_quota_data **qd; 497 struct gfs2_quota_data **qd;
498 int error; 498 int error;
499 499
500 if (ip->i_res == NULL) 500 if (ip->i_res == NULL) {
501 gfs2_rs_alloc(ip); 501 error = gfs2_rs_alloc(ip);
502 if (error)
503 return error;
504 }
502 505
503 qd = ip->i_res->rs_qa_qd; 506 qd = ip->i_res->rs_qa_qd;
504 507
@@ -813,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
813 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; 816 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
814 817
815 reserved = 1 + (nalloc * (data_blocks + ind_blocks)); 818 reserved = 1 + (nalloc * (data_blocks + ind_blocks));
816 error = gfs2_inplace_reserve(ip, reserved); 819 error = gfs2_inplace_reserve(ip, reserved, 0);
817 if (error) 820 if (error)
818 goto out_alloc; 821 goto out_alloc;
819 822
@@ -866,7 +869,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
866 if (error < 0) 869 if (error < 0)
867 return error; 870 return error;
868 871
869 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 872 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
870 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); 873 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
871 qlvb->__pad = 0; 874 qlvb->__pad = 0;
872 qlvb->qb_limit = q.qu_limit; 875 qlvb->qb_limit = q.qu_limit;
@@ -890,7 +893,7 @@ restart:
890 if (error) 893 if (error)
891 return error; 894 return error;
892 895
893 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 896 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
894 897
895 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { 898 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
896 gfs2_glock_dq_uninit(q_gh); 899 gfs2_glock_dq_uninit(q_gh);
@@ -1503,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
1503 if (error) 1506 if (error)
1504 goto out; 1507 goto out;
1505 1508
1506 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 1509 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
1507 fdq->d_version = FS_DQUOT_VERSION; 1510 fdq->d_version = FS_DQUOT_VERSION;
1508 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; 1511 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
1509 fdq->d_id = from_kqid(&init_user_ns, qid); 1512 fdq->d_id = from_kqid(&init_user_ns, qid);
@@ -1602,7 +1605,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
1602 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), 1605 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
1603 &data_blocks, &ind_blocks); 1606 &data_blocks, &ind_blocks);
1604 blocks = 1 + data_blocks + ind_blocks; 1607 blocks = 1 + data_blocks + ind_blocks;
1605 error = gfs2_inplace_reserve(ip, blocks); 1608 error = gfs2_inplace_reserve(ip, blocks, 0);
1606 if (error) 1609 if (error)
1607 goto out_i; 1610 goto out_i;
1608 blocks += gfs2_rg_blocks(ip, blocks); 1611 blocks += gfs2_rg_blocks(ip, blocks);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3cc402ce6fea..37ee061d899e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -16,6 +16,7 @@
16#include <linux/prefetch.h> 16#include <linux/prefetch.h>
17#include <linux/blkdev.h> 17#include <linux/blkdev.h>
18#include <linux/rbtree.h> 18#include <linux/rbtree.h>
19#include <linux/random.h>
19 20
20#include "gfs2.h" 21#include "gfs2.h"
21#include "incore.h" 22#include "incore.h"
@@ -251,22 +252,25 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
251static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) 252static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
252{ 253{
253 u64 rblock = block - rbm->rgd->rd_data0; 254 u64 rblock = block - rbm->rgd->rd_data0;
254 u32 goal = (u32)rblock; 255 u32 x;
255 int x;
256 256
257 if (WARN_ON_ONCE(rblock > UINT_MAX)) 257 if (WARN_ON_ONCE(rblock > UINT_MAX))
258 return -EINVAL; 258 return -EINVAL;
259 if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) 259 if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
260 return -E2BIG; 260 return -E2BIG;
261 261
262 for (x = 0; x < rbm->rgd->rd_length; x++) { 262 rbm->bi = rbm->rgd->rd_bits;
263 rbm->bi = rbm->rgd->rd_bits + x; 263 rbm->offset = (u32)(rblock);
264 if (goal < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) { 264 /* Check if the block is within the first bitmap block */
265 rbm->offset = goal - (rbm->bi->bi_start * GFS2_NBBY); 265 if (rbm->offset < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY)
266 break; 266 return 0;
267 }
268 }
269 267
268 /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
269 rbm->offset += (sizeof(struct gfs2_rgrp) -
270 sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
271 x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
272 rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
273 rbm->bi += x;
270 return 0; 274 return 0;
271} 275}
272 276
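
The rewrite above replaces the per-bitmap scan with direct arithmetic: the first bitmap in an rgrp is shorter because the on-disk gfs2_rgrp header is larger than the gfs2_meta_header that starts every subsequent bitmap block, so the offset is padded by that size difference before dividing by the sd_blocks_per_bitmap value added in ops_fstype.c above. A standalone sketch of the same arithmetic, with assumed header sizes:

/* Standalone sketch of the new gfs2_rbm_from_block() arithmetic.
 * The header sizes are illustrative assumptions, not the real
 * on-disk structure sizes. */
#include <stdint.h>
#include <stdio.h>

#define GFS2_NBBY 4       /* 2 bits per block => 4 blocks per byte */
#define HDR_META  24      /* assumed sizeof(struct gfs2_meta_header) */
#define HDR_RGRP  40      /* assumed sizeof(struct gfs2_rgrp) */
#define BSIZE     4096    /* filesystem block size */

int main(void)
{
	/* The first bitmap is shorter (rgrp header); every later bitmap
	 * holds later_cap blocks, which sd_blocks_per_bitmap caches. */
	uint32_t first_cap = (BSIZE - HDR_RGRP) * GFS2_NBBY;
	uint32_t later_cap = (BSIZE - HDR_META) * GFS2_NBBY;

	uint32_t rblock = 20000;      /* block offset within the rgrp */
	uint32_t bi = 0, offset = rblock;

	if (offset >= first_cap) {
		/* Pad by the header-size difference so the short first
		 * bitmap can be treated as later_cap blocks wide too. */
		offset += (HDR_RGRP - HDR_META) * GFS2_NBBY;
		bi = offset / later_cap;
		offset -= bi * later_cap;
	}
	printf("bitmap %u, offset %u\n", bi, offset); /* bitmap 1, offset 3776 */
	return 0;
}
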
@@ -553,7 +557,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
553 */ 557 */
554int gfs2_rs_alloc(struct gfs2_inode *ip) 558int gfs2_rs_alloc(struct gfs2_inode *ip)
555{ 559{
556 int error = 0;
557 struct gfs2_blkreserv *res; 560 struct gfs2_blkreserv *res;
558 561
559 if (ip->i_res) 562 if (ip->i_res)
@@ -561,7 +564,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
561 564
562 res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); 565 res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
563 if (!res) 566 if (!res)
564 error = -ENOMEM; 567 return -ENOMEM;
565 568
566 RB_CLEAR_NODE(&res->rs_node); 569 RB_CLEAR_NODE(&res->rs_node);
567 570
@@ -571,7 +574,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
571 else 574 else
572 ip->i_res = res; 575 ip->i_res = res;
573 up_write(&ip->i_rw_mutex); 576 up_write(&ip->i_rw_mutex);
574 return error; 577 return 0;
575} 578}
576 579
577static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) 580static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
@@ -876,7 +879,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
876 goto fail; 879 goto fail;
877 880
878 rgd->rd_gl->gl_object = rgd; 881 rgd->rd_gl->gl_object = rgd;
879 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lvb; 882 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
880 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 883 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
881 if (rgd->rd_data > sdp->sd_max_rg_data) 884 if (rgd->rd_data > sdp->sd_max_rg_data)
882 sdp->sd_max_rg_data = rgd->rd_data; 885 sdp->sd_max_rg_data = rgd->rd_data;
@@ -1263,7 +1266,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1263 int ret = 0; 1266 int ret = 0;
1264 u64 amt; 1267 u64 amt;
1265 u64 trimmed = 0; 1268 u64 trimmed = 0;
1269 u64 start, end, minlen;
1266 unsigned int x; 1270 unsigned int x;
1271 unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1267 1272
1268 if (!capable(CAP_SYS_ADMIN)) 1273 if (!capable(CAP_SYS_ADMIN))
1269 return -EPERM; 1274 return -EPERM;
@@ -1271,19 +1276,25 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1271 if (!blk_queue_discard(q)) 1276 if (!blk_queue_discard(q))
1272 return -EOPNOTSUPP; 1277 return -EOPNOTSUPP;
1273 1278
1274 if (argp == NULL) { 1279 if (copy_from_user(&r, argp, sizeof(r)))
1275 r.start = 0;
1276 r.len = ULLONG_MAX;
1277 r.minlen = 0;
1278 } else if (copy_from_user(&r, argp, sizeof(r)))
1279 return -EFAULT; 1280 return -EFAULT;
1280 1281
1281 ret = gfs2_rindex_update(sdp); 1282 ret = gfs2_rindex_update(sdp);
1282 if (ret) 1283 if (ret)
1283 return ret; 1284 return ret;
1284 1285
1285 rgd = gfs2_blk2rgrpd(sdp, r.start, 0); 1286 start = r.start >> bs_shift;
1286 rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0); 1287 end = start + (r.len >> bs_shift);
1288 minlen = max_t(u64, r.minlen,
1289 q->limits.discard_granularity) >> bs_shift;
1290
1291 rgd = gfs2_blk2rgrpd(sdp, start, 0);
1292 rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
1293
1294 if (end <= start ||
1295 minlen > sdp->sd_max_rg_data ||
1296 start > rgd_end->rd_data0 + rgd_end->rd_data)
1297 return -EINVAL;
1287 1298
1288 while (1) { 1299 while (1) {
1289 1300
@@ -1295,7 +1306,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1295 /* Trim each bitmap in the rgrp */ 1306 /* Trim each bitmap in the rgrp */
1296 for (x = 0; x < rgd->rd_length; x++) { 1307 for (x = 0; x < rgd->rd_length; x++) {
1297 struct gfs2_bitmap *bi = rgd->rd_bits + x; 1308 struct gfs2_bitmap *bi = rgd->rd_bits + x;
1298 ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt); 1309 ret = gfs2_rgrp_send_discards(sdp,
1310 rgd->rd_data0, NULL, bi, minlen,
1311 &amt);
1299 if (ret) { 1312 if (ret) {
1300 gfs2_glock_dq_uninit(&gh); 1313 gfs2_glock_dq_uninit(&gh);
1301 goto out; 1314 goto out;
@@ -1324,7 +1337,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1324 1337
1325out: 1338out:
1326 r.len = trimmed << 9; 1339 r.len = trimmed << 9;
1327 if (argp && copy_to_user(argp, &r, sizeof(r))) 1340 if (copy_to_user(argp, &r, sizeof(r)))
1328 return -EFAULT; 1341 return -EFAULT;
1329 1342
1330 return ret; 1343 return ret;
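
With this change gfs2_fitrim() treats the FITRIM range as bytes, per the ioctl ABI, converting to filesystem blocks with the block-size shift and clamping minlen to the device's discard granularity; it also stops special-casing a NULL argp. A worked sketch of the conversion, with all input values assumed for illustration:

/* Sketch of the byte-to-block conversion now done in gfs2_fitrim();
 * the input values are assumptions. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned bs_shift = 12;            /* 4096-byte blocks */
	uint64_t r_start  = 1ULL << 20;    /* FITRIM range, in bytes */
	uint64_t r_len    = 64ULL << 20;
	uint64_t r_minlen = 4096;
	uint64_t granularity = 512;        /* q->limits.discard_granularity */

	uint64_t start  = r_start >> bs_shift;          /* 256 */
	uint64_t end    = start + (r_len >> bs_shift);  /* 256 + 16384 */
	uint64_t minlen = (r_minlen > granularity ?
			   r_minlen : granularity) >> bs_shift;  /* 1 */

	if (end <= start)
		return 1;       /* the kernel returns -EINVAL here */

	printf("trim blocks [%llu, %llu), minlen %llu blocks\n",
	       (unsigned long long)start, (unsigned long long)end,
	       (unsigned long long)minlen);
	return 0;
}
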
@@ -1669,13 +1682,105 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
1669 return; 1682 return;
1670} 1683}
1671 1684
1685/**
1686 * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1687 * @rgd: The rgrp in question
1688 * @loops: An indication of how picky we can be (0=very, 1=less so)
1689 *
1690 * This function uses the recently added glock statistics in order to
1691 * figure out whether a particular resource group is suffering from
1692 * contention from multiple nodes. This is done purely on the basis
1693 * of timings, since this is the only data we have to work with and
1694 * our aim here is to reject a resource group which is highly contended
1695 * but (very important) not to do this too often in order to ensure that
1696 * we do not end up introducing fragmentation by changing resource
1697 * groups when not actually required.
1698 *
1699 * The calculation is fairly simple, we want to know whether the SRTTB
1700 * (i.e. smoothed round trip time for blocking operations) to acquire
1701 * the lock for this rgrp's glock is significantly greater than the
1702 * time taken for resource groups on average. We introduce a margin in
1703 * the form of the variable @var which is computed as the sum of the two
1704 * respective variances, and multiplied by a factor depending on @loops
1705 * and whether we have a lot of data to base the decision on. This is
1706 * then tested against the square difference of the means in order to
1707 * decide whether the result is statistically significant or not.
1708 *
1709 * Returns: A boolean verdict on the congestion status
1710 */
1711
1712static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1713{
1714 const struct gfs2_glock *gl = rgd->rd_gl;
1715 const struct gfs2_sbd *sdp = gl->gl_sbd;
1716 struct gfs2_lkstats *st;
1717 s64 r_dcount, l_dcount;
1718 s64 r_srttb, l_srttb;
1719 s64 srttb_diff;
1720 s64 sqr_diff;
1721 s64 var;
1722
1723 preempt_disable();
1724 st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1725 r_srttb = st->stats[GFS2_LKS_SRTTB];
1726 r_dcount = st->stats[GFS2_LKS_DCOUNT];
1727 var = st->stats[GFS2_LKS_SRTTVARB] +
1728 gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1729 preempt_enable();
1730
1731 l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1732 l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1733
1734 if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0))
1735 return false;
1736
1737 srttb_diff = r_srttb - l_srttb;
1738 sqr_diff = srttb_diff * srttb_diff;
1739
1740 var *= 2;
1741 if (l_dcount < 8 || r_dcount < 8)
1742 var *= 2;
1743 if (loops == 1)
1744 var *= 2;
1745
1746 return ((srttb_diff < 0) && (sqr_diff > var));
1747}
1748
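
The decision rule documented above is a variance-gated significance test: the rgrp is rejected only when its own smoothed round-trip time exceeds the per-CPU average for rgrp glocks and the squared difference outgrows a margin built from the summed variances (doubled, and doubled again for sparse data or a second pass). A minimal standalone sketch of the same rule, with made-up sample values:

/* Minimal sketch of the decision rule in gfs2_rgrp_congested();
 * the inputs are made-up sample values, not real glock statistics. */
#include <stdbool.h>
#include <stdio.h>

static bool congested(long long l_srttb, long long l_var, long long l_dcount,
		      long long r_srttb, long long r_var, long long r_dcount,
		      int loops)
{
	long long diff, sqr_diff, var;

	if (l_dcount < 1 || r_dcount < 1 || r_srttb == 0)
		return false;               /* not enough data to judge */

	diff = r_srttb - l_srttb;           /* negative: rgrp slower than avg */
	sqr_diff = diff * diff;

	var = (l_var + r_var) * 2;
	if (l_dcount < 8 || r_dcount < 8)
		var *= 2;                   /* widen margin on sparse data */
	if (loops == 1)
		var *= 2;                   /* be less picky on a retry */

	return diff < 0 && sqr_diff > var;
}

int main(void)
{
	printf("%d\n", congested(900, 50, 20, 300, 40, 20, 0)); /* 1: congested */
	printf("%d\n", congested(310, 50, 20, 300, 40, 20, 0)); /* 0: just noise */
	return 0;
}
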
1749/**
1750 * gfs2_rgrp_used_recently
1751 * @rs: The block reservation with the rgrp to test
1752 * @msecs: The time limit in milliseconds
1753 *
1754 * Returns: True if the rgrp glock has been used within the time limit
1755 */
1756static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1757 u64 msecs)
1758{
1759 u64 tdiff;
1760
1761 tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1762 rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1763
1764 return tdiff > (msecs * 1000 * 1000);
1765}
1766
1767static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1768{
1769 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1770 u32 skip;
1771
1772 get_random_bytes(&skip, sizeof(skip));
1773 return skip % sdp->sd_rgrps;
1774}
1775
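
gfs2_orlov_skip() supplies the random element of the Orlov-style spreading enabled by GFS2_AF_ORLOV: a new top-level directory starts its rgrp search a random distance in, rather than next to its parent. A rough userspace illustration, where rand() stands in for get_random_bytes() and the rgrp count is an assumed value:

/* Rough userspace illustration of gfs2_orlov_skip(). */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
	unsigned sd_rgrps = 128;   /* assumed resource group count */
	unsigned skip;

	srand((unsigned)time(NULL));
	skip = (unsigned)rand() % sd_rgrps;

	/* gfs2_inplace_reserve() then passes over 'skip' unlocked rgrps
	 * before reserving, spreading new directories across the fs. */
	printf("skip %u rgrps\n", skip);
	return 0;
}
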
1672static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) 1776static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1673{ 1777{
1674 struct gfs2_rgrpd *rgd = *pos; 1778 struct gfs2_rgrpd *rgd = *pos;
1779 struct gfs2_sbd *sdp = rgd->rd_sbd;
1675 1780
1676 rgd = gfs2_rgrpd_get_next(rgd); 1781 rgd = gfs2_rgrpd_get_next(rgd);
1677 if (rgd == NULL) 1782 if (rgd == NULL)
1678 rgd = gfs2_rgrpd_get_next(NULL); 1783 rgd = gfs2_rgrpd_get_first(sdp);
1679 *pos = rgd; 1784 *pos = rgd;
1680 if (rgd != begin) /* If we didn't wrap */ 1785 if (rgd != begin) /* If we didn't wrap */
1681 return true; 1786 return true;
@@ -1690,14 +1795,15 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
1690 * Returns: errno 1795 * Returns: errno
1691 */ 1796 */
1692 1797
1693int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) 1798int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
1694{ 1799{
1695 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1800 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1696 struct gfs2_rgrpd *begin = NULL; 1801 struct gfs2_rgrpd *begin = NULL;
1697 struct gfs2_blkreserv *rs = ip->i_res; 1802 struct gfs2_blkreserv *rs = ip->i_res;
1698 int error = 0, rg_locked, flags = LM_FLAG_TRY; 1803 int error = 0, rg_locked, flags = 0;
1699 u64 last_unlinked = NO_BLOCK; 1804 u64 last_unlinked = NO_BLOCK;
1700 int loops = 0; 1805 int loops = 0;
1806 u32 skip = 0;
1701 1807
1702 if (sdp->sd_args.ar_rgrplvb) 1808 if (sdp->sd_args.ar_rgrplvb)
1703 flags |= GL_SKIP; 1809 flags |= GL_SKIP;
@@ -1711,6 +1817,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
1711 } else { 1817 } else {
1712 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); 1818 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
1713 } 1819 }
1820 if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV))
1821 skip = gfs2_orlov_skip(ip);
1714 if (rs->rs_rbm.rgd == NULL) 1822 if (rs->rs_rbm.rgd == NULL)
1715 return -EBADSLT; 1823 return -EBADSLT;
1716 1824
@@ -1719,13 +1827,20 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
1719 1827
1720 if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { 1828 if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
1721 rg_locked = 0; 1829 rg_locked = 0;
1830 if (skip && skip--)
1831 goto next_rgrp;
1832 if (!gfs2_rs_active(rs) && (loops < 2) &&
1833 gfs2_rgrp_used_recently(rs, 1000) &&
1834 gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
1835 goto next_rgrp;
1722 error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, 1836 error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
1723 LM_ST_EXCLUSIVE, flags, 1837 LM_ST_EXCLUSIVE, flags,
1724 &rs->rs_rgd_gh); 1838 &rs->rs_rgd_gh);
1725 if (error == GLR_TRYFAILED)
1726 goto next_rgrp;
1727 if (unlikely(error)) 1839 if (unlikely(error))
1728 return error; 1840 return error;
1841 if (!gfs2_rs_active(rs) && (loops < 2) &&
1842 gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
1843 goto skip_rgrp;
1729 if (sdp->sd_args.ar_rgrplvb) { 1844 if (sdp->sd_args.ar_rgrplvb) {
1730 error = update_rgrp_lvb(rs->rs_rbm.rgd); 1845 error = update_rgrp_lvb(rs->rs_rbm.rgd);
1731 if (unlikely(error)) { 1846 if (unlikely(error)) {
@@ -1772,12 +1887,13 @@ next_rgrp:
1772 /* Find the next rgrp, and continue looking */ 1887 /* Find the next rgrp, and continue looking */
1773 if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) 1888 if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
1774 continue; 1889 continue;
1890 if (skip)
1891 continue;
1775 1892
1776 /* If we've scanned all the rgrps, but found no free blocks 1893 /* If we've scanned all the rgrps, but found no free blocks
1777 * then this checks for some less likely conditions before 1894 * then this checks for some less likely conditions before
1778 * trying again. 1895 * trying again.
1779 */ 1896 */
1780 flags &= ~LM_FLAG_TRY;
1781 loops++; 1897 loops++;
1782 /* Check that fs hasn't grown if writing to rindex */ 1898 /* Check that fs hasn't grown if writing to rindex */
1783 if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { 1899 if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 24077958dcf6..842185853f6b 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,7 +39,8 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
39 39
40extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); 40extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
41 41
42extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested); 42#define GFS2_AF_ORLOV 1
43extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags);
43extern void gfs2_inplace_release(struct gfs2_inode *ip); 44extern void gfs2_inplace_release(struct gfs2_inode *ip);
44 45
45extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, 46extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index bc737261f234..d6488674d916 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -810,7 +810,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
810 return; 810 return;
811 } 811 }
812 need_unlock = 1; 812 need_unlock = 1;
813 } 813 } else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
814 return;
814 815
815 if (current->journal_info == NULL) { 816 if (current->journal_info == NULL) {
816 ret = gfs2_trans_begin(sdp, RES_DINODE, 0); 817 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index bbdc78af60ca..2ee13e841e9f 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -486,7 +486,7 @@ TRACE_EVENT(gfs2_block_alloc,
486 ), 486 ),
487 487
488 TP_fast_assign( 488 TP_fast_assign(
489 __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; 489 __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev;
490 __entry->start = block; 490 __entry->start = block;
491 __entry->inum = ip->i_no_addr; 491 __entry->inum = ip->i_no_addr;
492 __entry->len = len; 492 __entry->len = len;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index adbd27875ef9..413627072f36 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -155,14 +155,22 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
155 struct gfs2_sbd *sdp = gl->gl_sbd; 155 struct gfs2_sbd *sdp = gl->gl_sbd;
156 struct gfs2_bufdata *bd; 156 struct gfs2_bufdata *bd;
157 157
158 lock_buffer(bh);
159 gfs2_log_lock(sdp);
158 bd = bh->b_private; 160 bd = bh->b_private;
159 if (bd) 161 if (bd)
160 gfs2_assert(sdp, bd->bd_gl == gl); 162 gfs2_assert(sdp, bd->bd_gl == gl);
161 else { 163 else {
164 gfs2_log_unlock(sdp);
165 unlock_buffer(bh);
162 gfs2_attach_bufdata(gl, bh, meta); 166 gfs2_attach_bufdata(gl, bh, meta);
163 bd = bh->b_private; 167 bd = bh->b_private;
168 lock_buffer(bh);
169 gfs2_log_lock(sdp);
164 } 170 }
165 lops_add(sdp, bd); 171 lops_add(sdp, bd);
172 gfs2_log_unlock(sdp);
173 unlock_buffer(bh);
166} 174}
167 175
168void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) 176void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index db330e5518cd..76c144b3c9bb 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
734 if (error) 734 if (error)
735 return error; 735 return error;
736 736
737 error = gfs2_inplace_reserve(ip, blks); 737 error = gfs2_inplace_reserve(ip, blks, 0);
738 if (error) 738 if (error)
739 goto out_gunlock_q; 739 goto out_gunlock_q;
740 740
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 78f21f8dc2ec..43b315f2002b 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
710 struct vfsmount *proc_mnt; 710 struct vfsmount *proc_mnt;
711 int err = -ENOENT; 711 int err = -ENOENT;
712 712
713 proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); 713 proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt);
714 if (IS_ERR(proc_mnt)) 714 if (IS_ERR(proc_mnt))
715 goto out; 715 goto out;
716 716
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c5bc355d8243..78bde32ea951 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * hugetlbpage-backed filesystem. Based on ramfs. 2 * hugetlbpage-backed filesystem. Based on ramfs.
3 * 3 *
4 * William Irwin, 2002 4 * Nadia Yvette Chambers, 2002
5 * 5 *
6 * Copyright (C) 2002 Linus Torvalds. 6 * Copyright (C) 2002 Linus Torvalds.
7 */ 7 */
@@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
151{ 151{
152 struct mm_struct *mm = current->mm; 152 struct mm_struct *mm = current->mm;
153 struct vm_area_struct *vma; 153 struct vm_area_struct *vma;
154 unsigned long start_addr;
155 struct hstate *h = hstate_file(file); 154 struct hstate *h = hstate_file(file);
155 struct vm_unmapped_area_info info;
156 156
157 if (len & ~huge_page_mask(h)) 157 if (len & ~huge_page_mask(h))
158 return -EINVAL; 158 return -EINVAL;
@@ -173,39 +173,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
173 return addr; 173 return addr;
174 } 174 }
175 175
176 if (len > mm->cached_hole_size) 176 info.flags = 0;
177 start_addr = mm->free_area_cache; 177 info.length = len;
178 else { 178 info.low_limit = TASK_UNMAPPED_BASE;
179 start_addr = TASK_UNMAPPED_BASE; 179 info.high_limit = TASK_SIZE;
180 mm->cached_hole_size = 0; 180 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
181 } 181 info.align_offset = 0;
182 182 return vm_unmapped_area(&info);
183full_search:
184 addr = ALIGN(start_addr, huge_page_size(h));
185
186 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
187 /* At this point: (!vma || addr < vma->vm_end). */
188 if (TASK_SIZE - len < addr) {
189 /*
190 * Start a new search - just in case we missed
191 * some holes.
192 */
193 if (start_addr != TASK_UNMAPPED_BASE) {
194 start_addr = TASK_UNMAPPED_BASE;
195 mm->cached_hole_size = 0;
196 goto full_search;
197 }
198 return -ENOMEM;
199 }
200
201 if (!vma || addr + len <= vma->vm_start) {
202 mm->free_area_cache = addr + len;
203 return addr;
204 }
205 if (addr + mm->cached_hole_size < vma->vm_start)
206 mm->cached_hole_size = vma->vm_start - addr;
207 addr = ALIGN(vma->vm_end, huge_page_size(h));
208 }
209} 183}
210#endif 184#endif
211 185
@@ -608,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
608 int rc; 582 int rc;
609 583
610 rc = migrate_huge_page_move_mapping(mapping, newpage, page); 584 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
611 if (rc) 585 if (rc != MIGRATEPAGE_SUCCESS)
612 return rc; 586 return rc;
613 migrate_page_copy(newpage, page); 587 migrate_page_copy(newpage, page);
614 588
615 return 0; 589 return MIGRATEPAGE_SUCCESS;
616} 590}
617 591
618static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 592static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -923,7 +897,7 @@ static struct file_system_type hugetlbfs_fs_type = {
923 .kill_sb = kill_litter_super, 897 .kill_sb = kill_litter_super,
924}; 898};
925 899
926static struct vfsmount *hugetlbfs_vfsmount; 900static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
927 901
928static int can_do_hugetlb_shm(void) 902static int can_do_hugetlb_shm(void)
929{ 903{
@@ -932,9 +906,22 @@ static int can_do_hugetlb_shm(void)
932 return capable(CAP_IPC_LOCK) || in_group_p(shm_group); 906 return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
933} 907}
934 908
909static int get_hstate_idx(int page_size_log)
910{
911 struct hstate *h;
912
913 if (!page_size_log)
914 return default_hstate_idx;
915 h = size_to_hstate(1 << page_size_log);
916 if (!h)
917 return -1;
918 return h - hstates;
919}
920
935struct file *hugetlb_file_setup(const char *name, unsigned long addr, 921struct file *hugetlb_file_setup(const char *name, unsigned long addr,
936 size_t size, vm_flags_t acctflag, 922 size_t size, vm_flags_t acctflag,
937 struct user_struct **user, int creat_flags) 923 struct user_struct **user,
924 int creat_flags, int page_size_log)
938{ 925{
939 int error = -ENOMEM; 926 int error = -ENOMEM;
940 struct file *file; 927 struct file *file;
@@ -944,9 +931,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
944 struct qstr quick_string; 931 struct qstr quick_string;
945 struct hstate *hstate; 932 struct hstate *hstate;
946 unsigned long num_pages; 933 unsigned long num_pages;
934 int hstate_idx;
935
936 hstate_idx = get_hstate_idx(page_size_log);
937 if (hstate_idx < 0)
938 return ERR_PTR(-ENODEV);
947 939
948 *user = NULL; 940 *user = NULL;
949 if (!hugetlbfs_vfsmount) 941 if (!hugetlbfs_vfsmount[hstate_idx])
950 return ERR_PTR(-ENOENT); 942 return ERR_PTR(-ENOENT);
951 943
952 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 944 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
@@ -963,7 +955,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
963 } 955 }
964 } 956 }
965 957
966 root = hugetlbfs_vfsmount->mnt_root; 958 root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
967 quick_string.name = name; 959 quick_string.name = name;
968 quick_string.len = strlen(quick_string.name); 960 quick_string.len = strlen(quick_string.name);
969 quick_string.hash = 0; 961 quick_string.hash = 0;
@@ -971,7 +963,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
971 if (!path.dentry) 963 if (!path.dentry)
972 goto out_shm_unlock; 964 goto out_shm_unlock;
973 965
974 path.mnt = mntget(hugetlbfs_vfsmount); 966 path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
975 error = -ENOSPC; 967 error = -ENOSPC;
976 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); 968 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
977 if (!inode) 969 if (!inode)
@@ -1011,8 +1003,9 @@ out_shm_unlock:
1011 1003
1012static int __init init_hugetlbfs_fs(void) 1004static int __init init_hugetlbfs_fs(void)
1013{ 1005{
1006 struct hstate *h;
1014 int error; 1007 int error;
1015 struct vfsmount *vfsmount; 1008 int i;
1016 1009
1017 error = bdi_init(&hugetlbfs_backing_dev_info); 1010 error = bdi_init(&hugetlbfs_backing_dev_info);
1018 if (error) 1011 if (error)
@@ -1029,14 +1022,26 @@ static int __init init_hugetlbfs_fs(void)
1029 if (error) 1022 if (error)
1030 goto out; 1023 goto out;
1031 1024
1032 vfsmount = kern_mount(&hugetlbfs_fs_type); 1025 i = 0;
1026 for_each_hstate(h) {
1027 char buf[50];
1028 unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
1033 1029
1034 if (!IS_ERR(vfsmount)) { 1030 snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
1035 hugetlbfs_vfsmount = vfsmount; 1031 hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
1036 return 0; 1032 buf);
1037 }
1038 1033
1039 error = PTR_ERR(vfsmount); 1034 if (IS_ERR(hugetlbfs_vfsmount[i])) {
1035 pr_err("hugetlb: Cannot mount internal hugetlbfs for "
1036 "page size %uK", ps_kb);
1037 error = PTR_ERR(hugetlbfs_vfsmount[i]);
1038 hugetlbfs_vfsmount[i] = NULL;
1039 }
1040 i++;
1041 }
1042 /* Non default hstates are optional */
1043 if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
1044 return 0;
1040 1045
1041 out: 1046 out:
1042 kmem_cache_destroy(hugetlbfs_inode_cachep); 1047 kmem_cache_destroy(hugetlbfs_inode_cachep);
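
Two small pieces of arithmetic drive the per-size support: hugetlb_file_setup() maps a caller-supplied log2 page size to an hstate (zero keeps the default), and init_hugetlbfs_fs() derives each mount's "pagesize=%uK" data from the hstate order. A standalone sketch of both, where PAGE_SHIFT, the orders, and the requested log2 size are assumptions for a common x86-64 configuration:

/* Sketch of the size arithmetic behind the per-hstate mounts;
 * PAGE_SHIFT, the orders and page_size_log are x86-64 assumptions. */
#include <stdio.h>

#define PAGE_SHIFT 12                      /* 4 KiB base pages */

int main(void)
{
	unsigned orders[] = { 9, 18 };     /* 2 MiB and 1 GiB hstates */
	int page_size_log = 21;            /* caller asks for 2 MiB pages */
	unsigned i;

	/* hugetlb_file_setup(): log2 size -> byte size looked up via
	 * size_to_hstate(); an unknown size yields -ENODEV. */
	printf("requested size: %lu bytes\n", 1UL << page_size_log);

	/* init_hugetlbfs_fs(): hstate order -> mount data string */
	for (i = 0; i < sizeof(orders) / sizeof(orders[0]); i++) {
		unsigned ps_kb = 1U << (orders[i] + PAGE_SHIFT - 10);
		printf("pagesize=%uK\n", ps_kb);  /* 2048K, 1048576K */
	}
	return 0;
}
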
@@ -1047,13 +1052,19 @@ static int __init init_hugetlbfs_fs(void)
1047 1052
1048static void __exit exit_hugetlbfs_fs(void) 1053static void __exit exit_hugetlbfs_fs(void)
1049{ 1054{
1055 struct hstate *h;
1056 int i;
1057
1058
1050 /* 1059 /*
1051 * Make sure all delayed rcu free inodes are flushed before we 1060 * Make sure all delayed rcu free inodes are flushed before we
1052 * destroy cache. 1061 * destroy cache.
1053 */ 1062 */
1054 rcu_barrier(); 1063 rcu_barrier();
1055 kmem_cache_destroy(hugetlbfs_inode_cachep); 1064 kmem_cache_destroy(hugetlbfs_inode_cachep);
1056 kern_unmount(hugetlbfs_vfsmount); 1065 i = 0;
1066 for_each_hstate(h)
1067 kern_unmount(hugetlbfs_vfsmount[i++]);
1057 unregister_filesystem(&hugetlbfs_fs_type); 1068 unregister_filesystem(&hugetlbfs_fs_type);
1058 bdi_destroy(&hugetlbfs_backing_dev_info); 1069 bdi_destroy(&hugetlbfs_backing_dev_info);
1059} 1070}
diff --git a/fs/inode.c b/fs/inode.c
index b03c71957246..14084b72b259 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
165 mapping->host = inode; 165 mapping->host = inode;
166 mapping->flags = 0; 166 mapping->flags = 0;
167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
168 mapping->assoc_mapping = NULL; 168 mapping->private_data = NULL;
169 mapping->backing_dev_info = &default_backing_dev_info; 169 mapping->backing_dev_info = &default_backing_dev_info;
170 mapping->writeback_index = 0; 170 mapping->writeback_index = 0;
171 171
@@ -408,6 +408,19 @@ static void inode_lru_list_add(struct inode *inode)
408 spin_unlock(&inode->i_sb->s_inode_lru_lock); 408 spin_unlock(&inode->i_sb->s_inode_lru_lock);
409} 409}
410 410
411/*
412 * Add inode to LRU if needed (inode is unused and clean).
413 *
414 * Needs inode->i_lock held.
415 */
416void inode_add_lru(struct inode *inode)
417{
418 if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
419 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
420 inode_lru_list_add(inode);
421}
422
423
411static void inode_lru_list_del(struct inode *inode) 424static void inode_lru_list_del(struct inode *inode)
412{ 425{
413 spin_lock(&inode->i_sb->s_inode_lru_lock); 426 spin_lock(&inode->i_sb->s_inode_lru_lock);
@@ -1390,8 +1403,7 @@ static void iput_final(struct inode *inode)
1390 1403
1391 if (!drop && (sb->s_flags & MS_ACTIVE)) { 1404 if (!drop && (sb->s_flags & MS_ACTIVE)) {
1392 inode->i_state |= I_REFERENCED; 1405 inode->i_state |= I_REFERENCED;
1393 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 1406 inode_add_lru(inode);
1394 inode_lru_list_add(inode);
1395 spin_unlock(&inode->i_lock); 1407 spin_unlock(&inode->i_lock);
1396 return; 1408 return;
1397 } 1409 }
diff --git a/fs/internal.h b/fs/internal.h
index 916b7cbf3e3e..2f6af7f645eb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -110,6 +110,7 @@ extern int open_check_o_direct(struct file *f);
110 * inode.c 110 * inode.c
111 */ 111 */
112extern spinlock_t inode_sb_list_lock; 112extern spinlock_t inode_sb_list_lock;
113extern void inode_add_lru(struct inode *inode);
113 114
114/* 115/*
115 * fs-writeback.c 116 * fs-writeback.c
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 78b7f84241d4..071d6905f0dd 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1259,7 +1259,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
1259 goto not_jbd; 1259 goto not_jbd;
1260 } 1260 }
1261 1261
1262 /* keep track of wether or not this transaction modified us */ 1262 /* keep track of whether or not this transaction modified us */
1263 was_modified = jh->b_modified; 1263 was_modified = jh->b_modified;
1264 1264
1265 /* 1265 /*
@@ -1961,7 +1961,9 @@ retry:
1961 spin_unlock(&journal->j_list_lock); 1961 spin_unlock(&journal->j_list_lock);
1962 jbd_unlock_bh_state(bh); 1962 jbd_unlock_bh_state(bh);
1963 spin_unlock(&journal->j_state_lock); 1963 spin_unlock(&journal->j_state_lock);
1964 unlock_buffer(bh);
1964 log_wait_commit(journal, tid); 1965 log_wait_commit(journal, tid);
1966 lock_buffer(bh);
1965 goto retry; 1967 goto retry;
1966 } 1968 }
1967 /* 1969 /*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 484b8d1c6cb6..dbf41f9452db 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -60,7 +60,6 @@ EXPORT_SYMBOL(jbd2_journal_get_create_access);
60EXPORT_SYMBOL(jbd2_journal_get_undo_access); 60EXPORT_SYMBOL(jbd2_journal_get_undo_access);
61EXPORT_SYMBOL(jbd2_journal_set_triggers); 61EXPORT_SYMBOL(jbd2_journal_set_triggers);
62EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 62EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
63EXPORT_SYMBOL(jbd2_journal_release_buffer);
64EXPORT_SYMBOL(jbd2_journal_forget); 63EXPORT_SYMBOL(jbd2_journal_forget);
65#if 0 64#if 0
66EXPORT_SYMBOL(journal_sync_buffer); 65EXPORT_SYMBOL(journal_sync_buffer);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a74ba4659549..42f6615af0ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1207,17 +1207,6 @@ out:
1207 return ret; 1207 return ret;
1208} 1208}
1209 1209
1210/*
1211 * jbd2_journal_release_buffer: undo a get_write_access without any buffer
1212 * updates, if the update decided in the end that it didn't need access.
1213 *
1214 */
1215void
1216jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1217{
1218 BUFFER_TRACE(bh, "entry");
1219}
1220
1221/** 1210/**
1222 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. 1211 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1223 * @handle: transaction handle 1212 * @handle: transaction handle
@@ -1261,7 +1250,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1261 goto not_jbd; 1250 goto not_jbd;
1262 } 1251 }
1263 1252
1264 /* keep track of wether or not this transaction modified us */ 1253 /* keep track of whether or not this transaction modified us */
1265 was_modified = jh->b_modified; 1254 was_modified = jh->b_modified;
1266 1255
1267 /* 1256 /*
diff --git a/fs/libfs.c b/fs/libfs.c
index 7cc37ca19cd8..35fc6e74cd88 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -81,11 +81,11 @@ int dcache_dir_close(struct inode *inode, struct file *file)
81 return 0; 81 return 0;
82} 82}
83 83
84loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) 84loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
85{ 85{
86 struct dentry *dentry = file->f_path.dentry; 86 struct dentry *dentry = file->f_path.dentry;
87 mutex_lock(&dentry->d_inode->i_mutex); 87 mutex_lock(&dentry->d_inode->i_mutex);
88 switch (origin) { 88 switch (whence) {
89 case 1: 89 case 1:
90 offset += file->f_pos; 90 offset += file->f_pos;
91 case 0: 91 case 0:
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 13ad1539fbf2..00ec0b9c94d1 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -64,10 +64,6 @@ static void nlm4_compute_offsets(const struct nlm_lock *lock,
64{ 64{
65 const struct file_lock *fl = &lock->fl; 65 const struct file_lock *fl = &lock->fl;
66 66
67 BUG_ON(fl->fl_start > NLM4_OFFSET_MAX);
68 BUG_ON(fl->fl_end > NLM4_OFFSET_MAX &&
69 fl->fl_end != OFFSET_MAX);
70
71 *l_offset = loff_t_to_s64(fl->fl_start); 67 *l_offset = loff_t_to_s64(fl->fl_start);
72 if (fl->fl_end == OFFSET_MAX) 68 if (fl->fl_end == OFFSET_MAX)
73 *l_len = 0; 69 *l_len = 0;
@@ -122,7 +118,6 @@ static void encode_netobj(struct xdr_stream *xdr,
122{ 118{
123 __be32 *p; 119 __be32 *p;
124 120
125 BUG_ON(length > XDR_MAX_NETOBJ);
126 p = xdr_reserve_space(xdr, 4 + length); 121 p = xdr_reserve_space(xdr, 4 + length);
127 xdr_encode_opaque(p, data, length); 122 xdr_encode_opaque(p, data, length);
128} 123}
@@ -156,7 +151,6 @@ out_overflow:
156static void encode_cookie(struct xdr_stream *xdr, 151static void encode_cookie(struct xdr_stream *xdr,
157 const struct nlm_cookie *cookie) 152 const struct nlm_cookie *cookie)
158{ 153{
159 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
160 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); 154 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
161} 155}
162 156
@@ -198,7 +192,6 @@ out_overflow:
198 */ 192 */
199static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) 193static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
200{ 194{
201 BUG_ON(fh->size > NFS3_FHSIZE);
202 encode_netobj(xdr, (u8 *)&fh->data, fh->size); 195 encode_netobj(xdr, (u8 *)&fh->data, fh->size);
203} 196}
204 197
@@ -336,7 +329,6 @@ static void encode_caller_name(struct xdr_stream *xdr, const char *name)
336 u32 length = strlen(name); 329 u32 length = strlen(name);
337 __be32 *p; 330 __be32 *p;
338 331
339 BUG_ON(length > NLM_MAXSTRLEN);
340 p = xdr_reserve_space(xdr, 4 + length); 332 p = xdr_reserve_space(xdr, 4 + length);
341 xdr_encode_opaque(p, name, length); 333 xdr_encode_opaque(p, name, length);
342} 334}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 05d29124c6ab..54f9e6ce0430 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -141,7 +141,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
141 141
142static void nlmclnt_release_lockargs(struct nlm_rqst *req) 142static void nlmclnt_release_lockargs(struct nlm_rqst *req)
143{ 143{
144 BUG_ON(req->a_args.lock.fl.fl_ops != NULL); 144 WARN_ON_ONCE(req->a_args.lock.fl.fl_ops != NULL);
145} 145}
146 146
147/** 147/**
@@ -465,7 +465,6 @@ static const struct file_lock_operations nlmclnt_lock_ops = {
465 465
466static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host) 466static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host)
467{ 467{
468 BUG_ON(fl->fl_ops != NULL);
469 fl->fl_u.nfs_fl.state = 0; 468 fl->fl_u.nfs_fl.state = 0;
470 fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner); 469 fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner);
471 INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list); 470 INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list);
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 982d2676e1f8..9a55797a1cd4 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -60,10 +60,6 @@ static void nlm_compute_offsets(const struct nlm_lock *lock,
60{ 60{
61 const struct file_lock *fl = &lock->fl; 61 const struct file_lock *fl = &lock->fl;
62 62
63 BUG_ON(fl->fl_start > NLM_OFFSET_MAX);
64 BUG_ON(fl->fl_end > NLM_OFFSET_MAX &&
65 fl->fl_end != OFFSET_MAX);
66
67 *l_offset = loff_t_to_s32(fl->fl_start); 63 *l_offset = loff_t_to_s32(fl->fl_start);
68 if (fl->fl_end == OFFSET_MAX) 64 if (fl->fl_end == OFFSET_MAX)
69 *l_len = 0; 65 *l_len = 0;
@@ -119,7 +115,6 @@ static void encode_netobj(struct xdr_stream *xdr,
119{ 115{
120 __be32 *p; 116 __be32 *p;
121 117
122 BUG_ON(length > XDR_MAX_NETOBJ);
123 p = xdr_reserve_space(xdr, 4 + length); 118 p = xdr_reserve_space(xdr, 4 + length);
124 xdr_encode_opaque(p, data, length); 119 xdr_encode_opaque(p, data, length);
125} 120}
@@ -153,7 +148,6 @@ out_overflow:
153static void encode_cookie(struct xdr_stream *xdr, 148static void encode_cookie(struct xdr_stream *xdr,
154 const struct nlm_cookie *cookie) 149 const struct nlm_cookie *cookie)
155{ 150{
156 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
157 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); 151 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
158} 152}
159 153
@@ -195,7 +189,6 @@ out_overflow:
195 */ 189 */
196static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) 190static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
197{ 191{
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE); 192 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE);
200} 193}
201 194
@@ -330,7 +323,6 @@ static void encode_caller_name(struct xdr_stream *xdr, const char *name)
330 u32 length = strlen(name); 323 u32 length = strlen(name);
331 __be32 *p; 324 __be32 *p;
332 325
333 BUG_ON(length > NLM_MAXSTRLEN);
334 p = xdr_reserve_space(xdr, 4 + length); 326 p = xdr_reserve_space(xdr, 4 + length);
335 xdr_encode_opaque(p, name, length); 327 xdr_encode_opaque(p, name, length);
336} 328}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index f9b22e58f78f..0e17090c310f 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -177,9 +177,6 @@ static void nlm_destroy_host_locked(struct nlm_host *host)
177 177
178 dprintk("lockd: destroy host %s\n", host->h_name); 178 dprintk("lockd: destroy host %s\n", host->h_name);
179 179
180 BUG_ON(!list_empty(&host->h_lockowners));
181 BUG_ON(atomic_read(&host->h_count));
182
183 hlist_del_init(&host->h_hash); 180 hlist_del_init(&host->h_hash);
184 181
185 nsm_unmonitor(host); 182 nsm_unmonitor(host);
@@ -289,13 +286,12 @@ void nlmclnt_release_host(struct nlm_host *host)
289 286
290 dprintk("lockd: release client host %s\n", host->h_name); 287 dprintk("lockd: release client host %s\n", host->h_name);
291 288
292 BUG_ON(atomic_read(&host->h_count) < 0); 289 WARN_ON_ONCE(host->h_server);
293 BUG_ON(host->h_server);
294 290
295 if (atomic_dec_and_test(&host->h_count)) { 291 if (atomic_dec_and_test(&host->h_count)) {
296 BUG_ON(!list_empty(&host->h_lockowners)); 292 WARN_ON_ONCE(!list_empty(&host->h_lockowners));
297 BUG_ON(!list_empty(&host->h_granted)); 293 WARN_ON_ONCE(!list_empty(&host->h_granted));
298 BUG_ON(!list_empty(&host->h_reclaim)); 294 WARN_ON_ONCE(!list_empty(&host->h_reclaim));
299 295
300 mutex_lock(&nlm_host_mutex); 296 mutex_lock(&nlm_host_mutex);
301 nlm_destroy_host_locked(host); 297 nlm_destroy_host_locked(host);
@@ -412,8 +408,7 @@ void nlmsvc_release_host(struct nlm_host *host)
412 408
413 dprintk("lockd: release server host %s\n", host->h_name); 409 dprintk("lockd: release server host %s\n", host->h_name);
414 410
415 BUG_ON(atomic_read(&host->h_count) < 0); 411 WARN_ON_ONCE(!host->h_server);
416 BUG_ON(!host->h_server);
417 atomic_dec(&host->h_count); 412 atomic_dec(&host->h_count);
418} 413}
419 414
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3d7e09bcc0e9..3c2cfc683631 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -154,8 +154,6 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
154 .rpc_resp = res, 154 .rpc_resp = res,
155 }; 155 };
156 156
157 BUG_ON(clnt == NULL);
158
159 memset(res, 0, sizeof(*res)); 157 memset(res, 0, sizeof(*res));
160 158
161 msg.rpc_proc = &clnt->cl_procinfo[proc]; 159 msg.rpc_proc = &clnt->cl_procinfo[proc];
@@ -466,7 +464,6 @@ static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
466 const u32 len = strlen(string); 464 const u32 len = strlen(string);
467 __be32 *p; 465 __be32 *p;
468 466
469 BUG_ON(len > SM_MAXSTRLEN);
470 p = xdr_reserve_space(xdr, 4 + len); 467 p = xdr_reserve_space(xdr, 4 + len);
471 xdr_encode_opaque(p, string, len); 468 xdr_encode_opaque(p, string, len);
472} 469}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index adb90116d36b..af49e2d6941a 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -33,7 +33,7 @@
33 * are being written out - and waiting for GC to make progress, naturally. 33 * are being written out - and waiting for GC to make progress, naturally.
34 * 34 *
35 * So we cannot just call iget() or some variant of it, but first have to check 35 * So we cannot just call iget() or some variant of it, but first have to check
36 * wether the inode in question might be in I_FREEING state. Therefore we 36 * whether the inode in question might be in I_FREEING state. Therefore we
37 * maintain our own per-sb list of "almost deleted" inodes and check against 37 * maintain our own per-sb list of "almost deleted" inodes and check against
38 * that list first. Normally this should be at most 1-2 entries long. 38 * that list first. Normally this should be at most 1-2 entries long.
39 * 39 *
diff --git a/fs/mount.h b/fs/mount.h
index 4f291f9de641..cd5007980400 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,8 +4,11 @@
4 4
5struct mnt_namespace { 5struct mnt_namespace {
6 atomic_t count; 6 atomic_t count;
7 unsigned int proc_inum;
7 struct mount * root; 8 struct mount * root;
8 struct list_head list; 9 struct list_head list;
10 struct user_namespace *user_ns;
11 u64 seq; /* Sequence number to prevent loops */
9 wait_queue_head_t poll; 12 wait_queue_head_t poll;
10 int event; 13 int event;
11}; 14};
diff --git a/fs/namei.c b/fs/namei.c
index 937f9d50c84b..5f4cdf3ad913 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2131,6 +2131,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2131 if (!len) 2131 if (!len)
2132 return ERR_PTR(-EACCES); 2132 return ERR_PTR(-EACCES);
2133 2133
2134 if (unlikely(name[0] == '.')) {
2135 if (len < 2 || (len == 2 && name[1] == '.'))
2136 return ERR_PTR(-EACCES);
2137 }
2138
2134 while (len--) { 2139 while (len--) {
2135 c = *(const unsigned char *)name++; 2140 c = *(const unsigned char *)name++;
2136 if (c == '/' || c == '\0') 2141 if (c == '/' || c == '\0')
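
The added check rejects "." and ".." up front, since lookup_one_len() bypasses the normal path walk that would otherwise handle them; names that merely begin with a dot are still allowed. A standalone reimplementation of the predicate, for illustration only:

/* Standalone reimplementation of the new dot-name check. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool rejected(const char *name)
{
	size_t len = strlen(name);

	/* Only "." (len 1) and ".." (len 2) match; ".foo" does not. */
	return name[0] == '.' && (len < 2 || (len == 2 && name[1] == '.'));
}

int main(void)
{
	printf("%d %d %d\n", rejected("."), rejected(".."), rejected(".foo"));
	/* prints: 1 1 0 */
	return 0;
}
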
diff --git a/fs/namespace.c b/fs/namespace.c
index 24960626bb6b..398a50ff2438 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -12,6 +12,7 @@
12#include <linux/export.h> 12#include <linux/export.h>
13#include <linux/capability.h> 13#include <linux/capability.h>
14#include <linux/mnt_namespace.h> 14#include <linux/mnt_namespace.h>
15#include <linux/user_namespace.h>
15#include <linux/namei.h> 16#include <linux/namei.h>
16#include <linux/security.h> 17#include <linux/security.h>
17#include <linux/idr.h> 18#include <linux/idr.h>
@@ -20,6 +21,7 @@
20#include <linux/fs_struct.h> /* get_fs_root et.al. */ 21#include <linux/fs_struct.h> /* get_fs_root et.al. */
21#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
22#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/proc_fs.h>
23#include "pnode.h" 25#include "pnode.h"
24#include "internal.h" 26#include "internal.h"
25 27
@@ -784,7 +786,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
784 if (!mnt) 786 if (!mnt)
785 return ERR_PTR(-ENOMEM); 787 return ERR_PTR(-ENOMEM);
786 788
787 if (flag & (CL_SLAVE | CL_PRIVATE)) 789 if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
788 mnt->mnt_group_id = 0; /* not a peer of original */ 790 mnt->mnt_group_id = 0; /* not a peer of original */
789 else 791 else
790 mnt->mnt_group_id = old->mnt_group_id; 792 mnt->mnt_group_id = old->mnt_group_id;
@@ -805,7 +807,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
805 list_add_tail(&mnt->mnt_instance, &sb->s_mounts); 807 list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
806 br_write_unlock(&vfsmount_lock); 808 br_write_unlock(&vfsmount_lock);
807 809
808 if (flag & CL_SLAVE) { 810 if ((flag & CL_SLAVE) ||
811 ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
809 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 812 list_add(&mnt->mnt_slave, &old->mnt_slave_list);
810 mnt->mnt_master = old; 813 mnt->mnt_master = old;
811 CLEAR_MNT_SHARED(mnt); 814 CLEAR_MNT_SHARED(mnt);
@@ -1266,7 +1269,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1266 goto dput_and_out; 1269 goto dput_and_out;
1267 1270
1268 retval = -EPERM; 1271 retval = -EPERM;
1269 if (!capable(CAP_SYS_ADMIN)) 1272 if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
1270 goto dput_and_out; 1273 goto dput_and_out;
1271 1274
1272 retval = do_umount(mnt, flags); 1275 retval = do_umount(mnt, flags);
@@ -1292,7 +1295,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
1292 1295
1293static int mount_is_safe(struct path *path) 1296static int mount_is_safe(struct path *path)
1294{ 1297{
1295 if (capable(CAP_SYS_ADMIN)) 1298 if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
1296 return 0; 1299 return 0;
1297 return -EPERM; 1300 return -EPERM;
1298#ifdef notyet 1301#ifdef notyet
@@ -1308,6 +1311,26 @@ static int mount_is_safe(struct path *path)
1308#endif 1311#endif
1309} 1312}
1310 1313
1314static bool mnt_ns_loop(struct path *path)
1315{
1316 /* Could bind mounting the mount namespace inode cause a
1317 * mount namespace loop?
1318 */
1319 struct inode *inode = path->dentry->d_inode;
1320 struct proc_inode *ei;
1321 struct mnt_namespace *mnt_ns;
1322
1323 if (!proc_ns_inode(inode))
1324 return false;
1325
1326 ei = PROC_I(inode);
1327 if (ei->ns_ops != &mntns_operations)
1328 return false;
1329
1330 mnt_ns = ei->ns;
1331 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1332}
1333
1311struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, 1334struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1312 int flag) 1335 int flag)
1313{ 1336{
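
mnt_ns_loop() enforces a simple ordering rule: a bind mount of a /proc/*/ns/mnt reference is refused (-EINVAL, see the do_loopback() hunk below) whenever the current namespace's sequence number is greater than or equal to that of the namespace being bound. A namespace can therefore only ever hold references to strictly newer namespaces, and reference-counting cycles cannot form. A tiny model of the rule (user-space sketch, names hypothetical):

#include <stdio.h>
#include <stdint.h>

/* Model of the mnt_ns_loop() ordering rule: references may only
 * point at strictly newer namespaces, which keeps the "holds a
 * reference to" graph acyclic. */
struct mnt_ns_model { uint64_t seq; };

static int bind_would_loop(const struct mnt_ns_model *current_ns,
			   const struct mnt_ns_model *target_ns)
{
	return current_ns->seq >= target_ns->seq;	/* -EINVAL case */
}

int main(void)
{
	struct mnt_ns_model older = { .seq = 5 }, newer = { .seq = 9 };

	printf("bind newer ns into older: %s\n",
	       bind_would_loop(&older, &newer) ? "loop" : "ok");
	printf("bind older ns into newer: %s\n",
	       bind_would_loop(&newer, &older) ? "loop" : "ok");
	return 0;
}
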
@@ -1610,7 +1633,7 @@ static int do_change_type(struct path *path, int flag)
1610 int type; 1633 int type;
1611 int err = 0; 1634 int err = 0;
1612 1635
1613 if (!capable(CAP_SYS_ADMIN)) 1636 if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
1614 return -EPERM; 1637 return -EPERM;
1615 1638
1616 if (path->dentry != path->mnt->mnt_root) 1639 if (path->dentry != path->mnt->mnt_root)
@@ -1655,6 +1678,10 @@ static int do_loopback(struct path *path, const char *old_name,
1655 if (err) 1678 if (err)
1656 return err; 1679 return err;
1657 1680
1681 err = -EINVAL;
1682 if (mnt_ns_loop(&old_path))
1683 goto out;
1684
1658 err = lock_mount(path); 1685 err = lock_mount(path);
1659 if (err) 1686 if (err)
1660 goto out; 1687 goto out;
@@ -1770,7 +1797,7 @@ static int do_move_mount(struct path *path, const char *old_name)
1770 struct mount *p; 1797 struct mount *p;
1771 struct mount *old; 1798 struct mount *old;
1772 int err = 0; 1799 int err = 0;
1773 if (!capable(CAP_SYS_ADMIN)) 1800 if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
1774 return -EPERM; 1801 return -EPERM;
1775 if (!old_name || !*old_name) 1802 if (!old_name || !*old_name)
1776 return -EINVAL; 1803 return -EINVAL;
@@ -1857,21 +1884,6 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1857 return ERR_PTR(err); 1884 return ERR_PTR(err);
1858} 1885}
1859 1886
1860static struct vfsmount *
1861do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1862{
1863 struct file_system_type *type = get_fs_type(fstype);
1864 struct vfsmount *mnt;
1865 if (!type)
1866 return ERR_PTR(-ENODEV);
1867 mnt = vfs_kern_mount(type, flags, name, data);
1868 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1869 !mnt->mnt_sb->s_subtype)
1870 mnt = fs_set_subtype(mnt, fstype);
1871 put_filesystem(type);
1872 return mnt;
1873}
1874
1875/* 1887/*
1876 * add a mount into a namespace's mount tree 1888 * add a mount into a namespace's mount tree
1877 */ 1889 */
@@ -1917,20 +1929,46 @@ unlock:
1917 * create a new mount for userspace and request it to be added into the 1929 * create a new mount for userspace and request it to be added into the
1918 * namespace's tree 1930 * namespace's tree
1919 */ 1931 */
1920static int do_new_mount(struct path *path, const char *type, int flags, 1932static int do_new_mount(struct path *path, const char *fstype, int flags,
1921 int mnt_flags, const char *name, void *data) 1933 int mnt_flags, const char *name, void *data)
1922{ 1934{
1935 struct file_system_type *type;
1936 struct user_namespace *user_ns;
1923 struct vfsmount *mnt; 1937 struct vfsmount *mnt;
1924 int err; 1938 int err;
1925 1939
1926 if (!type) 1940 if (!fstype)
1927 return -EINVAL; 1941 return -EINVAL;
1928 1942
1929 /* we need capabilities... */ 1943 /* we need capabilities... */
1930 if (!capable(CAP_SYS_ADMIN)) 1944 user_ns = real_mount(path->mnt)->mnt_ns->user_ns;
1945 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
1931 return -EPERM; 1946 return -EPERM;
1932 1947
1933 mnt = do_kern_mount(type, flags, name, data); 1948 type = get_fs_type(fstype);
1949 if (!type)
1950 return -ENODEV;
1951
1952 if (user_ns != &init_user_ns) {
1953 if (!(type->fs_flags & FS_USERNS_MOUNT)) {
1954 put_filesystem(type);
1955 return -EPERM;
1956 }
1957 /* Only in special cases allow devices from mounts
1958 * created outside the initial user namespace.
1959 */
1960 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
1961 flags |= MS_NODEV;
1962 mnt_flags |= MNT_NODEV;
1963 }
1964 }
1965
1966 mnt = vfs_kern_mount(type, flags, name, data);
1967 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1968 !mnt->mnt_sb->s_subtype)
1969 mnt = fs_set_subtype(mnt, fstype);
1970
1971 put_filesystem(type);
1934 if (IS_ERR(mnt)) 1972 if (IS_ERR(mnt))
1935 return PTR_ERR(mnt); 1973 return PTR_ERR(mnt);
1936 1974
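
The reworked do_new_mount() gates mounting from a non-initial user namespace twice: the filesystem type must opt in with FS_USERNS_MOUNT at all, and unless it additionally sets FS_USERNS_DEV_MOUNT the kernel forces MS_NODEV/MNT_NODEV so device nodes on the new mount are inert. A condensed restatement of that gate (the flag bit values here are placeholders, not the kernel's):

#include <stdio.h>

/* Condensed model of the do_new_mount() user-namespace gate shown
 * above. Bit values are arbitrary placeholders. */
#define FS_USERNS_MOUNT		0x1
#define FS_USERNS_DEV_MOUNT	0x2
#define MS_NODEV		0x4

static int userns_mount_check(int fs_flags, int in_init_userns,
			      int *mnt_flags)
{
	if (in_init_userns)
		return 0;		/* unrestricted */
	if (!(fs_flags & FS_USERNS_MOUNT))
		return -1;		/* -EPERM: fs did not opt in */
	if (!(fs_flags & FS_USERNS_DEV_MOUNT))
		*mnt_flags |= MS_NODEV;	/* device nodes are inert */
	return 0;
}

int main(void)
{
	int flags = 0;

	if (userns_mount_check(FS_USERNS_MOUNT, 0, &flags) == 0)
		printf("mount allowed, nodev forced: %s\n",
		       (flags & MS_NODEV) ? "yes" : "no");
	return 0;
}
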
@@ -2261,18 +2299,42 @@ dput_out:
2261 return retval; 2299 return retval;
2262} 2300}
2263 2301
2264static struct mnt_namespace *alloc_mnt_ns(void) 2302static void free_mnt_ns(struct mnt_namespace *ns)
2303{
2304 proc_free_inum(ns->proc_inum);
2305 put_user_ns(ns->user_ns);
2306 kfree(ns);
2307}
2308
2309/*
2310 * Assign a sequence number so we can detect when we attempt to bind
2311 * mount a reference to an older mount namespace into the current
2312 * mount namespace, preventing reference counting loops. A 64bit
2313 * number incrementing at 10Ghz will take 12,427 years to wrap which
2314 * is effectively never, so we can ignore the possibility.
2315 */
2316static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
2317
2318static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
2265{ 2319{
2266 struct mnt_namespace *new_ns; 2320 struct mnt_namespace *new_ns;
2321 int ret;
2267 2322
2268 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 2323 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
2269 if (!new_ns) 2324 if (!new_ns)
2270 return ERR_PTR(-ENOMEM); 2325 return ERR_PTR(-ENOMEM);
2326 ret = proc_alloc_inum(&new_ns->proc_inum);
2327 if (ret) {
2328 kfree(new_ns);
2329 return ERR_PTR(ret);
2330 }
2331 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
2271 atomic_set(&new_ns->count, 1); 2332 atomic_set(&new_ns->count, 1);
2272 new_ns->root = NULL; 2333 new_ns->root = NULL;
2273 INIT_LIST_HEAD(&new_ns->list); 2334 INIT_LIST_HEAD(&new_ns->list);
2274 init_waitqueue_head(&new_ns->poll); 2335 init_waitqueue_head(&new_ns->poll);
2275 new_ns->event = 0; 2336 new_ns->event = 0;
2337 new_ns->user_ns = get_user_ns(user_ns);
2276 return new_ns; 2338 return new_ns;
2277} 2339}
2278 2340
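
A quick sanity check of the wrap-time claim in the new comment: a 64-bit counter holds 2^64 ≈ 1.8x10^19 values, so even at an implausible 10^10 allocations per second it wraps after roughly 58 years, and the quoted 12,427-year figure corresponds to roughly 4.7x10^7 allocations per second. Since every increment requires a full alloc_mnt_ns(), real-world rates are lower by many orders of magnitude, so the "effectively never" conclusion holds either way:

#include <stdio.h>

int main(void)
{
	/* Wrap times for the 64-bit mnt_ns_seq counter at a few
	 * deliberately extreme allocation rates. */
	const double counter = 18446744073709551616.0;	/* 2^64 */
	const double rates[] = { 1e10, 4.7e7, 1e6 };	/* allocs/sec */
	const double year = 365.25 * 24 * 3600;
	int i;

	for (i = 0; i < 3; i++)
		printf("%9.1e allocs/s -> %12.0f years\n",
		       rates[i], counter / rates[i] / year);
	return 0;
}
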
@@ -2281,24 +2343,28 @@ static struct mnt_namespace *alloc_mnt_ns(void)
2281 * copied from the namespace of the passed in task structure. 2343 * copied from the namespace of the passed in task structure.
2282 */ 2344 */
2283static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, 2345static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2284 struct fs_struct *fs) 2346 struct user_namespace *user_ns, struct fs_struct *fs)
2285{ 2347{
2286 struct mnt_namespace *new_ns; 2348 struct mnt_namespace *new_ns;
2287 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 2349 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
2288 struct mount *p, *q; 2350 struct mount *p, *q;
2289 struct mount *old = mnt_ns->root; 2351 struct mount *old = mnt_ns->root;
2290 struct mount *new; 2352 struct mount *new;
2353 int copy_flags;
2291 2354
2292 new_ns = alloc_mnt_ns(); 2355 new_ns = alloc_mnt_ns(user_ns);
2293 if (IS_ERR(new_ns)) 2356 if (IS_ERR(new_ns))
2294 return new_ns; 2357 return new_ns;
2295 2358
2296 down_write(&namespace_sem); 2359 down_write(&namespace_sem);
2297 /* First pass: copy the tree topology */ 2360 /* First pass: copy the tree topology */
2298 new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); 2361 copy_flags = CL_COPY_ALL | CL_EXPIRE;
2362 if (user_ns != mnt_ns->user_ns)
2363 copy_flags |= CL_SHARED_TO_SLAVE;
2364 new = copy_tree(old, old->mnt.mnt_root, copy_flags);
2299 if (IS_ERR(new)) { 2365 if (IS_ERR(new)) {
2300 up_write(&namespace_sem); 2366 up_write(&namespace_sem);
2301 kfree(new_ns); 2367 free_mnt_ns(new_ns);
2302 return ERR_CAST(new); 2368 return ERR_CAST(new);
2303 } 2369 }
2304 new_ns->root = new; 2370 new_ns->root = new;
@@ -2339,7 +2405,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2339} 2405}
2340 2406
2341struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2407struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2342 struct fs_struct *new_fs) 2408 struct user_namespace *user_ns, struct fs_struct *new_fs)
2343{ 2409{
2344 struct mnt_namespace *new_ns; 2410 struct mnt_namespace *new_ns;
2345 2411
@@ -2349,7 +2415,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2349 if (!(flags & CLONE_NEWNS)) 2415 if (!(flags & CLONE_NEWNS))
2350 return ns; 2416 return ns;
2351 2417
2352 new_ns = dup_mnt_ns(ns, new_fs); 2418 new_ns = dup_mnt_ns(ns, user_ns, new_fs);
2353 2419
2354 put_mnt_ns(ns); 2420 put_mnt_ns(ns);
2355 return new_ns; 2421 return new_ns;
@@ -2361,7 +2427,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2361 */ 2427 */
2362static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) 2428static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
2363{ 2429{
2364 struct mnt_namespace *new_ns = alloc_mnt_ns(); 2430 struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
2365 if (!IS_ERR(new_ns)) { 2431 if (!IS_ERR(new_ns)) {
2366 struct mount *mnt = real_mount(m); 2432 struct mount *mnt = real_mount(m);
2367 mnt->mnt_ns = new_ns; 2433 mnt->mnt_ns = new_ns;
@@ -2501,7 +2567,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2501 struct mount *new_mnt, *root_mnt; 2567 struct mount *new_mnt, *root_mnt;
2502 int error; 2568 int error;
2503 2569
2504 if (!capable(CAP_SYS_ADMIN)) 2570 if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
2505 return -EPERM; 2571 return -EPERM;
2506 2572
2507 error = user_path_dir(new_root, &new); 2573 error = user_path_dir(new_root, &new);
@@ -2583,8 +2649,13 @@ static void __init init_mount_tree(void)
2583 struct vfsmount *mnt; 2649 struct vfsmount *mnt;
2584 struct mnt_namespace *ns; 2650 struct mnt_namespace *ns;
2585 struct path root; 2651 struct path root;
2652 struct file_system_type *type;
2586 2653
2587 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2654 type = get_fs_type("rootfs");
2655 if (!type)
2656 panic("Can't find rootfs type");
2657 mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
2658 put_filesystem(type);
2588 if (IS_ERR(mnt)) 2659 if (IS_ERR(mnt))
2589 panic("Can't create rootfs"); 2660 panic("Can't create rootfs");
2590 2661
@@ -2647,7 +2718,7 @@ void put_mnt_ns(struct mnt_namespace *ns)
2647 br_write_unlock(&vfsmount_lock); 2718 br_write_unlock(&vfsmount_lock);
2648 up_write(&namespace_sem); 2719 up_write(&namespace_sem);
2649 release_mounts(&umount_list); 2720 release_mounts(&umount_list);
2650 kfree(ns); 2721 free_mnt_ns(ns);
2651} 2722}
2652 2723
2653struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) 2724struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
@@ -2681,3 +2752,72 @@ bool our_mnt(struct vfsmount *mnt)
2681{ 2752{
2682 return check_mnt(real_mount(mnt)); 2753 return check_mnt(real_mount(mnt));
2683} 2754}
2755
2756static void *mntns_get(struct task_struct *task)
2757{
2758 struct mnt_namespace *ns = NULL;
2759 struct nsproxy *nsproxy;
2760
2761 rcu_read_lock();
2762 nsproxy = task_nsproxy(task);
2763 if (nsproxy) {
2764 ns = nsproxy->mnt_ns;
2765 get_mnt_ns(ns);
2766 }
2767 rcu_read_unlock();
2768
2769 return ns;
2770}
2771
2772static void mntns_put(void *ns)
2773{
2774 put_mnt_ns(ns);
2775}
2776
2777static int mntns_install(struct nsproxy *nsproxy, void *ns)
2778{
2779 struct fs_struct *fs = current->fs;
2780 struct mnt_namespace *mnt_ns = ns;
2781 struct path root;
2782
2783 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
2784 !nsown_capable(CAP_SYS_CHROOT) ||
2785 !nsown_capable(CAP_SYS_ADMIN))
2786 return -EPERM;
2787
2788 if (fs->users != 1)
2789 return -EINVAL;
2790
2791 get_mnt_ns(mnt_ns);
2792 put_mnt_ns(nsproxy->mnt_ns);
2793 nsproxy->mnt_ns = mnt_ns;
2794
2795 /* Find the root */
2796 root.mnt = &mnt_ns->root->mnt;
2797 root.dentry = mnt_ns->root->mnt.mnt_root;
2798 path_get(&root);
2799 while(d_mountpoint(root.dentry) && follow_down_one(&root))
2800 ;
2801
2802 /* Update the pwd and root */
2803 set_fs_pwd(fs, &root);
2804 set_fs_root(fs, &root);
2805
2806 path_put(&root);
2807 return 0;
2808}
2809
2810static unsigned int mntns_inum(void *ns)
2811{
2812 struct mnt_namespace *mnt_ns = ns;
2813 return mnt_ns->proc_inum;
2814}
2815
2816const struct proc_ns_operations mntns_operations = {
2817 .name = "mnt",
2818 .type = CLONE_NEWNS,
2819 .get = mntns_get,
2820 .put = mntns_put,
2821 .install = mntns_install,
2822 .inum = mntns_inum,
2823};
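
The new mntns_operations hookup is what makes /proc/PID/ns/mnt usable with setns(2). A sketch of the intended userspace side, assuming a kernel with this series applied and a caller holding the capabilities that mntns_install() checks (error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/mnt", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0 || setns(fd, CLONE_NEWNS) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);
	/* Now in the target's mount namespace; mntns_install() has
	 * already reset this process's root and cwd. */
	execl("/bin/sh", "sh", (char *)NULL);
	return 1;
}
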
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index be20a7e171a0..63d14a99483d 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
89 /* 89 /*
90 * If I understand ncp_read_kernel() properly, the above always 90 * If I understand ncp_read_kernel() properly, the above always
91 * fetches from the network, here the analogue of disk. 91 * fetches from the network, here the analogue of disk.
92 * -- wli 92 * -- nyc
93 */ 93 */
94 count_vm_event(PGMAJFAULT); 94 count_vm_event(PGMAJFAULT);
95 mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); 95 mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT);
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index b7db60897f91..cce2c057bd2d 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -24,7 +24,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
24 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ 24 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
25 nfs4namespace.o nfs4getroot.o nfs4client.o 25 nfs4namespace.o nfs4getroot.o nfs4client.o
26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
27nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o 27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
28 28
29obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o 29obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
30nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 30nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index f1027b06a1a9..4fa788c93f46 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -40,6 +40,7 @@
40#include <linux/pagevec.h> 40#include <linux/pagevec.h>
41 41
42#include "../pnfs.h" 42#include "../pnfs.h"
43#include "../nfs4session.h"
43#include "../internal.h" 44#include "../internal.h"
44#include "blocklayout.h" 45#include "blocklayout.h"
45 46
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index dded26368111..862a2f16db64 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -118,7 +118,6 @@ int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)
118 struct dentry *dir; 118 struct dentry *dir;
119 119
120 dir = rpc_d_lookup_sb(sb, "cache"); 120 dir = rpc_d_lookup_sb(sb, "cache");
121 BUG_ON(dir == NULL);
122 ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd); 121 ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd);
123 dput(dir); 122 dput(dir);
124 return ret; 123 return ret;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 4251c2ae06ad..efd54f0a4c46 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -142,7 +142,7 @@ extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
142 142
143struct cb_recallslotargs { 143struct cb_recallslotargs {
144 struct sockaddr *crsa_addr; 144 struct sockaddr *crsa_addr;
145 uint32_t crsa_target_max_slots; 145 uint32_t crsa_target_highest_slotid;
146}; 146};
147extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, 147extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
148 void *dummy, 148 void *dummy,
@@ -167,8 +167,6 @@ extern __be32 nfs4_callback_layoutrecall(
167 struct cb_layoutrecallargs *args, 167 struct cb_layoutrecallargs *args,
168 void *dummy, struct cb_process_state *cps); 168 void *dummy, struct cb_process_state *cps);
169 169
170extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
171
172struct cb_devicenotifyitem { 170struct cb_devicenotifyitem {
173 uint32_t cbd_notify_type; 171 uint32_t cbd_notify_type;
174 uint32_t cbd_layout_type; 172 uint32_t cbd_layout_type;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 76b4a7a3e559..c89b26bc9759 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -14,6 +14,7 @@
14#include "delegation.h" 14#include "delegation.h"
15#include "internal.h" 15#include "internal.h"
16#include "pnfs.h" 16#include "pnfs.h"
17#include "nfs4session.h"
17 18
18#ifdef NFS_DEBUG 19#ifdef NFS_DEBUG
19#define NFSDBG_FACILITY NFSDBG_CALLBACK 20#define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -216,7 +217,6 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
216 } 217 }
217 pnfs_get_layout_hdr(lo); 218 pnfs_get_layout_hdr(lo);
218 spin_unlock(&ino->i_lock); 219 spin_unlock(&ino->i_lock);
219 BUG_ON(!list_empty(&lo->plh_bulk_recall));
220 list_add(&lo->plh_bulk_recall, &recall_list); 220 list_add(&lo->plh_bulk_recall, &recall_list);
221 } 221 }
222 } 222 }
@@ -562,23 +562,16 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
562 if (!cps->clp) /* set in cb_sequence */ 562 if (!cps->clp) /* set in cb_sequence */
563 goto out; 563 goto out;
564 564
565 dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", 565 dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %d\n",
566 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), 566 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
567 args->crsa_target_max_slots); 567 args->crsa_target_highest_slotid);
568 568
569 fc_tbl = &cps->clp->cl_session->fc_slot_table; 569 fc_tbl = &cps->clp->cl_session->fc_slot_table;
570 570
571 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
572 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
573 args->crsa_target_max_slots < 1)
574 goto out;
575
576 status = htonl(NFS4_OK); 571 status = htonl(NFS4_OK);
577 if (args->crsa_target_max_slots == fc_tbl->max_slots)
578 goto out;
579 572
580 fc_tbl->target_max_slots = args->crsa_target_max_slots; 573 nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
581 nfs41_handle_recall_slot(cps->clp); 574 nfs41_server_notify_target_slotid_update(cps->clp);
582out: 575out:
583 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 576 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
584 return status; 577 return status;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 742ff4ffced7..59461c957d9d 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -16,6 +16,7 @@
16#include "nfs4_fs.h" 16#include "nfs4_fs.h"
17#include "callback.h" 17#include "callback.h"
18#include "internal.h" 18#include "internal.h"
19#include "nfs4session.h"
19 20
20#define CB_OP_TAGLEN_MAXSZ (512) 21#define CB_OP_TAGLEN_MAXSZ (512)
21#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) 22#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
@@ -520,7 +521,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
520 p = read_buf(xdr, 4); 521 p = read_buf(xdr, 4);
521 if (unlikely(p == NULL)) 522 if (unlikely(p == NULL))
522 return htonl(NFS4ERR_BADXDR); 523 return htonl(NFS4ERR_BADXDR);
523 args->crsa_target_max_slots = ntohl(*p++); 524 args->crsa_target_highest_slotid = ntohl(*p++);
524 return 0; 525 return 0;
525} 526}
526 527
@@ -762,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
762 * A single slot, so highest used slotid is either 0 or -1 763 * A single slot, so highest used slotid is either 0 or -1
763 */ 764 */
764 tbl->highest_used_slotid = NFS4_NO_SLOT; 765 tbl->highest_used_slotid = NFS4_NO_SLOT;
765 nfs4_check_drain_bc_complete(session); 766 nfs4_session_drain_complete(session, tbl);
766 spin_unlock(&tbl->slot_tbl_lock); 767 spin_unlock(&tbl->slot_tbl_lock);
767} 768}
768 769
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8b39a42ac35e..9f3c66438d0e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -277,7 +277,7 @@ void nfs_put_client(struct nfs_client *clp)
277 nfs_cb_idr_remove_locked(clp); 277 nfs_cb_idr_remove_locked(clp);
278 spin_unlock(&nn->nfs_client_lock); 278 spin_unlock(&nn->nfs_client_lock);
279 279
280 BUG_ON(!list_empty(&clp->cl_superblocks)); 280 WARN_ON_ONCE(!list_empty(&clp->cl_superblocks));
281 281
282 clp->rpc_ops->free_client(clp); 282 clp->rpc_ops->free_client(clp);
283 } 283 }
@@ -615,8 +615,7 @@ EXPORT_SYMBOL_GPL(nfs_create_rpc_client);
615 */ 615 */
616static void nfs_destroy_server(struct nfs_server *server) 616static void nfs_destroy_server(struct nfs_server *server)
617{ 617{
618 if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) || 618 if (server->nlm_host)
619 !(server->flags & NFS_MOUNT_LOCAL_FCNTL))
620 nlmclnt_done(server->nlm_host); 619 nlmclnt_done(server->nlm_host);
621} 620}
622 621
@@ -1061,10 +1060,6 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
1061 if (error < 0) 1060 if (error < 0)
1062 goto error; 1061 goto error;
1063 1062
1064 BUG_ON(!server->nfs_client);
1065 BUG_ON(!server->nfs_client->rpc_ops);
1066 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1067
1068 /* Probe the root fh to retrieve its FSID */ 1063 /* Probe the root fh to retrieve its FSID */
1069 error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr); 1064 error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr);
1070 if (error < 0) 1065 if (error < 0)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ce8cb926526b..32e6c53520e2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -450,7 +450,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
450 nfs_refresh_inode(dentry->d_inode, entry->fattr); 450 nfs_refresh_inode(dentry->d_inode, entry->fattr);
451 goto out; 451 goto out;
452 } else { 452 } else {
453 d_drop(dentry); 453 if (d_invalidate(dentry) != 0)
454 goto out;
454 dput(dentry); 455 dput(dentry);
455 } 456 }
456 } 457 }
@@ -870,7 +871,7 @@ out:
870 return res; 871 return res;
871} 872}
872 873
873static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) 874static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
874{ 875{
875 struct dentry *dentry = filp->f_path.dentry; 876 struct dentry *dentry = filp->f_path.dentry;
876 struct inode *inode = dentry->d_inode; 877 struct inode *inode = dentry->d_inode;
@@ -879,10 +880,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
879 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", 880 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
880 dentry->d_parent->d_name.name, 881 dentry->d_parent->d_name.name,
881 dentry->d_name.name, 882 dentry->d_name.name,
882 offset, origin); 883 offset, whence);
883 884
884 mutex_lock(&inode->i_mutex); 885 mutex_lock(&inode->i_mutex);
885 switch (origin) { 886 switch (whence) {
886 case 1: 887 case 1:
887 offset += filp->f_pos; 888 offset += filp->f_pos;
888 case 0: 889 case 0:
@@ -978,10 +979,11 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
978 * particular file and the "nocto" mount flag is not set. 979 * particular file and the "nocto" mount flag is not set.
979 * 980 *
980 */ 981 */
981static inline 982static
982int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) 983int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
983{ 984{
984 struct nfs_server *server = NFS_SERVER(inode); 985 struct nfs_server *server = NFS_SERVER(inode);
986 int ret;
985 987
986 if (IS_AUTOMOUNT(inode)) 988 if (IS_AUTOMOUNT(inode))
987 return 0; 989 return 0;
@@ -992,9 +994,13 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
992 if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && 994 if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) &&
993 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 995 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
994 goto out_force; 996 goto out_force;
995 return 0; 997out:
998 return (inode->i_nlink == 0) ? -ENOENT : 0;
996out_force: 999out_force:
997 return __nfs_revalidate_inode(server, inode); 1000 ret = __nfs_revalidate_inode(server, inode);
1001 if (ret != 0)
1002 return ret;
1003 goto out;
998} 1004}
999 1005
1000/* 1006/*
@@ -1100,6 +1106,8 @@ out_set_verifier:
1100out_zap_parent: 1106out_zap_parent:
1101 nfs_zap_caches(dir); 1107 nfs_zap_caches(dir);
1102 out_bad: 1108 out_bad:
1109 nfs_free_fattr(fattr);
1110 nfs_free_fhandle(fhandle);
1103 nfs_mark_for_revalidate(dir); 1111 nfs_mark_for_revalidate(dir);
1104 if (inode && S_ISDIR(inode->i_mode)) { 1112 if (inode && S_ISDIR(inode->i_mode)) {
1105 /* Purge readdir caches. */ 1113 /* Purge readdir caches. */
@@ -1112,8 +1120,6 @@ out_zap_parent:
1112 shrink_dcache_parent(dentry); 1120 shrink_dcache_parent(dentry);
1113 } 1121 }
1114 d_drop(dentry); 1122 d_drop(dentry);
1115 nfs_free_fattr(fattr);
1116 nfs_free_fhandle(fhandle);
1117 dput(parent); 1123 dput(parent);
1118 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 1124 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
1119 __func__, dentry->d_parent->d_name.name, 1125 __func__, dentry->d_parent->d_name.name,
@@ -1155,11 +1161,14 @@ static int nfs_dentry_delete(const struct dentry *dentry)
1155 1161
1156} 1162}
1157 1163
1164/* Ensure that we revalidate inode->i_nlink */
1158static void nfs_drop_nlink(struct inode *inode) 1165static void nfs_drop_nlink(struct inode *inode)
1159{ 1166{
1160 spin_lock(&inode->i_lock); 1167 spin_lock(&inode->i_lock);
1161 if (inode->i_nlink > 0) 1168 /* drop the inode if we're reasonably sure this is the last link */
1162 drop_nlink(inode); 1169 if (inode->i_nlink == 1)
1170 clear_nlink(inode);
1171 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
1163 spin_unlock(&inode->i_lock); 1172 spin_unlock(&inode->i_lock);
1164} 1173}
1165 1174
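
The reworked nfs_drop_nlink() is deliberately conservative: it only zeroes the link count when the client can be sure this was the last link (i_nlink == 1), and in every case marks the cached attributes invalid so the next revalidation refetches nlink from the server. Together with the new -ENOENT return in nfs_lookup_verify_inode() above, this avoids both premature inode death and stale positive dentries. The policy in isolation (a sketch, not the kernel code):

/* clear_nlink() only on a provable last link, and always force a
 * fresh GETATTR later. Sketch only. */
struct model_inode {
	unsigned int nlink;
	int attrs_valid;
};

static void model_drop_nlink(struct model_inode *inode)
{
	if (inode->nlink == 1)
		inode->nlink = 0;	/* safe: no other links existed */
	inode->attrs_valid = 0;		/* NFS_INO_INVALID_ATTR */
}
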
@@ -1174,8 +1183,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
1174 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 1183 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
1175 1184
1176 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 1185 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
1177 drop_nlink(inode);
1178 nfs_complete_unlink(dentry, inode); 1186 nfs_complete_unlink(dentry, inode);
1187 nfs_drop_nlink(inode);
1179 } 1188 }
1180 iput(inode); 1189 iput(inode);
1181} 1190}
@@ -1646,10 +1655,8 @@ static int nfs_safe_remove(struct dentry *dentry)
1646 if (inode != NULL) { 1655 if (inode != NULL) {
1647 NFS_PROTO(inode)->return_delegation(inode); 1656 NFS_PROTO(inode)->return_delegation(inode);
1648 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1657 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1649 /* The VFS may want to delete this inode */
1650 if (error == 0) 1658 if (error == 0)
1651 nfs_drop_nlink(inode); 1659 nfs_drop_nlink(inode);
1652 nfs_mark_for_revalidate(inode);
1653 } else 1660 } else
1654 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1661 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1655 if (error == -ENOENT) 1662 if (error == -ENOENT)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index cae26cbd59ee..0bd7a55a5f07 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -266,21 +266,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
266 struct nfs_page *req = nfs_list_entry(hdr->pages.next); 266 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
267 struct page *page = req->wb_page; 267 struct page *page = req->wb_page;
268 268
269 if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { 269 if (!PageCompound(page) && bytes < hdr->good_bytes)
270 if (bytes > hdr->good_bytes) 270 set_page_dirty(page);
271 zero_user(page, 0, PAGE_SIZE);
272 else if (hdr->good_bytes - bytes < PAGE_SIZE)
273 zero_user_segment(page,
274 hdr->good_bytes & ~PAGE_MASK,
275 PAGE_SIZE);
276 }
277 if (!PageCompound(page)) {
278 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
279 if (bytes < hdr->good_bytes)
280 set_page_dirty(page);
281 } else
282 set_page_dirty(page);
283 }
284 bytes += req->wb_bytes; 271 bytes += req->wb_bytes;
285 nfs_list_remove_request(req); 272 nfs_list_remove_request(req);
286 nfs_direct_readpage_release(req); 273 nfs_direct_readpage_release(req);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 582bb8866131..3c2b893665ba 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -119,18 +119,18 @@ force_reval:
119 return __nfs_revalidate_inode(server, inode); 119 return __nfs_revalidate_inode(server, inode);
120} 120}
121 121
122loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) 122loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)
123{ 123{
124 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", 124 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
125 filp->f_path.dentry->d_parent->d_name.name, 125 filp->f_path.dentry->d_parent->d_name.name,
126 filp->f_path.dentry->d_name.name, 126 filp->f_path.dentry->d_name.name,
127 offset, origin); 127 offset, whence);
128 128
129 /* 129 /*
130 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate 130 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
131 * the cached file length 131 * the cached file length
132 */ 132 */
133 if (origin != SEEK_SET && origin != SEEK_CUR) { 133 if (whence != SEEK_SET && whence != SEEK_CUR) {
134 struct inode *inode = filp->f_mapping->host; 134 struct inode *inode = filp->f_mapping->host;
135 135
136 int retval = nfs_revalidate_file_size(inode, filp); 136 int retval = nfs_revalidate_file_size(inode, filp);
@@ -138,7 +138,7 @@ loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
138 return (loff_t)retval; 138 return (loff_t)retval;
139 } 139 }
140 140
141 return generic_file_llseek(filp, offset, origin); 141 return generic_file_llseek(filp, offset, whence);
142} 142}
143EXPORT_SYMBOL_GPL(nfs_file_llseek); 143EXPORT_SYMBOL_GPL(nfs_file_llseek);
144 144
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 9cc4a3fbf4b0..bc3968fa81e5 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -193,19 +193,15 @@ static int nfs_idmap_init_keyring(void)
193 if (!cred) 193 if (!cred)
194 return -ENOMEM; 194 return -ENOMEM;
195 195
196 keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, 196 keyring = keyring_alloc(".id_resolver", 0, 0, cred,
197 (KEY_POS_ALL & ~KEY_POS_SETATTR) | 197 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
198 KEY_USR_VIEW | KEY_USR_READ, 198 KEY_USR_VIEW | KEY_USR_READ,
199 KEY_ALLOC_NOT_IN_QUOTA); 199 KEY_ALLOC_NOT_IN_QUOTA, NULL);
200 if (IS_ERR(keyring)) { 200 if (IS_ERR(keyring)) {
201 ret = PTR_ERR(keyring); 201 ret = PTR_ERR(keyring);
202 goto failed_put_cred; 202 goto failed_put_cred;
203 } 203 }
204 204
205 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
206 if (ret < 0)
207 goto failed_put_key;
208
209 ret = register_key_type(&key_type_id_resolver); 205 ret = register_key_type(&key_type_id_resolver);
210 if (ret < 0) 206 if (ret < 0)
211 goto failed_put_key; 207 goto failed_put_key;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6fa01aea2488..2faae14d89f4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -107,13 +107,19 @@ u64 nfs_compat_user_ino64(u64 fileid)
107 return ino; 107 return ino;
108} 108}
109 109
110int nfs_drop_inode(struct inode *inode)
111{
112 return NFS_STALE(inode) || generic_drop_inode(inode);
113}
114EXPORT_SYMBOL_GPL(nfs_drop_inode);
115
110void nfs_clear_inode(struct inode *inode) 116void nfs_clear_inode(struct inode *inode)
111{ 117{
112 /* 118 /*
113 * The following should never happen... 119 * The following should never happen...
114 */ 120 */
115 BUG_ON(nfs_have_writebacks(inode)); 121 WARN_ON_ONCE(nfs_have_writebacks(inode));
116 BUG_ON(!list_empty(&NFS_I(inode)->open_files)); 122 WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));
117 nfs_zap_acl_cache(inode); 123 nfs_zap_acl_cache(inode);
118 nfs_access_zap_cache(inode); 124 nfs_access_zap_cache(inode);
119 nfs_fscache_release_inode_cookie(inode); 125 nfs_fscache_release_inode_cookie(inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 05521cadac2e..f0e6c7df1a07 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -18,27 +18,6 @@ struct nfs_string;
18 */ 18 */
19#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) 19#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
20 20
21/*
22 * Determine if sessions are in use.
23 */
24static inline int nfs4_has_session(const struct nfs_client *clp)
25{
26#ifdef CONFIG_NFS_V4_1
27 if (clp->cl_session)
28 return 1;
29#endif /* CONFIG_NFS_V4_1 */
30 return 0;
31}
32
33static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
34{
35#ifdef CONFIG_NFS_V4_1
36 if (nfs4_has_session(clp))
37 return (clp->cl_session->flags & SESSION4_PERSIST);
38#endif /* CONFIG_NFS_V4_1 */
39 return 0;
40}
41
42static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) 21static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
43{ 22{
44 if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) 23 if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
@@ -276,8 +255,6 @@ extern const u32 nfs41_maxwrite_overhead;
276extern struct rpc_procinfo nfs4_procedures[]; 255extern struct rpc_procinfo nfs4_procedures[];
277#endif 256#endif
278 257
279extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
280
281/* proc.c */ 258/* proc.c */
282void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 259void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
283extern struct nfs_client *nfs_init_client(struct nfs_client *clp, 260extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
@@ -319,6 +296,7 @@ extern struct workqueue_struct *nfsiod_workqueue;
319extern struct inode *nfs_alloc_inode(struct super_block *sb); 296extern struct inode *nfs_alloc_inode(struct super_block *sb);
320extern void nfs_destroy_inode(struct inode *); 297extern void nfs_destroy_inode(struct inode *);
321extern int nfs_write_inode(struct inode *, struct writeback_control *); 298extern int nfs_write_inode(struct inode *, struct writeback_control *);
299extern int nfs_drop_inode(struct inode *);
322extern void nfs_clear_inode(struct inode *); 300extern void nfs_clear_inode(struct inode *);
323extern void nfs_evict_inode(struct inode *); 301extern void nfs_evict_inode(struct inode *);
324void nfs_zap_acl_cache(struct inode *inode); 302void nfs_zap_acl_cache(struct inode *inode);
@@ -386,9 +364,6 @@ extern int nfs_initiate_read(struct rpc_clnt *clnt,
386extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 364extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
387extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, 365extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
388 struct nfs_pgio_header *hdr); 366 struct nfs_pgio_header *hdr);
389extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
390 struct inode *inode,
391 const struct nfs_pgio_completion_ops *compl_ops);
392extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); 367extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
393extern void nfs_readdata_release(struct nfs_read_data *rdata); 368extern void nfs_readdata_release(struct nfs_read_data *rdata);
394 369
@@ -411,9 +386,6 @@ extern struct nfs_write_header *nfs_writehdr_alloc(void);
411extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); 386extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
412extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, 387extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
413 struct nfs_pgio_header *hdr); 388 struct nfs_pgio_header *hdr);
414extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
415 struct inode *inode, int ioflags,
416 const struct nfs_pgio_completion_ops *compl_ops);
417extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); 389extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
418extern void nfs_writedata_release(struct nfs_write_data *wdata); 390extern void nfs_writedata_release(struct nfs_write_data *wdata);
419extern void nfs_commit_free(struct nfs_commit_data *p); 391extern void nfs_commit_free(struct nfs_commit_data *p);
@@ -474,18 +446,6 @@ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
474 const struct rpc_timeout *timeparms, 446 const struct rpc_timeout *timeparms,
475 const char *ip_addr, 447 const char *ip_addr,
476 rpc_authflavor_t authflavour); 448 rpc_authflavor_t authflavour);
477extern int _nfs4_call_sync(struct rpc_clnt *clnt,
478 struct nfs_server *server,
479 struct rpc_message *msg,
480 struct nfs4_sequence_args *args,
481 struct nfs4_sequence_res *res,
482 int cache_reply);
483extern int _nfs4_call_sync_session(struct rpc_clnt *clnt,
484 struct nfs_server *server,
485 struct rpc_message *msg,
486 struct nfs4_sequence_args *args,
487 struct nfs4_sequence_res *res,
488 int cache_reply);
489extern int nfs40_walk_client_list(struct nfs_client *clp, 449extern int nfs40_walk_client_list(struct nfs_client *clp,
490 struct nfs_client **result, 450 struct nfs_client **result,
491 struct rpc_cred *cred); 451 struct rpc_cred *cred);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 015f71f8f62c..91a6faf811ac 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -169,6 +169,9 @@ int nfs_mount(struct nfs_mount_request *info)
169 (info->hostname ? info->hostname : "server"), 169 (info->hostname ? info->hostname : "server"),
170 info->dirpath); 170 info->dirpath);
171 171
172 if (strlen(info->dirpath) > MNTPATHLEN)
173 return -ENAMETOOLONG;
174
172 if (info->noresvport) 175 if (info->noresvport)
173 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 176 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
174 177
@@ -242,6 +245,9 @@ void nfs_umount(const struct nfs_mount_request *info)
242 struct rpc_clnt *clnt; 245 struct rpc_clnt *clnt;
243 int status; 246 int status;
244 247
248 if (strlen(info->dirpath) > MNTPATHLEN)
249 return;
250
245 if (info->noresvport) 251 if (info->noresvport)
246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 252 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
247 253
@@ -283,7 +289,6 @@ static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
283 const u32 pathname_len = strlen(pathname); 289 const u32 pathname_len = strlen(pathname);
284 __be32 *p; 290 __be32 *p;
285 291
286 BUG_ON(pathname_len > MNTPATHLEN);
287 p = xdr_reserve_space(xdr, 4 + pathname_len); 292 p = xdr_reserve_space(xdr, 4 + pathname_len);
288 xdr_encode_opaque(p, pathname, pathname_len); 293 xdr_encode_opaque(p, pathname, pathname_len);
289} 294}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index d04f0df7be55..06b9df49f7f7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -195,7 +195,6 @@ static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
195{ 195{
196 __be32 *p; 196 __be32 *p;
197 197
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 p = xdr_reserve_space(xdr, NFS2_FHSIZE); 198 p = xdr_reserve_space(xdr, NFS2_FHSIZE);
200 memcpy(p, fh->data, NFS2_FHSIZE); 199 memcpy(p, fh->data, NFS2_FHSIZE);
201} 200}
@@ -388,7 +387,7 @@ static void encode_filename(struct xdr_stream *xdr,
388{ 387{
389 __be32 *p; 388 __be32 *p;
390 389
391 BUG_ON(length > NFS2_MAXNAMLEN); 390 WARN_ON_ONCE(length > NFS2_MAXNAMLEN);
392 p = xdr_reserve_space(xdr, 4 + length); 391 p = xdr_reserve_space(xdr, 4 + length);
393 xdr_encode_opaque(p, name, length); 392 xdr_encode_opaque(p, name, length);
394} 393}
@@ -428,7 +427,6 @@ static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
428{ 427{
429 __be32 *p; 428 __be32 *p;
430 429
431 BUG_ON(length > NFS2_MAXPATHLEN);
432 p = xdr_reserve_space(xdr, 4); 430 p = xdr_reserve_space(xdr, 4);
433 *p = cpu_to_be32(length); 431 *p = cpu_to_be32(length);
434 xdr_write_pages(xdr, pages, 0, length); 432 xdr_write_pages(xdr, pages, 0, length);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 69322096c325..70efb63b1e42 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -24,14 +24,14 @@
24 24
25#define NFSDBG_FACILITY NFSDBG_PROC 25#define NFSDBG_FACILITY NFSDBG_PROC
26 26
27/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */ 27/* A wrapper to handle the EJUKEBOX error messages */
28static int 28static int
29nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) 29nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
30{ 30{
31 int res; 31 int res;
32 do { 32 do {
33 res = rpc_call_sync(clnt, msg, flags); 33 res = rpc_call_sync(clnt, msg, flags);
34 if (res != -EJUKEBOX && res != -EKEYEXPIRED) 34 if (res != -EJUKEBOX)
35 break; 35 break;
36 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); 36 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
37 res = -ERESTARTSYS; 37 res = -ERESTARTSYS;
@@ -44,7 +44,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
44static int 44static int
45nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) 45nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
46{ 46{
47 if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED) 47 if (task->tk_status != -EJUKEBOX)
48 return 0; 48 return 0;
49 if (task->tk_status == -EJUKEBOX) 49 if (task->tk_status == -EJUKEBOX)
50 nfs_inc_stats(inode, NFSIOS_DELAY); 50 nfs_inc_stats(inode, NFSIOS_DELAY);
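
EJUKEBOX is NFSv3's "resource temporarily unavailable" error, typically returned while a file is being recalled from slow media; both wrappers retry after a freezable, killable sleep rather than failing the syscall. The patch also stops special-casing -EKEYEXPIRED here (and, below, in the filelayout driver), evidently leaving expired-key handling to a lower layer after this series. The retry shape in isolation (user-space sketch; the error value and delay are stand-ins):

#include <unistd.h>

#define MODEL_EJUKEBOX		10008	/* NFS3ERR_JUKEBOX wire value */
#define MODEL_RETRY_SECONDS	5	/* stand-in for NFS_JUKEBOX_RETRY_TIME */

/* Retry shape of nfs3_rpc_wrapper(): on "jukebox, try again later"
 * sleep and reissue the call; any other result is final. The kernel
 * sleeps freezable and killable, not with sleep(). */
static int call_with_jukebox_retry(int (*call)(void))
{
	int res;

	for (;;) {
		res = call();
		if (res != -MODEL_EJUKEBOX)
			return res;
		sleep(MODEL_RETRY_SECONDS);
	}
}
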
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cbe89400dfc..bffc32406fbf 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -198,7 +198,7 @@ static void encode_filename3(struct xdr_stream *xdr,
198{ 198{
199 __be32 *p; 199 __be32 *p;
200 200
201 BUG_ON(length > NFS3_MAXNAMLEN); 201 WARN_ON_ONCE(length > NFS3_MAXNAMLEN);
202 p = xdr_reserve_space(xdr, 4 + length); 202 p = xdr_reserve_space(xdr, 4 + length);
203 xdr_encode_opaque(p, name, length); 203 xdr_encode_opaque(p, name, length);
204} 204}
@@ -238,7 +238,6 @@ out_overflow:
238static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, 238static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
239 const u32 length) 239 const u32 length)
240{ 240{
241 BUG_ON(length > NFS3_MAXPATHLEN);
242 encode_uint32(xdr, length); 241 encode_uint32(xdr, length);
243 xdr_write_pages(xdr, pages, 0, length); 242 xdr_write_pages(xdr, pages, 0, length);
244} 243}
@@ -388,7 +387,6 @@ out_overflow:
388 */ 387 */
389static void encode_ftype3(struct xdr_stream *xdr, const u32 type) 388static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
390{ 389{
391 BUG_ON(type > NF3FIFO);
392 encode_uint32(xdr, type); 390 encode_uint32(xdr, type);
393} 391}
394 392
@@ -443,7 +441,7 @@ static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
443{ 441{
444 __be32 *p; 442 __be32 *p;
445 443
446 BUG_ON(fh->size > NFS3_FHSIZE); 444 WARN_ON_ONCE(fh->size > NFS3_FHSIZE);
447 p = xdr_reserve_space(xdr, 4 + fh->size); 445 p = xdr_reserve_space(xdr, 4 + fh->size);
448 xdr_encode_opaque(p, fh->data, fh->size); 446 xdr_encode_opaque(p, fh->data, fh->size);
449} 447}
@@ -1339,6 +1337,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
1339 error = nfsacl_encode(xdr->buf, base, args->inode, 1337 error = nfsacl_encode(xdr->buf, base, args->inode,
1340 (args->mask & NFS_ACL) ? 1338 (args->mask & NFS_ACL) ?
1341 args->acl_access : NULL, 1, 0); 1339 args->acl_access : NULL, 1, 0);
1340 /* FIXME: this is just broken */
1342 BUG_ON(error < 0); 1341 BUG_ON(error < 0);
1343 error = nfsacl_encode(xdr->buf, base + error, args->inode, 1342 error = nfsacl_encode(xdr->buf, base + error, args->inode,
1344 (args->mask & NFS_DFACL) ? 1343 (args->mask & NFS_DFACL) ?
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a525fdefccde..a3f488b074a2 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -11,6 +11,8 @@
11 11
12#if IS_ENABLED(CONFIG_NFS_V4) 12#if IS_ENABLED(CONFIG_NFS_V4)
13 13
14#define NFS4_MAX_LOOP_ON_RECOVER (10)
15
14struct idmap; 16struct idmap;
15 17
16enum nfs4_client_state { 18enum nfs4_client_state {
@@ -21,18 +23,12 @@ enum nfs4_client_state {
21 NFS4CLNT_RECLAIM_NOGRACE, 23 NFS4CLNT_RECLAIM_NOGRACE,
22 NFS4CLNT_DELEGRETURN, 24 NFS4CLNT_DELEGRETURN,
23 NFS4CLNT_SESSION_RESET, 25 NFS4CLNT_SESSION_RESET,
24 NFS4CLNT_RECALL_SLOT,
25 NFS4CLNT_LEASE_CONFIRM, 26 NFS4CLNT_LEASE_CONFIRM,
26 NFS4CLNT_SERVER_SCOPE_MISMATCH, 27 NFS4CLNT_SERVER_SCOPE_MISMATCH,
27 NFS4CLNT_PURGE_STATE, 28 NFS4CLNT_PURGE_STATE,
28 NFS4CLNT_BIND_CONN_TO_SESSION, 29 NFS4CLNT_BIND_CONN_TO_SESSION,
29}; 30};
30 31
31enum nfs4_session_state {
32 NFS4_SESSION_INITING,
33 NFS4_SESSION_DRAINING,
34};
35
36#define NFS4_RENEW_TIMEOUT 0x01 32#define NFS4_RENEW_TIMEOUT 0x01
37#define NFS4_RENEW_DELEGATION_CB 0x02 33#define NFS4_RENEW_DELEGATION_CB 0x02
38 34
@@ -43,8 +39,7 @@ struct nfs4_minor_version_ops {
43 struct nfs_server *server, 39 struct nfs_server *server,
44 struct rpc_message *msg, 40 struct rpc_message *msg,
45 struct nfs4_sequence_args *args, 41 struct nfs4_sequence_args *args,
46 struct nfs4_sequence_res *res, 42 struct nfs4_sequence_res *res);
47 int cache_reply);
48 bool (*match_stateid)(const nfs4_stateid *, 43 bool (*match_stateid)(const nfs4_stateid *,
49 const nfs4_stateid *); 44 const nfs4_stateid *);
50 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, 45 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
@@ -241,18 +236,14 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
241 return server->nfs_client->cl_session; 236 return server->nfs_client->cl_session;
242} 237}
243 238
244extern bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy);
245extern int nfs4_setup_sequence(const struct nfs_server *server, 239extern int nfs4_setup_sequence(const struct nfs_server *server,
246 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 240 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
247 struct rpc_task *task); 241 struct rpc_task *task);
248extern int nfs41_setup_sequence(struct nfs4_session *session, 242extern int nfs41_setup_sequence(struct nfs4_session *session,
249 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 243 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
250 struct rpc_task *task); 244 struct rpc_task *task);
251extern void nfs4_destroy_session(struct nfs4_session *session);
252extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
253extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); 245extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
254extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); 246extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
255extern int nfs4_init_session(struct nfs_server *server);
256extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 247extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
257 struct nfs_fsinfo *fsinfo); 248 struct nfs_fsinfo *fsinfo);
258extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, 249extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
@@ -280,11 +271,7 @@ static inline int nfs4_setup_sequence(const struct nfs_server *server,
280 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 271 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
281 struct rpc_task *task) 272 struct rpc_task *task)
282{ 273{
283 return 0; 274 rpc_call_start(task);
284}
285
286static inline int nfs4_init_session(struct nfs_server *server)
287{
288 return 0; 275 return 0;
289} 276}
290 277
@@ -321,17 +308,20 @@ extern void nfs4_renew_state(struct work_struct *);
321 308
322/* nfs4state.c */ 309/* nfs4state.c */
323struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); 310struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
311struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
324struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); 312struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
325int nfs4_discover_server_trunking(struct nfs_client *clp, 313int nfs4_discover_server_trunking(struct nfs_client *clp,
326 struct nfs_client **); 314 struct nfs_client **);
327int nfs40_discover_server_trunking(struct nfs_client *clp, 315int nfs40_discover_server_trunking(struct nfs_client *clp,
328 struct nfs_client **, struct rpc_cred *); 316 struct nfs_client **, struct rpc_cred *);
329#if defined(CONFIG_NFS_V4_1) 317#if defined(CONFIG_NFS_V4_1)
330struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
331struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 318struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
332int nfs41_discover_server_trunking(struct nfs_client *clp, 319int nfs41_discover_server_trunking(struct nfs_client *clp,
333 struct nfs_client **, struct rpc_cred *); 320 struct nfs_client **, struct rpc_cred *);
334extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); 321extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
322extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
323extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
324
335#else 325#else
336static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) 326static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
337{ 327{
@@ -349,11 +339,12 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
349extern void nfs_inode_find_state_and_recover(struct inode *inode, 339extern void nfs_inode_find_state_and_recover(struct inode *inode,
350 const nfs4_stateid *stateid); 340 const nfs4_stateid *stateid);
351extern void nfs4_schedule_lease_recovery(struct nfs_client *); 341extern void nfs4_schedule_lease_recovery(struct nfs_client *);
342extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
343extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
352extern void nfs4_schedule_state_manager(struct nfs_client *); 344extern void nfs4_schedule_state_manager(struct nfs_client *);
353extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); 345extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
354extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); 346extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
355extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 347extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
356extern void nfs41_handle_recall_slot(struct nfs_client *clp);
357extern void nfs41_handle_server_scope(struct nfs_client *, 348extern void nfs41_handle_server_scope(struct nfs_client *,
358 struct nfs41_server_scope **); 349 struct nfs41_server_scope **);
359extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 350extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 6bacfde1319a..acc347268124 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -12,6 +12,7 @@
12#include "internal.h" 12#include "internal.h"
13#include "callback.h" 13#include "callback.h"
14#include "delegation.h" 14#include "delegation.h"
15#include "nfs4session.h"
15#include "pnfs.h" 16#include "pnfs.h"
16#include "netns.h" 17#include "netns.h"
17 18
@@ -713,10 +714,6 @@ static int nfs4_server_common_setup(struct nfs_server *server,
713 struct nfs_fattr *fattr; 714 struct nfs_fattr *fattr;
714 int error; 715 int error;
715 716
716 BUG_ON(!server->nfs_client);
717 BUG_ON(!server->nfs_client->rpc_ops);
718 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
719
720 /* data servers support only a subset of NFSv4.1 */ 717 /* data servers support only a subset of NFSv4.1 */
721 if (is_ds_only_client(server->nfs_client)) 718 if (is_ds_only_client(server->nfs_client))
722 return -EPROTONOSUPPORT; 719 return -EPROTONOSUPPORT;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index afddd6639afb..e7699308364a 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -20,7 +20,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
20 struct iattr attr; 20 struct iattr attr;
21 int err; 21 int err;
22 22
23 BUG_ON(inode != dentry->d_inode);
24 /* 23 /*
25 * If no cached dentry exists or if it's negative, NFSv4 handled the 24 * If no cached dentry exists or if it's negative, NFSv4 handled the
26 * opens in ->lookup() or ->create(). 25 * opens in ->lookup() or ->create().
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e45fd9c02a3..194c48410336 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/sunrpc/metrics.h> 36#include <linux/sunrpc/metrics.h>
37 37
38#include "nfs4session.h"
38#include "internal.h" 39#include "internal.h"
39#include "delegation.h" 40#include "delegation.h"
40#include "nfs4filelayout.h" 41#include "nfs4filelayout.h"
@@ -178,7 +179,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
178 break; 179 break;
179 case -NFS4ERR_DELAY: 180 case -NFS4ERR_DELAY:
180 case -NFS4ERR_GRACE: 181 case -NFS4ERR_GRACE:
181 case -EKEYEXPIRED:
182 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); 182 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
183 break; 183 break;
184 case -NFS4ERR_RETRY_UNCACHED_REP: 184 case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -306,12 +306,10 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
     }
     rdata->read_done_cb = filelayout_read_done_cb;
 
-    if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
-            &rdata->args.seq_args, &rdata->res.seq_res,
-            task))
-        return;
-
-    rpc_call_start(task);
+    nfs41_setup_sequence(rdata->ds_clp->cl_session,
+            &rdata->args.seq_args,
+            &rdata->res.seq_res,
+            task);
 }
 
 static void filelayout_read_call_done(struct rpc_task *task, void *data)
@@ -408,12 +406,10 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
         rpc_exit(task, 0);
         return;
     }
-    if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
-            &wdata->args.seq_args, &wdata->res.seq_res,
-            task))
-        return;
-
-    rpc_call_start(task);
+    nfs41_setup_sequence(wdata->ds_clp->cl_session,
+            &wdata->args.seq_args,
+            &wdata->res.seq_res,
+            task);
 }
 
 static void filelayout_write_call_done(struct rpc_task *task, void *data)
@@ -449,12 +445,10 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data)
 {
     struct nfs_commit_data *wdata = data;
 
-    if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
-            &wdata->args.seq_args, &wdata->res.seq_res,
-            task))
-        return;
-
-    rpc_call_start(task);
+    nfs41_setup_sequence(wdata->ds_clp->cl_session,
+            &wdata->args.seq_args,
+            &wdata->res.seq_res,
+            task);
 }
 
 static void filelayout_write_commit_done(struct rpc_task *task, void *data)
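All three pNFS prepare callbacks collapse to a bare nfs41_setup_sequence() call. That only works because, as the nfs4proc.c hunk for nfs41_setup_sequence() below shows, the helper now invokes rpc_call_start() itself once a slot is assigned and puts the task to sleep otherwise. The old and new calling conventions, sketched:

    /* old: every ->rpc_call_prepare had to kick the state machine */
    if (nfs41_setup_sequence(session, args, res, task))
        return;         /* task was queued on the slot-table waitq */
    rpc_call_start(task);

    /* new: the helper owns that decision */
    nfs41_setup_sequence(session, args, res, task);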
@@ -512,7 +506,6 @@ filelayout_read_pagelist(struct nfs_read_data *data)
     loff_t offset = data->args.offset;
     u32 j, idx;
     struct nfs_fh *fh;
-    int status;
 
     dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
         __func__, hdr->inode->i_ino,
@@ -538,9 +531,8 @@ filelayout_read_pagelist(struct nfs_read_data *data)
     data->mds_offset = offset;
 
     /* Perform an asynchronous read to ds */
-    status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data,
-            &filelayout_read_call_ops, RPC_TASK_SOFTCONN);
-    BUG_ON(status != 0);
+    nfs_initiate_read(ds->ds_clp->cl_rpcclient, data,
+            &filelayout_read_call_ops, RPC_TASK_SOFTCONN);
     return PNFS_ATTEMPTED;
 }
 
@@ -554,7 +546,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
     loff_t offset = data->args.offset;
     u32 j, idx;
     struct nfs_fh *fh;
-    int status;
 
     /* Retrieve the correct rpc_client for the byte range */
     j = nfs4_fl_calc_j_index(lseg, offset);
@@ -579,10 +570,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
     data->args.offset = filelayout_get_dserver_offset(lseg, offset);
 
     /* Perform an asynchronous write */
-    status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data,
-            &filelayout_write_call_ops, sync,
-            RPC_TASK_SOFTCONN);
-    BUG_ON(status != 0);
+    nfs_initiate_write(ds->ds_clp->cl_rpcclient, data,
+            &filelayout_write_call_ops, sync,
+            RPC_TASK_SOFTCONN);
     return PNFS_ATTEMPTED;
 }
 
@@ -909,7 +899,7 @@ static void
 filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
             struct nfs_page *req)
 {
-    BUG_ON(pgio->pg_lseg != NULL);
+    WARN_ON_ONCE(pgio->pg_lseg != NULL);
 
     if (req->wb_offset != req->wb_pgbase) {
         /*
@@ -939,7 +929,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
     struct nfs_commit_info cinfo;
     int status;
 
-    BUG_ON(pgio->pg_lseg != NULL);
+    WARN_ON_ONCE(pgio->pg_lseg != NULL);
 
     if (req->wb_offset != req->wb_pgbase)
         goto out_mds;
@@ -1187,7 +1177,6 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,
      */
     for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
         if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
-            BUG_ON(!list_empty(&b->written));
             pnfs_put_lseg(b->wlseg);
             b->wlseg = NULL;
         }
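Two related relaxations in the pagelist paths: the pgio->pg_lseg assertions become WARN_ON_ONCE(), which taints the log once instead of halting the machine, and the return values of nfs_initiate_read()/nfs_initiate_write() are no longer captured. Dropping BUG_ON(status != 0) suggests the initiate helpers now report failures through the rpc_task completion path rather than their return code; that reading is inferred from this hunk alone, since the helpers' definitions are outside this diff.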
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index a8eaa9b7bb0f..b720064bcd7f 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -33,6 +33,7 @@
 #include <linux/module.h>
 
 #include "internal.h"
+#include "nfs4session.h"
 #include "nfs4filelayout.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
@@ -162,8 +163,6 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
     dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
         mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
 
-    BUG_ON(list_empty(&ds->ds_addrs));
-
     list_for_each_entry(da, &ds->ds_addrs, da_node) {
         dprintk("%s: DS %s: trying address %s\n",
             __func__, ds->ds_remotestr, da->da_remotestr);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5eec4429970c..493f0f41c554 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -52,7 +52,6 @@
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/nfs_idmap.h>
-#include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
 #include <linux/freezer.h>
@@ -64,14 +63,14 @@
 #include "callback.h"
 #include "pnfs.h"
 #include "netns.h"
+#include "nfs4session.h"
+
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
 #define NFS4_POLL_RETRY_MIN	(HZ/10)
 #define NFS4_POLL_RETRY_MAX	(15*HZ)
 
-#define NFS4_MAX_LOOP_ON_RECOVER (10)
-
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
@@ -206,7 +205,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 {
     __be32 *start, *p;
 
-    BUG_ON(readdir->count < 80);
     if (cookie > 2) {
         readdir->cookie = cookie;
         memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier));
@@ -256,22 +254,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
     kunmap_atomic(start);
 }
 
-static int nfs4_wait_clnt_recover(struct nfs_client *clp)
-{
-    int res;
-
-    might_sleep();
-
-    res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
-            nfs_wait_bit_killable, TASK_KILLABLE);
-    if (res)
-        return res;
-
-    if (clp->cl_cons_state < 0)
-        return clp->cl_cons_state;
-    return 0;
-}
-
 static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 {
     int res = 0;
@@ -351,7 +333,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
     }
     case -NFS4ERR_GRACE:
     case -NFS4ERR_DELAY:
-    case -EKEYEXPIRED:
         ret = nfs4_delay(server->client, &exception->timeout);
         if (ret != 0)
             break;
@@ -397,144 +378,136 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
 
 #if defined(CONFIG_NFS_V4_1)
 
-/*
- * nfs4_free_slot - free a slot and efficiently update slot table.
- *
- * freeing a slot is trivially done by clearing its respective bit
- * in the bitmap.
- * If the freed slotid equals highest_used_slotid we want to update it
- * so that the server would be able to size down the slot table if needed,
- * otherwise we know that the highest_used_slotid is still in use.
- * When updating highest_used_slotid there may be "holes" in the bitmap
- * so we need to scan down from highest_used_slotid to 0 looking for the now
- * highest slotid in use.
- * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
- *
- * Must be called while holding tbl->slot_tbl_lock
- */
-static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid)
-{
-    BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE);
-    /* clear used bit in bitmap */
-    __clear_bit(slotid, tbl->used_slots);
-
-    /* update highest_used_slotid when it is freed */
-    if (slotid == tbl->highest_used_slotid) {
-        slotid = find_last_bit(tbl->used_slots, tbl->max_slots);
-        if (slotid < tbl->max_slots)
-            tbl->highest_used_slotid = slotid;
-        else
-            tbl->highest_used_slotid = NFS4_NO_SLOT;
-    }
-    dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
-        slotid, tbl->highest_used_slotid);
-}
-
-bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy)
-{
-    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-    return true;
-}
-
-/*
- * Signal state manager thread if session fore channel is drained
- */
-static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
-{
-    if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
-        rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq,
-                nfs4_set_task_privileged, NULL);
-        return;
-    }
-
-    if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
-        return;
-
-    dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
-    complete(&ses->fc_slot_table.complete);
-}
-
-/*
- * Signal state manager thread if session back channel is drained
- */
-void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
-{
-    if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
-        ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
-        return;
-    dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
-    complete(&ses->bc_slot_table.complete);
-}
-
 static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
 {
+    struct nfs4_session *session;
     struct nfs4_slot_table *tbl;
+    bool send_new_highest_used_slotid = false;
 
-    tbl = &res->sr_session->fc_slot_table;
     if (!res->sr_slot) {
         /* just wake up the next guy waiting since
          * we may have not consumed a slot after all */
         dprintk("%s: No slot\n", __func__);
         return;
     }
+    tbl = res->sr_slot->table;
+    session = tbl->session;
 
     spin_lock(&tbl->slot_tbl_lock);
-    nfs4_free_slot(tbl, res->sr_slot - tbl->slots);
-    nfs4_check_drain_fc_complete(res->sr_session);
+    /* Be nice to the server: try to ensure that the last transmitted
+     * value for highest_user_slotid <= target_highest_slotid
+     */
+    if (tbl->highest_used_slotid > tbl->target_highest_slotid)
+        send_new_highest_used_slotid = true;
+
+    if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) {
+        send_new_highest_used_slotid = false;
+        goto out_unlock;
+    }
+    nfs4_free_slot(tbl, res->sr_slot);
+
+    if (tbl->highest_used_slotid != NFS4_NO_SLOT)
+        send_new_highest_used_slotid = false;
+out_unlock:
     spin_unlock(&tbl->slot_tbl_lock);
     res->sr_slot = NULL;
+    if (send_new_highest_used_slotid)
+        nfs41_server_notify_highest_slotid_update(session->clp);
 }
 
 static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 {
-    unsigned long timestamp;
+    struct nfs4_session *session;
+    struct nfs4_slot *slot;
     struct nfs_client *clp;
-
-    /*
-     * sr_status remains 1 if an RPC level error occurred. The server
-     * may or may not have processed the sequence operation..
-     * Proceed as if the server received and processed the sequence
-     * operation.
-     */
-    if (res->sr_status == 1)
-        res->sr_status = NFS_OK;
+    bool interrupted = false;
+    int ret = 1;
 
     /* don't increment the sequence number if the task wasn't sent */
     if (!RPC_WAS_SENT(task))
         goto out;
 
+    slot = res->sr_slot;
+    session = slot->table->session;
+
+    if (slot->interrupted) {
+        slot->interrupted = 0;
+        interrupted = true;
+    }
+
     /* Check the SEQUENCE operation status */
     switch (res->sr_status) {
     case 0:
         /* Update the slot's sequence and clientid lease timer */
-        ++res->sr_slot->seq_nr;
-        timestamp = res->sr_renewal_time;
-        clp = res->sr_session->clp;
-        do_renew_lease(clp, timestamp);
+        ++slot->seq_nr;
+        clp = session->clp;
+        do_renew_lease(clp, res->sr_timestamp);
         /* Check sequence flags */
         if (res->sr_status_flags != 0)
             nfs4_schedule_lease_recovery(clp);
+        nfs41_update_target_slotid(slot->table, slot, res);
         break;
+    case 1:
+        /*
+         * sr_status remains 1 if an RPC level error occurred.
+         * The server may or may not have processed the sequence
+         * operation..
+         * Mark the slot as having hosted an interrupted RPC call.
+         */
+        slot->interrupted = 1;
+        goto out;
     case -NFS4ERR_DELAY:
         /* The server detected a resend of the RPC call and
          * returned NFS4ERR_DELAY as per Section 2.10.6.2
          * of RFC5661.
          */
-        dprintk("%s: slot=%td seq=%d: Operation in progress\n",
+        dprintk("%s: slot=%u seq=%u: Operation in progress\n",
             __func__,
-            res->sr_slot - res->sr_session->fc_slot_table.slots,
-            res->sr_slot->seq_nr);
+            slot->slot_nr,
+            slot->seq_nr);
         goto out_retry;
+    case -NFS4ERR_BADSLOT:
+        /*
+         * The slot id we used was probably retired. Try again
+         * using a different slot id.
+         */
+        goto retry_nowait;
+    case -NFS4ERR_SEQ_MISORDERED:
+        /*
+         * Was the last operation on this sequence interrupted?
+         * If so, retry after bumping the sequence number.
+         */
+        if (interrupted) {
+            ++slot->seq_nr;
+            goto retry_nowait;
+        }
+        /*
+         * Could this slot have been previously retired?
+         * If so, then the server may be expecting seq_nr = 1!
+         */
+        if (slot->seq_nr != 1) {
+            slot->seq_nr = 1;
+            goto retry_nowait;
+        }
+        break;
+    case -NFS4ERR_SEQ_FALSE_RETRY:
+        ++slot->seq_nr;
+        goto retry_nowait;
     default:
         /* Just update the slot sequence no. */
-        ++res->sr_slot->seq_nr;
+        ++slot->seq_nr;
     }
 out:
     /* The session may be reset by one of the error handlers. */
     dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
     nfs41_sequence_free_slot(res);
-    return 1;
+    return ret;
+retry_nowait:
+    if (rpc_restart_call_prepare(task)) {
+        task->tk_status = 0;
+        ret = 0;
+    }
+    goto out;
 out_retry:
     if (!rpc_restart_call(task))
         goto out;
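The reworked nfs41_sequence_done() gives each slot a memory: when an RPC fails at the transport level (sr_status still 1), the server's view of the slot is unknown, so the slot is flagged and the next SEQUENCE on it can disambiguate an -NFS4ERR_SEQ_MISORDERED reply. The decision ladder, condensed into a sketch (simplified from the hunk above, not a drop-in replacement):

    /* per-slot retry ladder, simplified */
    switch (res->sr_status) {
    case 1:                        /* RPC-level error, outcome unknown */
        slot->interrupted = 1;     /* remember for the slot's next user */
        break;
    case -NFS4ERR_BADSLOT:         /* slot was probably retired */
        goto retry_nowait;         /* reallocate a slot and resend */
    case -NFS4ERR_SEQ_MISORDERED:
        if (interrupted) {         /* finish the aborted seq_nr bump */
            ++slot->seq_nr;
            goto retry_nowait;
        }
        if (slot->seq_nr != 1) {   /* recycled slot may expect seq 1 */
            slot->seq_nr = 1;
            goto retry_nowait;
        }
        break;
    }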
@@ -545,55 +518,27 @@ out_retry:
 static int nfs4_sequence_done(struct rpc_task *task,
             struct nfs4_sequence_res *res)
 {
-    if (res->sr_session == NULL)
+    if (res->sr_slot == NULL)
         return 1;
     return nfs41_sequence_done(task, res);
 }
 
-/*
- * nfs4_find_slot - efficiently look for a free slot
- *
- * nfs4_find_slot looks for an unset bit in the used_slots bitmap.
- * If found, we mark the slot as used, update the highest_used_slotid,
- * and respectively set up the sequence operation args.
- * The slot number is returned if found, or NFS4_NO_SLOT otherwise.
- *
- * Note: must be called with under the slot_tbl_lock.
- */
-static u32
-nfs4_find_slot(struct nfs4_slot_table *tbl)
-{
-    u32 slotid;
-    u32 ret_id = NFS4_NO_SLOT;
-
-    dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
-        __func__, tbl->used_slots[0], tbl->highest_used_slotid,
-        tbl->max_slots);
-    slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
-    if (slotid >= tbl->max_slots)
-        goto out;
-    __set_bit(slotid, tbl->used_slots);
-    if (slotid > tbl->highest_used_slotid ||
-            tbl->highest_used_slotid == NFS4_NO_SLOT)
-        tbl->highest_used_slotid = slotid;
-    ret_id = slotid;
-out:
-    dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n",
-        __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id);
-    return ret_id;
-}
-
 static void nfs41_init_sequence(struct nfs4_sequence_args *args,
         struct nfs4_sequence_res *res, int cache_reply)
 {
-    args->sa_session = NULL;
+    args->sa_slot = NULL;
     args->sa_cache_this = 0;
+    args->sa_privileged = 0;
     if (cache_reply)
         args->sa_cache_this = 1;
-    res->sr_session = NULL;
     res->sr_slot = NULL;
 }
 
+static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
+{
+    args->sa_privileged = 1;
+}
+
 int nfs41_setup_sequence(struct nfs4_session *session,
             struct nfs4_sequence_args *args,
             struct nfs4_sequence_res *res,
@@ -601,59 +546,59 @@ int nfs41_setup_sequence(struct nfs4_session *session,
 {
     struct nfs4_slot *slot;
     struct nfs4_slot_table *tbl;
-    u32 slotid;
 
     dprintk("--> %s\n", __func__);
     /* slot already allocated? */
     if (res->sr_slot != NULL)
-        return 0;
+        goto out_success;
 
     tbl = &session->fc_slot_table;
 
+    task->tk_timeout = 0;
+
     spin_lock(&tbl->slot_tbl_lock);
     if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
-        !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
+        !args->sa_privileged) {
         /* The state manager will wait until the slot table is empty */
-        rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
-        spin_unlock(&tbl->slot_tbl_lock);
         dprintk("%s session is draining\n", __func__);
-        return -EAGAIN;
+        goto out_sleep;
     }
 
-    if (!rpc_queue_empty(&tbl->slot_tbl_waitq) &&
-        !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
-        rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
-        spin_unlock(&tbl->slot_tbl_lock);
-        dprintk("%s enforce FIFO order\n", __func__);
-        return -EAGAIN;
-    }
-
-    slotid = nfs4_find_slot(tbl);
-    if (slotid == NFS4_NO_SLOT) {
-        rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
-        spin_unlock(&tbl->slot_tbl_lock);
+    slot = nfs4_alloc_slot(tbl);
+    if (IS_ERR(slot)) {
+        /* If out of memory, try again in 1/4 second */
+        if (slot == ERR_PTR(-ENOMEM))
+            task->tk_timeout = HZ >> 2;
         dprintk("<-- %s: no free slots\n", __func__);
-        return -EAGAIN;
+        goto out_sleep;
     }
     spin_unlock(&tbl->slot_tbl_lock);
 
-    rpc_task_set_priority(task, RPC_PRIORITY_NORMAL);
-    slot = tbl->slots + slotid;
-    args->sa_session = session;
-    args->sa_slotid = slotid;
+    args->sa_slot = slot;
 
-    dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
+    dprintk("<-- %s slotid=%d seqid=%d\n", __func__,
+            slot->slot_nr, slot->seq_nr);
 
-    res->sr_session = session;
     res->sr_slot = slot;
-    res->sr_renewal_time = jiffies;
+    res->sr_timestamp = jiffies;
     res->sr_status_flags = 0;
     /*
     * sr_status is only set in decode_sequence, and so will remain
     * set to 1 if an rpc level failure occurs.
     */
     res->sr_status = 1;
+out_success:
+    rpc_call_start(task);
     return 0;
+out_sleep:
+    /* Privileged tasks are queued with top priority */
+    if (args->sa_privileged)
+        rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
+                NULL, RPC_PRIORITY_PRIVILEGED);
+    else
+        rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+    spin_unlock(&tbl->slot_tbl_lock);
+    return -EAGAIN;
 }
 EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
 
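Slot allocation is now delegated to nfs4_alloc_slot(), declared elsewhere (presumably in the new nfs4session.h), and it can fail with ERR_PTR(-ENOMEM) since the table grows dynamically. The retry delay is worth decoding:

    /* a quarter of a second in scheduler ticks, HZ-independent:
     * HZ >> 2 == HZ / 4 ticks == 0.25 s whatever HZ is configured */
    task->tk_timeout = HZ >> 2;

so allocation pressure yields a delayed retry rather than a hard failure. The old code also enforced FIFO fairness by hand with rpc_queue_empty(); the new out_sleep path gets the equivalent effect from the priority waitqueue, where ordinary tasks queue normally and privileged ones (sa_privileged, used by state recovery) are queued at RPC_PRIORITY_PRIVILEGED so they can still run while the session drains.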
@@ -665,12 +610,14 @@ int nfs4_setup_sequence(const struct nfs_server *server,
     struct nfs4_session *session = nfs4_get_session(server);
     int ret = 0;
 
-    if (session == NULL)
+    if (session == NULL) {
+        rpc_call_start(task);
         goto out;
+    }
 
-    dprintk("--> %s clp %p session %p sr_slot %td\n",
+    dprintk("--> %s clp %p session %p sr_slot %d\n",
         __func__, session->clp, session, res->sr_slot ?
-        res->sr_slot - session->fc_slot_table.slots : -1);
+        res->sr_slot->slot_nr : -1);
 
     ret = nfs41_setup_sequence(session, args, res, task);
 out:
@@ -687,19 +634,11 @@ struct nfs41_call_sync_data {
 static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
 {
     struct nfs41_call_sync_data *data = calldata;
+    struct nfs4_session *session = nfs4_get_session(data->seq_server);
 
     dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
 
-    if (nfs4_setup_sequence(data->seq_server, data->seq_args,
-            data->seq_res, task))
-        return;
-    rpc_call_start(task);
-}
-
-static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata)
-{
-    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-    nfs41_call_sync_prepare(task, calldata);
+    nfs41_setup_sequence(session, data->seq_args, data->seq_res, task);
 }
 
 static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
@@ -714,17 +653,11 @@ static const struct rpc_call_ops nfs41_call_sync_ops = {
     .rpc_call_done = nfs41_call_sync_done,
 };
 
-static const struct rpc_call_ops nfs41_call_priv_sync_ops = {
-    .rpc_call_prepare = nfs41_call_priv_sync_prepare,
-    .rpc_call_done = nfs41_call_sync_done,
-};
-
 static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
                 struct nfs_server *server,
                 struct rpc_message *msg,
                 struct nfs4_sequence_args *args,
-                struct nfs4_sequence_res *res,
-                int privileged)
+                struct nfs4_sequence_res *res)
 {
     int ret;
     struct rpc_task *task;
@@ -740,8 +673,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
         .callback_data = &data
     };
 
-    if (privileged)
-        task_setup.callback_ops = &nfs41_call_priv_sync_ops;
     task = rpc_run_task(&task_setup);
     if (IS_ERR(task))
         ret = PTR_ERR(task);
@@ -752,24 +683,18 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
     return ret;
 }
 
-int _nfs4_call_sync_session(struct rpc_clnt *clnt,
-            struct nfs_server *server,
-            struct rpc_message *msg,
-            struct nfs4_sequence_args *args,
-            struct nfs4_sequence_res *res,
-            int cache_reply)
-{
-    nfs41_init_sequence(args, res, cache_reply);
-    return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0);
-}
-
 #else
-static inline
+static
 void nfs41_init_sequence(struct nfs4_sequence_args *args,
         struct nfs4_sequence_res *res, int cache_reply)
 {
 }
 
+static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
+{
+}
+
+
 static int nfs4_sequence_done(struct rpc_task *task,
             struct nfs4_sequence_res *res)
 {
@@ -777,18 +702,17 @@ static int nfs4_sequence_done(struct rpc_task *task,
 }
 #endif /* CONFIG_NFS_V4_1 */
 
+static
 int _nfs4_call_sync(struct rpc_clnt *clnt,
         struct nfs_server *server,
         struct rpc_message *msg,
         struct nfs4_sequence_args *args,
-        struct nfs4_sequence_res *res,
-        int cache_reply)
+        struct nfs4_sequence_res *res)
 {
-    nfs41_init_sequence(args, res, cache_reply);
     return rpc_call_sync(clnt, msg, 0);
 }
 
-static inline
+static
 int nfs4_call_sync(struct rpc_clnt *clnt,
         struct nfs_server *server,
         struct rpc_message *msg,
@@ -796,8 +720,9 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
         struct nfs4_sequence_res *res,
         int cache_reply)
 {
+    nfs41_init_sequence(args, res, cache_reply);
     return server->nfs_client->cl_mvops->call_sync(clnt, server, msg,
-            args, res, cache_reply);
+            args, res);
 }
 
 static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
@@ -1445,13 +1370,6 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
             nfs_inode_find_state_and_recover(state->inode,
                     stateid);
             nfs4_schedule_stateid_recovery(server, state);
-        case -EKEYEXPIRED:
-            /*
-             * User RPCSEC_GSS context has expired.
-             * We cannot recover this stateid now, so
-             * skip it and allow recovery thread to
-             * proceed.
-             */
         case -ENOMEM:
             err = 0;
             goto out;
@@ -1574,20 +1492,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
                 &data->o_res.seq_res,
                 task) != 0)
         nfs_release_seqid(data->o_arg.seqid);
-    else
-        rpc_call_start(task);
     return;
 unlock_no_action:
     rcu_read_unlock();
 out_no_action:
     task->tk_action = NULL;
-
-}
-
-static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata)
-{
-    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-    nfs4_open_prepare(task, calldata);
+    nfs4_sequence_done(task, &data->o_res.seq_res);
 }
 
 static void nfs4_open_done(struct rpc_task *task, void *calldata)
@@ -1648,12 +1558,6 @@ static const struct rpc_call_ops nfs4_open_ops = {
     .rpc_release = nfs4_open_release,
 };
 
-static const struct rpc_call_ops nfs4_recover_open_ops = {
-    .rpc_call_prepare = nfs4_recover_open_prepare,
-    .rpc_call_done = nfs4_open_done,
-    .rpc_release = nfs4_open_release,
-};
-
 static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
 {
     struct inode *dir = data->dir->d_inode;
@@ -1683,7 +1587,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
     data->rpc_status = 0;
     data->cancelled = 0;
     if (isrecover)
-        task_setup_data.callback_ops = &nfs4_recover_open_ops;
+        nfs4_set_sequence_privileged(&o_arg->seq_args);
     task = rpc_run_task(&task_setup_data);
     if (IS_ERR(task))
         return PTR_ERR(task);
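This is the payoff of the sa_privileged flag: the open, lock, and SEQUENCE paths previously needed parallel "recover" rpc_call_ops vectors whose only job was to raise the task priority before delegating to the normal prepare function. Marking the sequence arguments instead removes the duplicate callback tables and trampoline functions. The two idioms side by side, taken from the hunks above:

    /* old: a duplicate ops vector per recoverable operation */
    static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata)
    {
        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
        nfs4_open_prepare(task, calldata);
    }

    /* new: one flag on the SEQUENCE arguments, honoured centrally
     * by nfs41_setup_sequence() */
    if (isrecover)
        nfs4_set_sequence_privileged(&o_arg->seq_args);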
@@ -1789,24 +1693,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
     return 0;
 }
 
-static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
-{
-    unsigned int loop;
-    int ret;
-
-    for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
-        ret = nfs4_wait_clnt_recover(clp);
-        if (ret != 0)
-            break;
-        if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
-            !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
-            break;
-        nfs4_schedule_state_manager(clp);
-        ret = -EIO;
-    }
-    return ret;
-}
-
 static int nfs4_recover_expired_lease(struct nfs_server *server)
 {
     return nfs4_client_recover_expired_lease(server->nfs_client);
@@ -2282,6 +2168,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
     if (!call_close) {
         /* Note: exit _without_ calling nfs4_close_done */
         task->tk_action = NULL;
+        nfs4_sequence_done(task, &calldata->res.seq_res);
         goto out;
     }
 
@@ -2299,8 +2186,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
             &calldata->res.seq_res,
             task) != 0)
         nfs_release_seqid(calldata->arg.seqid);
-    else
-        rpc_call_start(task);
 out:
     dprintk("%s: done!\n", __func__);
 }
@@ -2533,7 +2418,8 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
     rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS];
 
     len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array));
-    BUG_ON(len < 0);
+    if (len < 0)
+        return len;
 
     for (i = 0; i < len; i++) {
         /* AUTH_UNIX is the default flavor if none was specified,
@@ -3038,12 +2924,10 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 
 static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
 {
-    if (nfs4_setup_sequence(NFS_SERVER(data->dir),
+    nfs4_setup_sequence(NFS_SERVER(data->dir),
             &data->args.seq_args,
             &data->res.seq_res,
-            task))
-        return;
-    rpc_call_start(task);
+            task);
 }
 
 static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
@@ -3071,12 +2955,10 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
 
 static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
 {
-    if (nfs4_setup_sequence(NFS_SERVER(data->old_dir),
+    nfs4_setup_sequence(NFS_SERVER(data->old_dir),
             &data->args.seq_args,
             &data->res.seq_res,
-            task))
-        return;
-    rpc_call_start(task);
+            task);
 }
 
 static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
@@ -3362,9 +3244,6 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
     int mode = sattr->ia_mode;
     int status = -ENOMEM;
 
-    BUG_ON(!(sattr->ia_valid & ATTR_MODE));
-    BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
-
     data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
     if (data == NULL)
         goto out;
@@ -3380,10 +3259,13 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
         data->arg.ftype = NF4CHR;
         data->arg.u.device.specdata1 = MAJOR(rdev);
         data->arg.u.device.specdata2 = MINOR(rdev);
+    } else if (!S_ISSOCK(mode)) {
+        status = -EINVAL;
+        goto out_free;
     }
 
     status = nfs4_do_create(dir, dentry, data);
-
+out_free:
     nfs4_free_createdata(data);
 out:
     return status;
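The mknod change trades a panic for -EINVAL, and the ordering matters: the file-type check now happens after nfs4_alloc_createdata(), so the error path has to release the createdata, hence the new out_free label. Note also why S_ISSOCK needs no branch of its own: the default ftype passed to the allocator above is NF4SOCK, so only FIFO and the two device types override it, and anything else is rejected.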
@@ -3565,12 +3447,10 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
 
 static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
 {
-    if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
+    nfs4_setup_sequence(NFS_SERVER(data->header->inode),
             &data->args.seq_args,
             &data->res.seq_res,
-            task))
-        return;
-    rpc_call_start(task);
+            task);
 }
 
 static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
@@ -3631,22 +3511,18 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 
 static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
 {
-    if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
+    nfs4_setup_sequence(NFS_SERVER(data->header->inode),
             &data->args.seq_args,
             &data->res.seq_res,
-            task))
-        return;
-    rpc_call_start(task);
+            task);
 }
 
 static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
 {
-    if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+    nfs4_setup_sequence(NFS_SERVER(data->inode),
             &data->args.seq_args,
             &data->res.seq_res,
-            task))
-        return;
-    rpc_call_start(task);
+            task);
 }
 
 static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data)
@@ -3937,8 +3813,13 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
         goto out_free;
     }
     nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len);
-    if (buf)
+    if (buf) {
+        if (res.acl_len > buflen) {
+            ret = -ERANGE;
+            goto out_free;
+        }
         _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len);
+    }
 out_ok:
     ret = res.acl_len;
 out_free:
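__nfs4_get_acl_uncached() now follows the getxattr(2)-style contract: a NULL buf still returns the ACL length so callers can size a buffer, while a non-NULL but short buffer fails with -ERANGE before any copy, instead of copying first and letting the caller discover truncation. The calling pattern this enables, sketched from a hypothetical caller (not taken from this diff):

    /* two-pass read: probe for size, then fetch */
    ssize_t len = __nfs4_get_acl_uncached(inode, NULL, 0);
    if (len > 0) {
        void *acl = kmalloc(len, GFP_KERNEL);
        if (acl) {
            len = __nfs4_get_acl_uncached(inode, acl, len);
            /* -ERANGE here would mean the ACL grew in between */
        }
    }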
@@ -4085,7 +3966,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
         case -NFS4ERR_DELAY:
             nfs_inc_server_stats(server, NFSIOS_DELAY);
         case -NFS4ERR_GRACE:
-        case -EKEYEXPIRED:
             rpc_delay(task, NFS4_POLL_RETRY_MAX);
             task->tk_status = 0;
             return -EAGAIN;
@@ -4293,11 +4173,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
     d_data = (struct nfs4_delegreturndata *)data;
 
-    if (nfs4_setup_sequence(d_data->res.server,
-            &d_data->args.seq_args,
-            &d_data->res.seq_res, task))
-        return;
-    rpc_call_start(task);
+    nfs4_setup_sequence(d_data->res.server,
+            &d_data->args.seq_args,
+            &d_data->res.seq_res,
+            task);
 }
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -4543,6 +4422,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
     if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
         /* Note: exit _without_ running nfs4_locku_done */
         task->tk_action = NULL;
+        nfs4_sequence_done(task, &calldata->res.seq_res);
         return;
     }
     calldata->timestamp = jiffies;
@@ -4551,8 +4431,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
             &calldata->res.seq_res,
             task) != 0)
         nfs_release_seqid(calldata->arg.seqid);
-    else
-        rpc_call_start(task);
 }
 
 static const struct rpc_call_ops nfs4_locku_ops = {
@@ -4696,8 +4574,9 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
         return;
     /* Do we need to do an open_to_lock_owner? */
     if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
-        if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0)
+        if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
             goto out_release_lock_seqid;
+        }
         data->arg.open_stateid = &state->stateid;
         data->arg.new_lock_owner = 1;
         data->res.open_seqid = data->arg.open_seqid;
@@ -4707,20 +4586,12 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
     if (nfs4_setup_sequence(data->server,
                 &data->arg.seq_args,
                 &data->res.seq_res,
-                task) == 0) {
-        rpc_call_start(task);
+                task) == 0)
         return;
-    }
     nfs_release_seqid(data->arg.open_seqid);
 out_release_lock_seqid:
     nfs_release_seqid(data->arg.lock_seqid);
-    dprintk("%s: done!, ret = %d\n", __func__, task->tk_status);
-}
-
-static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata)
-{
-    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-    nfs4_lock_prepare(task, calldata);
+    dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
 }
 
 static void nfs4_lock_done(struct rpc_task *task, void *calldata)
@@ -4775,12 +4646,6 @@ static const struct rpc_call_ops nfs4_lock_ops = {
     .rpc_release = nfs4_lock_release,
 };
 
-static const struct rpc_call_ops nfs4_recover_lock_ops = {
-    .rpc_call_prepare = nfs4_recover_lock_prepare,
-    .rpc_call_done = nfs4_lock_done,
-    .rpc_release = nfs4_lock_release,
-};
-
 static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
 {
     switch (error) {
@@ -4823,15 +4688,15 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
         return -ENOMEM;
     if (IS_SETLKW(cmd))
         data->arg.block = 1;
-    if (recovery_type > NFS_LOCK_NEW) {
-        if (recovery_type == NFS_LOCK_RECLAIM)
-            data->arg.reclaim = NFS_LOCK_RECLAIM;
-        task_setup_data.callback_ops = &nfs4_recover_lock_ops;
-    }
     nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
     msg.rpc_argp = &data->arg;
     msg.rpc_resp = &data->res;
     task_setup_data.callback_data = data;
+    if (recovery_type > NFS_LOCK_NEW) {
+        if (recovery_type == NFS_LOCK_RECLAIM)
+            data->arg.reclaim = NFS_LOCK_RECLAIM;
+        nfs4_set_sequence_privileged(&data->arg.seq_args);
+    }
     task = rpc_run_task(&task_setup_data);
     if (IS_ERR(task))
         return PTR_ERR(task);
@@ -5100,15 +4965,6 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
         nfs4_schedule_stateid_recovery(server, state);
         err = 0;
         goto out;
-    case -EKEYEXPIRED:
-        /*
-         * User RPCSEC_GSS context has expired.
-         * We cannot recover this stateid now, so
-         * skip it and allow recovery thread to
-         * proceed.
-         */
-        err = 0;
-        goto out;
     case -ENOMEM:
     case -NFS4ERR_DENIED:
         /* kill_proc(fl->fl_pid, SIGLOST, 1); */
@@ -5357,7 +5213,6 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
     };
 
     dprintk("--> %s\n", __func__);
-    BUG_ON(clp == NULL);
 
     res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
     if (unlikely(res.session == NULL)) {
@@ -5569,20 +5424,16 @@ struct nfs4_get_lease_time_data {
 static void nfs4_get_lease_time_prepare(struct rpc_task *task,
                     void *calldata)
 {
-    int ret;
     struct nfs4_get_lease_time_data *data =
             (struct nfs4_get_lease_time_data *)calldata;
 
     dprintk("--> %s\n", __func__);
-    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
     /* just setup sequence, do not trigger session recovery
        since we're invoked within one */
-    ret = nfs41_setup_sequence(data->clp->cl_session,
-            &data->args->la_seq_args,
-            &data->res->lr_seq_res, task);
-
-    BUG_ON(ret == -EAGAIN);
-    rpc_call_start(task);
+    nfs41_setup_sequence(data->clp->cl_session,
+            &data->args->la_seq_args,
+            &data->res->lr_seq_res,
+            task);
     dprintk("<-- %s\n", __func__);
 }
 
@@ -5644,6 +5495,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
     int status;
 
     nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
+    nfs4_set_sequence_privileged(&args.la_seq_args);
     dprintk("--> %s\n", __func__);
     task = rpc_run_task(&task_setup);
 
@@ -5658,145 +5510,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
     return status;
 }
 
-static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags)
-{
-    return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags);
-}
-
-static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl,
-        struct nfs4_slot *new,
-        u32 max_slots,
-        u32 ivalue)
-{
-    struct nfs4_slot *old = NULL;
-    u32 i;
-
-    spin_lock(&tbl->slot_tbl_lock);
-    if (new) {
-        old = tbl->slots;
-        tbl->slots = new;
-        tbl->max_slots = max_slots;
-    }
-    tbl->highest_used_slotid = NFS4_NO_SLOT;
-    for (i = 0; i < tbl->max_slots; i++)
-        tbl->slots[i].seq_nr = ivalue;
-    spin_unlock(&tbl->slot_tbl_lock);
-    kfree(old);
-}
-
-/*
- * (re)Initialise a slot table
- */
-static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
-        u32 ivalue)
-{
-    struct nfs4_slot *new = NULL;
-    int ret = -ENOMEM;
-
-    dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
-        max_reqs, tbl->max_slots);
-
-    /* Does the newly negotiated max_reqs match the existing slot table? */
-    if (max_reqs != tbl->max_slots) {
-        new = nfs4_alloc_slots(max_reqs, GFP_NOFS);
-        if (!new)
-            goto out;
-    }
-    ret = 0;
-
-    nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue);
-    dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
-        tbl, tbl->slots, tbl->max_slots);
-out:
-    dprintk("<-- %s: return %d\n", __func__, ret);
-    return ret;
-}
-
-/* Destroy the slot table */
-static void nfs4_destroy_slot_tables(struct nfs4_session *session)
-{
-    if (session->fc_slot_table.slots != NULL) {
-        kfree(session->fc_slot_table.slots);
-        session->fc_slot_table.slots = NULL;
-    }
-    if (session->bc_slot_table.slots != NULL) {
-        kfree(session->bc_slot_table.slots);
-        session->bc_slot_table.slots = NULL;
-    }
-    return;
-}
-
-/*
- * Initialize or reset the forechannel and backchannel tables
- */
-static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
-{
-    struct nfs4_slot_table *tbl;
-    int status;
-
-    dprintk("--> %s\n", __func__);
-    /* Fore channel */
-    tbl = &ses->fc_slot_table;
-    status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
-    if (status) /* -ENOMEM */
-        return status;
-    /* Back channel */
-    tbl = &ses->bc_slot_table;
-    status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
-    if (status && tbl->slots == NULL)
-        /* Fore and back channel share a connection so get
-         * both slot tables or neither */
-        nfs4_destroy_slot_tables(ses);
-    return status;
-}
-
-struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
-{
-    struct nfs4_session *session;
-    struct nfs4_slot_table *tbl;
-
-    session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
-    if (!session)
-        return NULL;
-
-    tbl = &session->fc_slot_table;
-    tbl->highest_used_slotid = NFS4_NO_SLOT;
-    spin_lock_init(&tbl->slot_tbl_lock);
-    rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
-    init_completion(&tbl->complete);
-
-    tbl = &session->bc_slot_table;
-    tbl->highest_used_slotid = NFS4_NO_SLOT;
-    spin_lock_init(&tbl->slot_tbl_lock);
-    rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
-    init_completion(&tbl->complete);
-
-    session->session_state = 1<<NFS4_SESSION_INITING;
-
-    session->clp = clp;
-    return session;
-}
-
-void nfs4_destroy_session(struct nfs4_session *session)
-{
-    struct rpc_xprt *xprt;
-    struct rpc_cred *cred;
-
-    cred = nfs4_get_exchange_id_cred(session->clp);
-    nfs4_proc_destroy_session(session, cred);
-    if (cred)
-        put_rpccred(cred);
-
-    rcu_read_lock();
-    xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
-    rcu_read_unlock();
-    dprintk("%s Destroy backchannel for xprt %p\n",
-        __func__, xprt);
-    xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
-    nfs4_destroy_slot_tables(session);
-    kfree(session);
-}
-
 /*
  * Initialize the values to be used by the client in CREATE_SESSION
  * If nfs4_init_session set the fore channel request and response sizes,
@@ -5809,8 +5522,8 @@ void nfs4_destroy_session(struct nfs4_session *session)
 static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 {
     struct nfs4_session *session = args->client->cl_session;
-    unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz,
-             mxresp_sz = session->fc_attrs.max_resp_sz;
+    unsigned int mxrqst_sz = session->fc_target_max_rqst_sz,
+             mxresp_sz = session->fc_target_max_resp_sz;
 
     if (mxrqst_sz == 0)
         mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
@@ -5919,10 +5632,9 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 
     status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 
-    if (!status)
+    if (!status) {
         /* Verify the session's negotiated channel_attrs values */
         status = nfs4_verify_channel_attrs(&args, session);
-    if (!status) {
         /* Increment the clientid slot sequence id */
         clp->cl_seqid++;
     }
@@ -5992,83 +5704,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
 }
 
 /*
- * With sessions, the client is not marked ready until after a
- * successful EXCHANGE_ID and CREATE_SESSION.
- *
- * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate
- * other versions of NFS can be tried.
- */
-static int nfs41_check_session_ready(struct nfs_client *clp)
-{
-    int ret;
-
-    if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
-        ret = nfs4_client_recover_expired_lease(clp);
-        if (ret)
-            return ret;
-    }
-    if (clp->cl_cons_state < NFS_CS_READY)
-        return -EPROTONOSUPPORT;
-    smp_rmb();
-    return 0;
-}
-
-int nfs4_init_session(struct nfs_server *server)
-{
-    struct nfs_client *clp = server->nfs_client;
-    struct nfs4_session *session;
-    unsigned int rsize, wsize;
-
-    if (!nfs4_has_session(clp))
-        return 0;
-
-    session = clp->cl_session;
-    spin_lock(&clp->cl_lock);
-    if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
-
-        rsize = server->rsize;
-        if (rsize == 0)
-            rsize = NFS_MAX_FILE_IO_SIZE;
-        wsize = server->wsize;
-        if (wsize == 0)
-            wsize = NFS_MAX_FILE_IO_SIZE;
-
-        session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
-        session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
-    }
-    spin_unlock(&clp->cl_lock);
-
-    return nfs41_check_session_ready(clp);
-}
-
-int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
-{
-    struct nfs4_session *session = clp->cl_session;
-    int ret;
-
-    spin_lock(&clp->cl_lock);
-    if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
-        /*
-         * Do not set NFS_CS_CHECK_LEASE_TIME instead set the
-         * DS lease to be equal to the MDS lease.
-         */
-        clp->cl_lease_time = lease_time;
-        clp->cl_last_renewal = jiffies;
-    }
-    spin_unlock(&clp->cl_lock);
-
-    ret = nfs41_check_session_ready(clp);
-    if (ret)
-        return ret;
-    /* Test for the DS role */
-    if (!is_ds_client(clp))
-        return -ENODEV;
-    return 0;
-}
-EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
-
-
-/*
  * Renew the cl_session lease.
  */
 struct nfs4_sequence_data {
@@ -6133,9 +5768,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
6133 args = task->tk_msg.rpc_argp; 5768 args = task->tk_msg.rpc_argp;
6134 res = task->tk_msg.rpc_resp; 5769 res = task->tk_msg.rpc_resp;
6135 5770
6136 if (nfs41_setup_sequence(clp->cl_session, args, res, task)) 5771 nfs41_setup_sequence(clp->cl_session, args, res, task);
6137 return;
6138 rpc_call_start(task);
6139} 5772}
6140 5773
6141static const struct rpc_call_ops nfs41_sequence_ops = { 5774static const struct rpc_call_ops nfs41_sequence_ops = {
@@ -6144,7 +5777,9 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
6144 .rpc_release = nfs41_sequence_release, 5777 .rpc_release = nfs41_sequence_release,
6145}; 5778};
6146 5779
6147static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) 5780static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
5781 struct rpc_cred *cred,
5782 bool is_privileged)
6148{ 5783{
6149 struct nfs4_sequence_data *calldata; 5784 struct nfs4_sequence_data *calldata;
6150 struct rpc_message msg = { 5785 struct rpc_message msg = {
@@ -6166,6 +5801,8 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
6166 return ERR_PTR(-ENOMEM); 5801 return ERR_PTR(-ENOMEM);
6167 } 5802 }
6168 nfs41_init_sequence(&calldata->args, &calldata->res, 0); 5803 nfs41_init_sequence(&calldata->args, &calldata->res, 0);
5804 if (is_privileged)
5805 nfs4_set_sequence_privileged(&calldata->args);
6169 msg.rpc_argp = &calldata->args; 5806 msg.rpc_argp = &calldata->args;
6170 msg.rpc_resp = &calldata->res; 5807 msg.rpc_resp = &calldata->res;
6171 calldata->clp = clp; 5808 calldata->clp = clp;
@@ -6181,7 +5818,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
6181 5818
6182 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) 5819 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
6183 return 0; 5820 return 0;
6184 task = _nfs41_proc_sequence(clp, cred); 5821 task = _nfs41_proc_sequence(clp, cred, false);
6185 if (IS_ERR(task)) 5822 if (IS_ERR(task))
6186 ret = PTR_ERR(task); 5823 ret = PTR_ERR(task);
6187 else 5824 else
@@ -6195,7 +5832,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
6195 struct rpc_task *task; 5832 struct rpc_task *task;
6196 int ret; 5833 int ret;
6197 5834
6198 task = _nfs41_proc_sequence(clp, cred); 5835 task = _nfs41_proc_sequence(clp, cred, true);
6199 if (IS_ERR(task)) { 5836 if (IS_ERR(task)) {
6200 ret = PTR_ERR(task); 5837 ret = PTR_ERR(task);
6201 goto out; 5838 goto out;
@@ -6224,13 +5861,10 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
6224{ 5861{
6225 struct nfs4_reclaim_complete_data *calldata = data; 5862 struct nfs4_reclaim_complete_data *calldata = data;
6226 5863
6227 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); 5864 nfs41_setup_sequence(calldata->clp->cl_session,
6228 if (nfs41_setup_sequence(calldata->clp->cl_session, 5865 &calldata->arg.seq_args,
6229 &calldata->arg.seq_args, 5866 &calldata->res.seq_res,
6230 &calldata->res.seq_res, task)) 5867 task);
6231 return;
6232
6233 rpc_call_start(task);
6234} 5868}
6235 5869
6236static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) 5870static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
@@ -6307,6 +5941,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
6307 calldata->arg.one_fs = 0; 5941 calldata->arg.one_fs = 0;
6308 5942
6309 nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); 5943 nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);
5944 nfs4_set_sequence_privileged(&calldata->arg.seq_args);
6310 msg.rpc_argp = &calldata->arg; 5945 msg.rpc_argp = &calldata->arg;
6311 msg.rpc_resp = &calldata->res; 5946 msg.rpc_resp = &calldata->res;
6312 task_setup_data.callback_data = calldata; 5947 task_setup_data.callback_data = calldata;
@@ -6330,6 +5965,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
6330{ 5965{
6331 struct nfs4_layoutget *lgp = calldata; 5966 struct nfs4_layoutget *lgp = calldata;
6332 struct nfs_server *server = NFS_SERVER(lgp->args.inode); 5967 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5968 struct nfs4_session *session = nfs4_get_session(server);
6333 5969
6334 dprintk("--> %s\n", __func__); 5970 dprintk("--> %s\n", __func__);
6335 /* Note there is a race here, where a CB_LAYOUTRECALL can come in 5971
@@ -6337,16 +5973,14 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
6337 * However, that is not so catastrophic, and there seems 5973 * However, that is not so catastrophic, and there seems
6338 * to be no way to prevent it completely. 5974 * to be no way to prevent it completely.
6339 */ 5975 */
6340 if (nfs4_setup_sequence(server, &lgp->args.seq_args, 5976 if (nfs41_setup_sequence(session, &lgp->args.seq_args,
6341 &lgp->res.seq_res, task)) 5977 &lgp->res.seq_res, task))
6342 return; 5978 return;
6343 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, 5979 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
6344 NFS_I(lgp->args.inode)->layout, 5980 NFS_I(lgp->args.inode)->layout,
6345 lgp->args.ctx->state)) { 5981 lgp->args.ctx->state)) {
6346 rpc_exit(task, NFS4_OK); 5982 rpc_exit(task, NFS4_OK);
6347 return;
6348 } 5983 }
6349 rpc_call_start(task);
6350} 5984}
6351 5985
6352static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) 5986static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
@@ -6359,7 +5993,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
6359 5993
6360 dprintk("--> %s\n", __func__); 5994 dprintk("--> %s\n", __func__);
6361 5995
6362 if (!nfs4_sequence_done(task, &lgp->res.seq_res)) 5996 if (!nfs41_sequence_done(task, &lgp->res.seq_res))
6363 goto out; 5997 goto out;
6364 5998
6365 switch (task->tk_status) { 5999 switch (task->tk_status) {
@@ -6510,10 +6144,10 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
6510 struct nfs4_layoutreturn *lrp = calldata; 6144 struct nfs4_layoutreturn *lrp = calldata;
6511 6145
6512 dprintk("--> %s\n", __func__); 6146 dprintk("--> %s\n", __func__);
6513 if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, 6147 nfs41_setup_sequence(lrp->clp->cl_session,
6514 &lrp->res.seq_res, task)) 6148 &lrp->args.seq_args,
6515 return; 6149 &lrp->res.seq_res,
6516 rpc_call_start(task); 6150 task);
6517} 6151}
6518 6152
6519static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) 6153static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
@@ -6523,7 +6157,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
6523 6157
6524 dprintk("--> %s\n", __func__); 6158 dprintk("--> %s\n", __func__);
6525 6159
6526 if (!nfs4_sequence_done(task, &lrp->res.seq_res)) 6160 if (!nfs41_sequence_done(task, &lrp->res.seq_res))
6527 return; 6161 return;
6528 6162
6529 server = NFS_SERVER(lrp->args.inode); 6163 server = NFS_SERVER(lrp->args.inode);
@@ -6672,11 +6306,12 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
6672{ 6306{
6673 struct nfs4_layoutcommit_data *data = calldata; 6307 struct nfs4_layoutcommit_data *data = calldata;
6674 struct nfs_server *server = NFS_SERVER(data->args.inode); 6308 struct nfs_server *server = NFS_SERVER(data->args.inode);
6309 struct nfs4_session *session = nfs4_get_session(server);
6675 6310
6676 if (nfs4_setup_sequence(server, &data->args.seq_args, 6311 nfs41_setup_sequence(session,
6677 &data->res.seq_res, task)) 6312 &data->args.seq_args,
6678 return; 6313 &data->res.seq_res,
6679 rpc_call_start(task); 6314 task);
6680} 6315}
6681 6316
6682static void 6317static void
@@ -6685,7 +6320,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
6685 struct nfs4_layoutcommit_data *data = calldata; 6320 struct nfs4_layoutcommit_data *data = calldata;
6686 struct nfs_server *server = NFS_SERVER(data->args.inode); 6321 struct nfs_server *server = NFS_SERVER(data->args.inode);
6687 6322
6688 if (!nfs4_sequence_done(task, &data->res.seq_res)) 6323 if (!nfs41_sequence_done(task, &data->res.seq_res))
6689 return; 6324 return;
6690 6325
6691 switch (task->tk_status) { /* Just ignore these failures */ 6326 switch (task->tk_status) { /* Just ignore these failures */
@@ -6873,7 +6508,9 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6873 6508
6874 dprintk("NFS call test_stateid %p\n", stateid); 6509 dprintk("NFS call test_stateid %p\n", stateid);
6875 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); 6510 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
6876 status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); 6511 nfs4_set_sequence_privileged(&args.seq_args);
6512 status = nfs4_call_sync_sequence(server->client, server, &msg,
6513 &args.seq_args, &res.seq_res);
6877 if (status != NFS_OK) { 6514 if (status != NFS_OK) {
6878 dprintk("NFS reply test_stateid: failed, %d\n", status); 6515 dprintk("NFS reply test_stateid: failed, %d\n", status);
6879 return status; 6516 return status;
@@ -6920,8 +6557,9 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6920 6557
6921 dprintk("NFS call free_stateid %p\n", stateid); 6558 dprintk("NFS call free_stateid %p\n", stateid);
6922 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); 6559 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
6560 nfs4_set_sequence_privileged(&args.seq_args);
6923 status = nfs4_call_sync_sequence(server->client, server, &msg, 6561 status = nfs4_call_sync_sequence(server->client, server, &msg,
6924 &args.seq_args, &res.seq_res, 1); 6562 &args.seq_args, &res.seq_res);
6925 dprintk("NFS reply free_stateid: %d\n", status); 6563 dprintk("NFS reply free_stateid: %d\n", status);
6926 return status; 6564 return status;
6927} 6565}
@@ -7041,7 +6679,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
7041#if defined(CONFIG_NFS_V4_1) 6679#if defined(CONFIG_NFS_V4_1)
7042static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { 6680static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
7043 .minor_version = 1, 6681 .minor_version = 1,
7044 .call_sync = _nfs4_call_sync_session, 6682 .call_sync = nfs4_call_sync_sequence,
7045 .match_stateid = nfs41_match_stateid, 6683 .match_stateid = nfs41_match_stateid,
7046 .find_root_sec = nfs41_find_root_sec, 6684 .find_root_sec = nfs41_find_root_sec,
7047 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 6685 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
new file mode 100644
index 000000000000..ebda5f4a031b
--- /dev/null
+++ b/fs/nfs/nfs4session.c
@@ -0,0 +1,552 @@
1/*
2 * fs/nfs/nfs4session.c
3 *
4 * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 */
7#include <linux/kernel.h>
8#include <linux/errno.h>
9#include <linux/string.h>
10#include <linux/printk.h>
11#include <linux/slab.h>
12#include <linux/sunrpc/sched.h>
13#include <linux/sunrpc/bc_xprt.h>
14#include <linux/nfs.h>
15#include <linux/nfs4.h>
16#include <linux/nfs_fs.h>
17#include <linux/module.h>
18
19#include "nfs4_fs.h"
20#include "internal.h"
21#include "nfs4session.h"
22#include "callback.h"
23
24#define NFSDBG_FACILITY NFSDBG_STATE
25
26/*
27 * nfs4_shrink_slot_table - free retired slots from the slot table
28 */
29static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize)
30{
31 struct nfs4_slot **p;
32 if (newsize >= tbl->max_slots)
33 return;
34
35 p = &tbl->slots;
36 while (newsize--)
37 p = &(*p)->next;
38 while (*p) {
39 struct nfs4_slot *slot = *p;
40
41 *p = slot->next;
42 kfree(slot);
43 tbl->max_slots--;
44 }
45}
46
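The helper above trims a singly linked list with a pointer-to-pointer walk: advance past the slots to keep, then unlink and free the rest. A minimal userspace sketch of the same idiom, with invented types rather than the kernel's:

	#include <stdlib.h>

	struct node { struct node *next; };

	/* Keep the first 'keep' nodes of the list at *head and free the
	 * rest -- the same **p walk used by nfs4_shrink_slot_table(). */
	static void truncate_list(struct node **head, unsigned int keep)
	{
		struct node **p = head;

		while (keep-- && *p)
			p = &(*p)->next;
		while (*p) {
			struct node *victim = *p;

			*p = victim->next;
			free(victim);
		}
	}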
47/*
48 * nfs4_free_slot - free a slot and efficiently update slot table.
49 *
50 * Freeing a slot is trivially done by clearing its respective bit
51 * in the bitmap.
52 * If the freed slotid equals highest_used_slotid we want to update it
53 * so that the server can size down the slot table if needed;
54 * otherwise we know that the highest_used_slotid is still in use.
55 * When updating highest_used_slotid there may be "holes" in the bitmap,
56 * so we need to scan down from highest_used_slotid to 0 looking for the new
57 * highest slotid in use.
58 * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
59 *
60 * Must be called while holding tbl->slot_tbl_lock
61 */
62void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
63{
64 u32 slotid = slot->slot_nr;
65
66 /* clear used bit in bitmap */
67 __clear_bit(slotid, tbl->used_slots);
68
69 /* update highest_used_slotid when it is freed */
70 if (slotid == tbl->highest_used_slotid) {
71 u32 new_max = find_last_bit(tbl->used_slots, slotid);
72 if (new_max < slotid)
73 tbl->highest_used_slotid = new_max;
74 else {
75 tbl->highest_used_slotid = NFS4_NO_SLOT;
76 nfs4_session_drain_complete(tbl->session, tbl);
77 }
78 }
79 dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
80 slotid, tbl->highest_used_slotid);
81}
82
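When the freed slot was the highest in use, find_last_bit() locates the next-highest set bit below it; only if nothing is left does highest_used_slotid fall back to NFS4_NO_SLOT. A small standalone sketch of that downward scan, using a plain 64-bit word instead of the kernel bitmap (names and sizes are illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	#define NO_SLOT ((uint32_t)-1)

	/* Highest set bit strictly below 'limit', or NO_SLOT if none --
	 * the role find_last_bit() plays in nfs4_free_slot(). */
	static uint32_t highest_used_below(uint64_t used, uint32_t limit)
	{
		for (uint32_t i = limit; i-- > 0; )
			if (used & (1ULL << i))
				return i;
		return NO_SLOT;
	}

	int main(void)
	{
		uint64_t used = 0x15;	/* slots 0, 2 and 4 in use */

		used &= ~(1ULL << 4);	/* free slot 4, the current highest */
		printf("new highest: %d\n", (int)highest_used_below(used, 4));
		return 0;	/* prints "new highest: 2" */
	}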
83static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl,
84 u32 slotid, u32 seq_init, gfp_t gfp_mask)
85{
86 struct nfs4_slot *slot;
87
88 slot = kzalloc(sizeof(*slot), gfp_mask);
89 if (slot) {
90 slot->table = tbl;
91 slot->slot_nr = slotid;
92 slot->seq_nr = seq_init;
93 }
94 return slot;
95}
96
97static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl,
98 u32 slotid, u32 seq_init, gfp_t gfp_mask)
99{
100 struct nfs4_slot **p, *slot;
101
102 p = &tbl->slots;
103 for (;;) {
104 if (*p == NULL) {
105 *p = nfs4_new_slot(tbl, tbl->max_slots,
106 seq_init, gfp_mask);
107 if (*p == NULL)
108 break;
109 tbl->max_slots++;
110 }
111 slot = *p;
112 if (slot->slot_nr == slotid)
113 return slot;
114 p = &slot->next;
115 }
116 return ERR_PTR(-ENOMEM);
117}
118
119/*
120 * nfs4_alloc_slot - efficiently look for a free slot
121 *
122 * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap.
123 * If found, we mark the slot as used, update the highest_used_slotid,
124 * and set up the sequence operation args accordingly.
125 *
126 * Note: must be called while holding the slot_tbl_lock.
127 */
128struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
129{
130 struct nfs4_slot *ret = ERR_PTR(-EBUSY);
131 u32 slotid;
132
133 dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
134 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
135 tbl->max_slotid + 1);
136 slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1);
137 if (slotid > tbl->max_slotid)
138 goto out;
139 ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
140 if (IS_ERR(ret))
141 goto out;
142 __set_bit(slotid, tbl->used_slots);
143 if (slotid > tbl->highest_used_slotid ||
144 tbl->highest_used_slotid == NFS4_NO_SLOT)
145 tbl->highest_used_slotid = slotid;
146 ret->generation = tbl->generation;
147
148out:
149 dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d\n",
150 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
151 !IS_ERR(ret) ? ret->slot_nr : -1);
152 return ret;
153}
154
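Allocation is the mirror image: find the first zero bit at or below max_slotid, create the slot on demand if the list is still short, and raise highest_used_slotid when the new slot passes it. A compact sketch of the bitmap step only (illustrative, not the kernel API):

	#include <stdint.h>

	#define NO_SLOT ((uint32_t)-1)

	/* First clear bit in [0, max_slotid], or NO_SLOT when every slot in
	 * range is busy -- the find_first_zero_bit() step of nfs4_alloc_slot(). */
	static uint32_t first_free_slot(uint64_t used, uint32_t max_slotid)
	{
		for (uint32_t i = 0; i <= max_slotid; i++)
			if (!(used & (1ULL << i)))
				return i;
		return NO_SLOT;	/* caller reports -EBUSY */
	}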
155static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl,
156 u32 max_reqs, u32 ivalue)
157{
158 if (max_reqs <= tbl->max_slots)
159 return 0;
160 if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS)))
161 return 0;
162 return -ENOMEM;
163}
164
165static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl,
166 u32 server_highest_slotid,
167 u32 ivalue)
168{
169 struct nfs4_slot **p;
170
171 nfs4_shrink_slot_table(tbl, server_highest_slotid + 1);
172 p = &tbl->slots;
173 while (*p) {
174 (*p)->seq_nr = ivalue;
175 (*p)->interrupted = 0;
176 p = &(*p)->next;
177 }
178 tbl->highest_used_slotid = NFS4_NO_SLOT;
179 tbl->target_highest_slotid = server_highest_slotid;
180 tbl->server_highest_slotid = server_highest_slotid;
181 tbl->d_target_highest_slotid = 0;
182 tbl->d2_target_highest_slotid = 0;
183 tbl->max_slotid = server_highest_slotid;
184}
185
186/*
187 * (re)Initialise a slot table
188 */
189static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl,
190 u32 max_reqs, u32 ivalue)
191{
192 int ret;
193
194 dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
195 max_reqs, tbl->max_slots);
196
197 if (max_reqs > NFS4_MAX_SLOT_TABLE)
198 max_reqs = NFS4_MAX_SLOT_TABLE;
199
200 ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue);
201 if (ret)
202 goto out;
203
204 spin_lock(&tbl->slot_tbl_lock);
205 nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue);
206 spin_unlock(&tbl->slot_tbl_lock);
207
208 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
209 tbl, tbl->slots, tbl->max_slots);
210out:
211 dprintk("<-- %s: return %d\n", __func__, ret);
212 return ret;
213}
214
215/* Destroy the slot table */
216static void nfs4_destroy_slot_tables(struct nfs4_session *session)
217{
218 nfs4_shrink_slot_table(&session->fc_slot_table, 0);
219 nfs4_shrink_slot_table(&session->bc_slot_table, 0);
220}
221
222static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
223{
224 struct nfs4_sequence_args *args = task->tk_msg.rpc_argp;
225 struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
226 struct nfs4_slot *slot = pslot;
227 struct nfs4_slot_table *tbl = slot->table;
228
229 if (nfs4_session_draining(tbl->session) && !args->sa_privileged)
230 return false;
231 slot->generation = tbl->generation;
232 args->sa_slot = slot;
233 res->sr_timestamp = jiffies;
234 res->sr_slot = slot;
235 res->sr_status_flags = 0;
236 res->sr_status = 1;
237 return true;
238}
239
240static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
241 struct nfs4_slot *slot)
242{
243 if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot))
244 return true;
245 return false;
246}
247
248bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
249 struct nfs4_slot *slot)
250{
251 if (slot->slot_nr > tbl->max_slotid)
252 return false;
253 return __nfs41_wake_and_assign_slot(tbl, slot);
254}
255
256static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl)
257{
258 struct nfs4_slot *slot = nfs4_alloc_slot(tbl);
259 if (!IS_ERR(slot)) {
260 bool ret = __nfs41_wake_and_assign_slot(tbl, slot);
261 if (ret)
262 return ret;
263 nfs4_free_slot(tbl, slot);
264 }
265 return false;
266}
267
268void nfs41_wake_slot_table(struct nfs4_slot_table *tbl)
269{
270 for (;;) {
271 if (!nfs41_try_wake_next_slot_table_entry(tbl))
272 break;
273 }
274}
275
276static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl,
277 u32 target_highest_slotid)
278{
279 u32 max_slotid;
280
281 max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid);
282 if (max_slotid > tbl->server_highest_slotid)
283 max_slotid = tbl->server_highest_slotid;
284 if (max_slotid > tbl->target_highest_slotid)
285 max_slotid = tbl->target_highest_slotid;
286 tbl->max_slotid = max_slotid;
287 nfs41_wake_slot_table(tbl);
288}
289
290/* Update the client's idea of target_highest_slotid */
291static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl,
292 u32 target_highest_slotid)
293{
294 if (tbl->target_highest_slotid == target_highest_slotid)
295 return;
296 tbl->target_highest_slotid = target_highest_slotid;
297 tbl->generation++;
298}
299
300void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
301 u32 target_highest_slotid)
302{
303 spin_lock(&tbl->slot_tbl_lock);
304 nfs41_set_target_slotid_locked(tbl, target_highest_slotid);
305 tbl->d_target_highest_slotid = 0;
306 tbl->d2_target_highest_slotid = 0;
307 nfs41_set_max_slotid_locked(tbl, target_highest_slotid);
308 spin_unlock(&tbl->slot_tbl_lock);
309}
310
311static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl,
312 u32 highest_slotid)
313{
314 if (tbl->server_highest_slotid == highest_slotid)
315 return;
316 if (tbl->highest_used_slotid > highest_slotid)
317 return;
318 /* Deallocate slots */
319 nfs4_shrink_slot_table(tbl, highest_slotid + 1);
320 tbl->server_highest_slotid = highest_slotid;
321}
322
323static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2)
324{
325 s1 -= s2;
326 if (s1 == 0)
327 return 0;
328 if (s1 < 0)
329 return (s1 - 1) >> 1;
330 return (s1 + 1) >> 1;
331}
332
333static int nfs41_sign_s32(s32 s1)
334{
335 if (s1 > 0)
336 return 1;
337 if (s1 < 0)
338 return -1;
339 return 0;
340}
341
342static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2)
343{
344 if (!s1 || !s2)
345 return true;
346 return nfs41_sign_s32(s1) == nfs41_sign_s32(s2);
347}
348
349/* Try to eliminate outliers by checking for sharp changes in the
350 * derivatives and second derivatives
351 */
352static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl,
353 u32 new_target)
354{
355 s32 d_target, d2_target;
356 bool ret = true;
357
358 d_target = nfs41_derivative_target_slotid(new_target,
359 tbl->target_highest_slotid);
360 d2_target = nfs41_derivative_target_slotid(d_target,
361 tbl->d_target_highest_slotid);
362 /* Is first derivative same sign? */
363 if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid))
364 ret = false;
365 /* Is second derivative same sign? */
366 if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid))
367 ret = false;
368 tbl->d_target_highest_slotid = d_target;
369 tbl->d2_target_highest_slotid = d2_target;
370 return ret;
371}
372
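The filter above computes half-differences rounded away from zero, and a new target is treated as an outlier only when both the first and the second derivative flip sign against the stored values (a zero on either side counts as "same sign"). A short sketch of the arithmetic with made-up targets:

	#include <stdio.h>
	#include <stdint.h>

	/* Half of (s1 - s2), rounded away from zero, as in
	 * nfs41_derivative_target_slotid(); like the kernel version, this
	 * relies on arithmetic right shift of negative values. */
	static int32_t deriv(int32_t s1, int32_t s2)
	{
		s1 -= s2;
		if (s1 == 0)
			return 0;
		return s1 < 0 ? (s1 - 1) >> 1 : (s1 + 1) >> 1;
	}

	int main(void)
	{
		/* Targets 10 -> 11 -> 40: first derivatives +1 then +15. */
		int32_t d_old = deriv(11, 10);		/* +1 */
		int32_t d_new = deriv(40, 11);		/* +15 */
		int32_t d2 = deriv(d_new, d_old);	/* +7 */

		/* Both derivatives stay positive, so 40 is accepted here. */
		printf("d=%d d2=%d\n", d_new, d2);
		return 0;
	}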
373void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
374 struct nfs4_slot *slot,
375 struct nfs4_sequence_res *res)
376{
377 spin_lock(&tbl->slot_tbl_lock);
378 if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid))
379 nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid);
380 if (tbl->generation == slot->generation)
381 nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid);
382 nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid);
383 spin_unlock(&tbl->slot_tbl_lock);
384}
385
386/*
387 * Initialize or reset the forechannel and backchannel tables
388 */
389int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
390{
391 struct nfs4_slot_table *tbl;
392 int status;
393
394 dprintk("--> %s\n", __func__);
395 /* Fore channel */
396 tbl = &ses->fc_slot_table;
397 tbl->session = ses;
398 status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
399 if (status) /* -ENOMEM */
400 return status;
401 /* Back channel */
402 tbl = &ses->bc_slot_table;
403 tbl->session = ses;
404 status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
405 if (status && tbl->slots == NULL)
406 /* Fore and back channel share a connection so get
407 * both slot tables or neither */
408 nfs4_destroy_slot_tables(ses);
409 return status;
410}
411
412struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
413{
414 struct nfs4_session *session;
415 struct nfs4_slot_table *tbl;
416
417 session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
418 if (!session)
419 return NULL;
420
421 tbl = &session->fc_slot_table;
422 tbl->highest_used_slotid = NFS4_NO_SLOT;
423 spin_lock_init(&tbl->slot_tbl_lock);
424 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
425 init_completion(&tbl->complete);
426
427 tbl = &session->bc_slot_table;
428 tbl->highest_used_slotid = NFS4_NO_SLOT;
429 spin_lock_init(&tbl->slot_tbl_lock);
430 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
431 init_completion(&tbl->complete);
432
433 session->session_state = 1<<NFS4_SESSION_INITING;
434
435 session->clp = clp;
436 return session;
437}
438
439void nfs4_destroy_session(struct nfs4_session *session)
440{
441 struct rpc_xprt *xprt;
442 struct rpc_cred *cred;
443
444 cred = nfs4_get_exchange_id_cred(session->clp);
445 nfs4_proc_destroy_session(session, cred);
446 if (cred)
447 put_rpccred(cred);
448
449 rcu_read_lock();
450 xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
451 rcu_read_unlock();
452 dprintk("%s Destroy backchannel for xprt %p\n",
453 __func__, xprt);
454 xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
455 nfs4_destroy_slot_tables(session);
456 kfree(session);
457}
458
459/*
460 * With sessions, the client is not marked ready until after a
461 * successful EXCHANGE_ID and CREATE_SESSION.
462 *
463 * Map cl_cons_state errors to EPROTONOSUPPORT to indicate that
464 * other versions of NFS can be tried.
465 */
466static int nfs41_check_session_ready(struct nfs_client *clp)
467{
468 int ret;
469
470 if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
471 ret = nfs4_client_recover_expired_lease(clp);
472 if (ret)
473 return ret;
474 }
475 if (clp->cl_cons_state < NFS_CS_READY)
476 return -EPROTONOSUPPORT;
477 smp_rmb();
478 return 0;
479}
480
481int nfs4_init_session(struct nfs_server *server)
482{
483 struct nfs_client *clp = server->nfs_client;
484 struct nfs4_session *session;
485 unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
486 unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
487
488 if (!nfs4_has_session(clp))
489 return 0;
490
491 if (server->rsize != 0)
492 target_max_resp_sz = server->rsize;
493 target_max_resp_sz += nfs41_maxread_overhead;
494
495 if (server->wsize != 0)
496 target_max_rqst_sz = server->wsize;
497 target_max_rqst_sz += nfs41_maxwrite_overhead;
498
499 session = clp->cl_session;
500 spin_lock(&clp->cl_lock);
501 if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
502 /* Initialise targets and channel attributes */
503 session->fc_target_max_rqst_sz = target_max_rqst_sz;
504 session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
505 session->fc_target_max_resp_sz = target_max_resp_sz;
506 session->fc_attrs.max_resp_sz = target_max_resp_sz;
507 } else {
508 /* Just adjust the targets */
509 if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
510 session->fc_target_max_rqst_sz = target_max_rqst_sz;
511 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
512 }
513 if (target_max_resp_sz > session->fc_target_max_resp_sz) {
514 session->fc_target_max_resp_sz = target_max_resp_sz;
515 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
516 }
517 }
518 spin_unlock(&clp->cl_lock);
519
520 if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
521 nfs4_schedule_lease_recovery(clp);
522
523 return nfs41_check_session_ready(clp);
524}
525
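Outside the initial setup path, nfs4_init_session only raises the recorded targets and, when a target grows, flags NFS4CLNT_SESSION_RESET so the channel attributes get renegotiated. A tiny sketch of that "raise target, request reset" step (hypothetical helper, for illustration):

	/* Hypothetical helper mirroring the fc_target_max_* updates above:
	 * raise the stored target if needed and tell the caller whether a
	 * session reset should be scheduled. */
	static int raise_target(unsigned int *target, unsigned int wanted)
	{
		if (wanted > *target) {
			*target = wanted;
			return 1;	/* caller sets NFS4CLNT_SESSION_RESET */
		}
		return 0;
	}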
526int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
527{
528 struct nfs4_session *session = clp->cl_session;
529 int ret;
530
531 spin_lock(&clp->cl_lock);
532 if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
533 /*
534 * Do not set NFS_CS_CHECK_LEASE_TIME; instead set the
535 * DS lease to be equal to the MDS lease.
536 */
537 clp->cl_lease_time = lease_time;
538 clp->cl_last_renewal = jiffies;
539 }
540 spin_unlock(&clp->cl_lock);
541
542 ret = nfs41_check_session_ready(clp);
543 if (ret)
544 return ret;
545 /* Test for the DS role */
546 if (!is_ds_client(clp))
547 return -ENODEV;
548 return 0;
549}
550EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
551
552
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
new file mode 100644
index 000000000000..6f3cb39386d4
--- /dev/null
+++ b/fs/nfs/nfs4session.h
@@ -0,0 +1,142 @@
1/*
2 * fs/nfs/nfs4session.h
3 *
4 * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 */
7#ifndef __LINUX_FS_NFS_NFS4SESSION_H
8#define __LINUX_FS_NFS_NFS4SESSION_H
9
10/* maximum number of slots to use */
11#define NFS4_DEF_SLOT_TABLE_SIZE (16U)
12#define NFS4_MAX_SLOT_TABLE (1024U)
13#define NFS4_NO_SLOT ((u32)-1)
14
15#if IS_ENABLED(CONFIG_NFS_V4)
16
17/* Sessions slot seqid */
18struct nfs4_slot {
19 struct nfs4_slot_table *table;
20 struct nfs4_slot *next;
21 unsigned long generation;
22 u32 slot_nr;
23 u32 seq_nr;
24 unsigned int interrupted : 1;
25};
26
27/* Sessions */
28#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long))
29struct nfs4_slot_table {
30 struct nfs4_session *session; /* Parent session */
31 struct nfs4_slot *slots; /* seqid per slot */
32 unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
33 spinlock_t slot_tbl_lock;
34 struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */
35 u32 max_slots; /* # slots in table */
36 u32 max_slotid; /* Max allowed slotid value */
37 u32 highest_used_slotid; /* sent to server on each SEQ.
38 * op for dynamic resizing */
39 u32 target_highest_slotid; /* Server max_slot target */
40 u32 server_highest_slotid; /* Server highest slotid */
41 s32 d_target_highest_slotid; /* Derivative */
42 s32 d2_target_highest_slotid; /* 2nd derivative */
43 unsigned long generation; /* Generation counter for
44 target_highest_slotid */
45 struct completion complete;
46};
47
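SLOT_TABLE_SZ above rounds the 1024-slot maximum up to whole longs for the used_slots bitmap; on a 64-bit build that is DIV_ROUND_UP(1024, 64) = 16 words. A one-line check of the arithmetic:

	#include <stdio.h>

	#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

	int main(void)
	{
		/* 1024 slots, 8 * sizeof(long) bits per word -> 16 on LP64. */
		printf("%lu\n", DIV_ROUND_UP(1024UL, 8 * sizeof(long)));
		return 0;
	}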
48/*
49 * Session related parameters
50 */
51struct nfs4_session {
52 struct nfs4_sessionid sess_id;
53 u32 flags;
54 unsigned long session_state;
55 u32 hash_alg;
56 u32 ssv_len;
57
58 /* The fore and back channel */
59 struct nfs4_channel_attrs fc_attrs;
60 struct nfs4_slot_table fc_slot_table;
61 struct nfs4_channel_attrs bc_attrs;
62 struct nfs4_slot_table bc_slot_table;
63 struct nfs_client *clp;
64 /* Create session arguments */
65 unsigned int fc_target_max_rqst_sz;
66 unsigned int fc_target_max_resp_sz;
67};
68
69enum nfs4_session_state {
70 NFS4_SESSION_INITING,
71 NFS4_SESSION_DRAINING,
72};
73
74#if defined(CONFIG_NFS_V4_1)
75extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
76extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
77
78extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
79 u32 target_highest_slotid);
80extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
81 struct nfs4_slot *slot,
82 struct nfs4_sequence_res *res);
83
84extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
85
86extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
87extern void nfs4_destroy_session(struct nfs4_session *session);
88extern int nfs4_init_session(struct nfs_server *server);
89extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
90
91extern void nfs4_session_drain_complete(struct nfs4_session *session,
92 struct nfs4_slot_table *tbl);
93
94static inline bool nfs4_session_draining(struct nfs4_session *session)
95{
96 return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state);
97}
98
99bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
100 struct nfs4_slot *slot);
101void nfs41_wake_slot_table(struct nfs4_slot_table *tbl);
102
103/*
104 * Determine if sessions are in use.
105 */
106static inline int nfs4_has_session(const struct nfs_client *clp)
107{
108 if (clp->cl_session)
109 return 1;
110 return 0;
111}
112
113static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
114{
115 if (nfs4_has_session(clp))
116 return (clp->cl_session->flags & SESSION4_PERSIST);
117 return 0;
118}
119
120#else /* defined(CONFIG_NFS_V4_1) */
121
122static inline int nfs4_init_session(struct nfs_server *server)
123{
124 return 0;
125}
126
127/*
128 * Determine if sessions are in use.
129 */
130static inline int nfs4_has_session(const struct nfs_client *clp)
131{
132 return 0;
133}
134
135static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
136{
137 return 0;
138}
139
140#endif /* defined(CONFIG_NFS_V4_1) */
141#endif /* IS_ENABLED(CONFIG_NFS_V4) */
142#endif /* __LINUX_FS_NFS_NFS4SESSION_H */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c351e6b39838..9448c579d41a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -57,6 +57,7 @@
57#include "callback.h" 57#include "callback.h"
58#include "delegation.h" 58#include "delegation.h"
59#include "internal.h" 59#include "internal.h"
60#include "nfs4session.h"
60#include "pnfs.h" 61#include "pnfs.h"
61#include "netns.h" 62#include "netns.h"
62 63
@@ -66,7 +67,6 @@
66 67
67const nfs4_stateid zero_stateid; 68const nfs4_stateid zero_stateid;
68static DEFINE_MUTEX(nfs_clid_init_mutex); 69static DEFINE_MUTEX(nfs_clid_init_mutex);
69static LIST_HEAD(nfs4_clientid_list);
70 70
71int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 71int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
72{ 72{
@@ -254,24 +254,27 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
254{ 254{
255 struct nfs4_session *ses = clp->cl_session; 255 struct nfs4_session *ses = clp->cl_session;
256 struct nfs4_slot_table *tbl; 256 struct nfs4_slot_table *tbl;
257 int max_slots;
258 257
259 if (ses == NULL) 258 if (ses == NULL)
260 return; 259 return;
261 tbl = &ses->fc_slot_table; 260 tbl = &ses->fc_slot_table;
262 if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { 261 if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
263 spin_lock(&tbl->slot_tbl_lock); 262 spin_lock(&tbl->slot_tbl_lock);
264 max_slots = tbl->max_slots; 263 nfs41_wake_slot_table(tbl);
265 while (max_slots--) {
266 if (rpc_wake_up_first(&tbl->slot_tbl_waitq,
267 nfs4_set_task_privileged,
268 NULL) == NULL)
269 break;
270 }
271 spin_unlock(&tbl->slot_tbl_lock); 264 spin_unlock(&tbl->slot_tbl_lock);
272 } 265 }
273} 266}
274 267
268/*
269 * Signal state manager thread if session fore channel is drained
270 */
271void nfs4_session_drain_complete(struct nfs4_session *session,
272 struct nfs4_slot_table *tbl)
273{
274 if (nfs4_session_draining(session))
275 complete(&tbl->complete);
276}
277
275static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) 278static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
276{ 279{
277 spin_lock(&tbl->slot_tbl_lock); 280 spin_lock(&tbl->slot_tbl_lock);
@@ -303,7 +306,6 @@ static void nfs41_finish_session_reset(struct nfs_client *clp)
303 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); 306 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
304 clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); 307 clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
305 /* create_session negotiated new slot table */ 308 /* create_session negotiated new slot table */
306 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
307 clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); 309 clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
308 nfs41_setup_state_renewal(clp); 310 nfs41_setup_state_renewal(clp);
309} 311}
@@ -1086,7 +1088,6 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
1086 */ 1088 */
1087static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) 1089static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
1088{ 1090{
1089 BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid);
1090 switch (status) { 1091 switch (status) {
1091 case 0: 1092 case 0:
1092 break; 1093 break;
@@ -1209,6 +1210,40 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1209} 1210}
1210EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); 1211EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
1211 1212
1213int nfs4_wait_clnt_recover(struct nfs_client *clp)
1214{
1215 int res;
1216
1217 might_sleep();
1218
1219 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
1220 nfs_wait_bit_killable, TASK_KILLABLE);
1221 if (res)
1222 return res;
1223
1224 if (clp->cl_cons_state < 0)
1225 return clp->cl_cons_state;
1226 return 0;
1227}
1228
1229int nfs4_client_recover_expired_lease(struct nfs_client *clp)
1230{
1231 unsigned int loop;
1232 int ret;
1233
1234 for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
1235 ret = nfs4_wait_clnt_recover(clp);
1236 if (ret != 0)
1237 break;
1238 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1239 !test_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state))
1240 break;
1241 nfs4_schedule_state_manager(clp);
1242 ret = -EIO;
1243 }
1244 return ret;
1245}
1246
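The recovery loop above is deliberately bounded: each pass waits for the state manager, re-checks the lease bits, kicks the manager again, and gives up with -EIO once NFS4_MAX_LOOP_ON_RECOVER passes have produced no progress. The same shape as a standalone sketch (stubbed condition and an assumed loop count, not the kernel values):

	#include <stdio.h>

	#define MAX_LOOP_ON_RECOVER 10	/* assumed value, for illustration */
	#define EIO 5

	static int lease_still_expired(unsigned int pass)
	{
		return pass < 3;	/* stub: recovers on the third pass */
	}

	static int recover_expired_lease(void)
	{
		int ret = 0;

		for (unsigned int loop = MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
			ret = 0;	/* stands in for nfs4_wait_clnt_recover() */
			if (!lease_still_expired(MAX_LOOP_ON_RECOVER - loop))
				break;	/* recovered: ret stays 0 */
			/* ...kick the state manager here... */
			ret = -EIO;	/* provisional, cleared on success */
		}
		return ret;
	}

	int main(void)
	{
		printf("%d\n", recover_expired_lease());	/* prints 0 */
		return 0;
	}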
1212/* 1247/*
1213 * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN 1248 * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
1214 * @clp: client to process 1249 * @clp: client to process
@@ -1401,14 +1436,6 @@ restart:
1401 /* Mark the file as being 'closed' */ 1436 /* Mark the file as being 'closed' */
1402 state->state = 0; 1437 state->state = 0;
1403 break; 1438 break;
1404 case -EKEYEXPIRED:
1405 /*
1406 * User RPCSEC_GSS context has expired.
1407 * We cannot recover this stateid now, so
1408 * skip it and allow recovery thread to
1409 * proceed.
1410 */
1411 break;
1412 case -NFS4ERR_ADMIN_REVOKED: 1439 case -NFS4ERR_ADMIN_REVOKED:
1413 case -NFS4ERR_STALE_STATEID: 1440 case -NFS4ERR_STALE_STATEID:
1414 case -NFS4ERR_BAD_STATEID: 1441 case -NFS4ERR_BAD_STATEID:
@@ -1561,14 +1588,6 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
1561 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); 1588 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
1562} 1589}
1563 1590
1564static void nfs4_warn_keyexpired(const char *s)
1565{
1566 printk_ratelimited(KERN_WARNING "Error: state manager"
1567 " encountered RPCSEC_GSS session"
1568 " expired against NFSv4 server %s.\n",
1569 s);
1570}
1571
1572static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) 1591static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1573{ 1592{
1574 switch (error) { 1593 switch (error) {
@@ -1602,10 +1621,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1602 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1621 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1603 set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); 1622 set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
1604 break; 1623 break;
1605 case -EKEYEXPIRED:
1606 /* Nothing we can do */
1607 nfs4_warn_keyexpired(clp->cl_hostname);
1608 break;
1609 default: 1624 default:
1610 dprintk("%s: failed to handle error %d for server %s\n", 1625 dprintk("%s: failed to handle error %d for server %s\n",
1611 __func__, error, clp->cl_hostname); 1626 __func__, error, clp->cl_hostname);
@@ -1722,8 +1737,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
1722 dprintk("%s: exit with error %d for server %s\n", 1737 dprintk("%s: exit with error %d for server %s\n",
1723 __func__, -EPROTONOSUPPORT, clp->cl_hostname); 1738 __func__, -EPROTONOSUPPORT, clp->cl_hostname);
1724 return -EPROTONOSUPPORT; 1739 return -EPROTONOSUPPORT;
1725 case -EKEYEXPIRED:
1726 nfs4_warn_keyexpired(clp->cl_hostname);
1727 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1740 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
1728 * in nfs4_exchange_id */ 1741 * in nfs4_exchange_id */
1729 default: 1742 default:
@@ -1876,7 +1889,6 @@ again:
1876 break; 1889 break;
1877 1890
1878 case -EKEYEXPIRED: 1891 case -EKEYEXPIRED:
1879 nfs4_warn_keyexpired(clp->cl_hostname);
1880 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1892 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
1881 * in nfs4_exchange_id */ 1893 * in nfs4_exchange_id */
1882 status = -EKEYEXPIRED; 1894 status = -EKEYEXPIRED;
@@ -1907,14 +1919,23 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
1907} 1919}
1908EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); 1920EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1909 1921
1910void nfs41_handle_recall_slot(struct nfs_client *clp) 1922static void nfs41_ping_server(struct nfs_client *clp)
1911{ 1923{
1912 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); 1924 /* Use CHECK_LEASE to ping the server with a SEQUENCE */
1913 dprintk("%s: scheduling slot recall for server %s\n", __func__, 1925 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1914 clp->cl_hostname);
1915 nfs4_schedule_state_manager(clp); 1926 nfs4_schedule_state_manager(clp);
1916} 1927}
1917 1928
1929void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
1930{
1931 nfs41_ping_server(clp);
1932}
1933
1934void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
1935{
1936 nfs41_ping_server(clp);
1937}
1938
1918static void nfs4_reset_all_state(struct nfs_client *clp) 1939static void nfs4_reset_all_state(struct nfs_client *clp)
1919{ 1940{
1920 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1941 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
@@ -2024,35 +2045,6 @@ out:
2024 return status; 2045 return status;
2025} 2046}
2026 2047
2027static int nfs4_recall_slot(struct nfs_client *clp)
2028{
2029 struct nfs4_slot_table *fc_tbl;
2030 struct nfs4_slot *new, *old;
2031 int i;
2032
2033 if (!nfs4_has_session(clp))
2034 return 0;
2035 nfs4_begin_drain_session(clp);
2036 fc_tbl = &clp->cl_session->fc_slot_table;
2037 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
2038 GFP_NOFS);
2039 if (!new)
2040 return -ENOMEM;
2041
2042 spin_lock(&fc_tbl->slot_tbl_lock);
2043 for (i = 0; i < fc_tbl->target_max_slots; i++)
2044 new[i].seq_nr = fc_tbl->slots[i].seq_nr;
2045 old = fc_tbl->slots;
2046 fc_tbl->slots = new;
2047 fc_tbl->max_slots = fc_tbl->target_max_slots;
2048 fc_tbl->target_max_slots = 0;
2049 clp->cl_session->fc_attrs.max_reqs = fc_tbl->max_slots;
2050 spin_unlock(&fc_tbl->slot_tbl_lock);
2051
2052 kfree(old);
2053 return 0;
2054}
2055
2056static int nfs4_bind_conn_to_session(struct nfs_client *clp) 2048static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2057{ 2049{
2058 struct rpc_cred *cred; 2050 struct rpc_cred *cred;
@@ -2083,7 +2075,6 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2083#else /* CONFIG_NFS_V4_1 */ 2075#else /* CONFIG_NFS_V4_1 */
2084static int nfs4_reset_session(struct nfs_client *clp) { return 0; } 2076static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
2085static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } 2077static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
2086static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
2087 2078
2088static int nfs4_bind_conn_to_session(struct nfs_client *clp) 2079static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2089{ 2080{
@@ -2115,15 +2106,6 @@ static void nfs4_state_manager(struct nfs_client *clp)
2115 continue; 2106 continue;
2116 } 2107 }
2117 2108
2118 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
2119 section = "check lease";
2120 status = nfs4_check_lease(clp);
2121 if (status < 0)
2122 goto out_error;
2123 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
2124 continue;
2125 }
2126
2127 /* Initialize or reset the session */ 2109 /* Initialize or reset the session */
2128 if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) { 2110 if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) {
2129 section = "reset session"; 2111 section = "reset session";
@@ -2144,10 +2126,9 @@ static void nfs4_state_manager(struct nfs_client *clp)
2144 continue; 2126 continue;
2145 } 2127 }
2146 2128
2147 /* Recall session slots */ 2129 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
2148 if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)) { 2130 section = "check lease";
2149 section = "recall slot"; 2131 status = nfs4_check_lease(clp);
2150 status = nfs4_recall_slot(clp);
2151 if (status < 0) 2132 if (status < 0)
2152 goto out_error; 2133 goto out_error;
2153 continue; 2134 continue;
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index bd61221ad2c5..84d2e9e2f313 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -51,6 +51,7 @@ static const struct super_operations nfs4_sops = {
51 .alloc_inode = nfs_alloc_inode, 51 .alloc_inode = nfs_alloc_inode,
52 .destroy_inode = nfs_destroy_inode, 52 .destroy_inode = nfs_destroy_inode,
53 .write_inode = nfs4_write_inode, 53 .write_inode = nfs4_write_inode,
54 .drop_inode = nfs_drop_inode,
54 .put_super = nfs_put_super, 55 .put_super = nfs_put_super,
55 .statfs = nfs_statfs, 56 .statfs = nfs_statfs,
56 .evict_inode = nfs4_evict_inode, 57 .evict_inode = nfs4_evict_inode,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 40836ee5dc3a..26b143920433 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -56,6 +56,7 @@
56 56
57#include "nfs4_fs.h" 57#include "nfs4_fs.h"
58#include "internal.h" 58#include "internal.h"
59#include "nfs4session.h"
59#include "pnfs.h" 60#include "pnfs.h"
60#include "netns.h" 61#include "netns.h"
61 62
@@ -270,6 +271,8 @@ static int nfs4_stat_to_errno(int);
270 271
271#if defined(CONFIG_NFS_V4_1) 272#if defined(CONFIG_NFS_V4_1)
272#define NFS4_MAX_MACHINE_NAME_LEN (64) 273#define NFS4_MAX_MACHINE_NAME_LEN (64)
274#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \
275 sizeof(utsname()->version) + sizeof(utsname()->machine) + 8)
273 276
274#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ 277#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
275 encode_verifier_maxsz + \ 278 encode_verifier_maxsz + \
@@ -282,7 +285,7 @@ static int nfs4_stat_to_errno(int);
282 1 /* nii_domain */ + \ 285 1 /* nii_domain */ + \
283 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 286 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
284 1 /* nii_name */ + \ 287 1 /* nii_name */ + \
285 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 288 XDR_QUADLEN(IMPL_NAME_LIMIT) + \
286 3 /* nii_date */) 289 3 /* nii_date */)
287#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ 290#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
288 2 /* eir_clientid */ + \ 291 2 /* eir_clientid */ + \
@@ -936,7 +939,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
936 * but this is not required as a MUST for the server to do so. */ 939 * but this is not required as a MUST for the server to do so. */
937 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; 940 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
938 941
939 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 942 WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
940 encode_string(xdr, hdr->taglen, hdr->tag); 943 encode_string(xdr, hdr->taglen, hdr->tag);
941 p = reserve_space(xdr, 8); 944 p = reserve_space(xdr, 8);
942 *p++ = cpu_to_be32(hdr->minorversion); 945 *p++ = cpu_to_be32(hdr->minorversion);
@@ -955,7 +958,7 @@ static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op,
955 958
956static void encode_nops(struct compound_hdr *hdr) 959static void encode_nops(struct compound_hdr *hdr)
957{ 960{
958 BUG_ON(hdr->nops > NFS4_MAX_OPS); 961 WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS);
959 *hdr->nops_p = htonl(hdr->nops); 962 *hdr->nops_p = htonl(hdr->nops);
960} 963}
961 964
@@ -1403,7 +1406,6 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
1403 *p = cpu_to_be32(NFS4_OPEN_NOCREATE); 1406 *p = cpu_to_be32(NFS4_OPEN_NOCREATE);
1404 break; 1407 break;
1405 default: 1408 default:
1406 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
1407 *p = cpu_to_be32(NFS4_OPEN_CREATE); 1409 *p = cpu_to_be32(NFS4_OPEN_CREATE);
1408 encode_createmode(xdr, arg); 1410 encode_createmode(xdr, arg);
1409 } 1411 }
@@ -1621,7 +1623,6 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1621 p = reserve_space(xdr, 2*4); 1623 p = reserve_space(xdr, 2*4);
1622 *p++ = cpu_to_be32(1); 1624 *p++ = cpu_to_be32(1);
1623 *p = cpu_to_be32(FATTR4_WORD0_ACL); 1625 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1624 BUG_ON(arg->acl_len % 4);
1625 p = reserve_space(xdr, 4); 1626 p = reserve_space(xdr, 4);
1626 *p = cpu_to_be32(arg->acl_len); 1627 *p = cpu_to_be32(arg->acl_len);
1627 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1628 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
@@ -1713,7 +1714,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1713 struct compound_hdr *hdr) 1714 struct compound_hdr *hdr)
1714{ 1715{
1715 __be32 *p; 1716 __be32 *p;
1716 char impl_name[NFS4_OPAQUE_LIMIT]; 1717 char impl_name[IMPL_NAME_LIMIT];
1717 int len = 0; 1718 int len = 0;
1718 1719
1719 encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); 1720 encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
@@ -1728,7 +1729,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1728 if (send_implementation_id && 1729 if (send_implementation_id &&
1729 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && 1730 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
1730 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) 1731 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN)
1731 <= NFS4_OPAQUE_LIMIT + 1) 1732 <= sizeof(impl_name) + 1)
1732 len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", 1733 len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s",
1733 utsname()->sysname, utsname()->release, 1734 utsname()->sysname, utsname()->release,
1734 utsname()->version, utsname()->machine); 1735 utsname()->version, utsname()->machine);
@@ -1835,18 +1836,16 @@ static void encode_sequence(struct xdr_stream *xdr,
1835 struct compound_hdr *hdr) 1836 struct compound_hdr *hdr)
1836{ 1837{
1837#if defined(CONFIG_NFS_V4_1) 1838#if defined(CONFIG_NFS_V4_1)
1838 struct nfs4_session *session = args->sa_session; 1839 struct nfs4_session *session;
1839 struct nfs4_slot_table *tp; 1840 struct nfs4_slot_table *tp;
1840 struct nfs4_slot *slot; 1841 struct nfs4_slot *slot = args->sa_slot;
1841 __be32 *p; 1842 __be32 *p;
1842 1843
1843 if (!session) 1844 if (slot == NULL)
1844 return; 1845 return;
1845 1846
1846 tp = &session->fc_slot_table; 1847 tp = slot->table;
1847 1848 session = tp->session;
1848 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
1849 slot = tp->slots + args->sa_slotid;
1850 1849
1851 encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); 1850 encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
1852 1851
@@ -1860,12 +1859,12 @@ static void encode_sequence(struct xdr_stream *xdr,
1860 ((u32 *)session->sess_id.data)[1], 1859 ((u32 *)session->sess_id.data)[1],
1861 ((u32 *)session->sess_id.data)[2], 1860 ((u32 *)session->sess_id.data)[2],
1862 ((u32 *)session->sess_id.data)[3], 1861 ((u32 *)session->sess_id.data)[3],
1863 slot->seq_nr, args->sa_slotid, 1862 slot->seq_nr, slot->slot_nr,
1864 tp->highest_used_slotid, args->sa_cache_this); 1863 tp->highest_used_slotid, args->sa_cache_this);
1865 p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16); 1864 p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);
1866 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1865 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1867 *p++ = cpu_to_be32(slot->seq_nr); 1866 *p++ = cpu_to_be32(slot->seq_nr);
1868 *p++ = cpu_to_be32(args->sa_slotid); 1867 *p++ = cpu_to_be32(slot->slot_nr);
1869 *p++ = cpu_to_be32(tp->highest_used_slotid); 1868 *p++ = cpu_to_be32(tp->highest_used_slotid);
1870 *p = cpu_to_be32(args->sa_cache_this); 1869 *p = cpu_to_be32(args->sa_cache_this);
1871#endif /* CONFIG_NFS_V4_1 */ 1870#endif /* CONFIG_NFS_V4_1 */
@@ -2027,8 +2026,9 @@ static void encode_free_stateid(struct xdr_stream *xdr,
2027static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) 2026static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
2028{ 2027{
2029#if defined(CONFIG_NFS_V4_1) 2028#if defined(CONFIG_NFS_V4_1)
2030 if (args->sa_session) 2029
2031 return args->sa_session->clp->cl_mvops->minor_version; 2030 if (args->sa_slot)
2031 return args->sa_slot->table->session->clp->cl_mvops->minor_version;
2032#endif /* CONFIG_NFS_V4_1 */ 2032#endif /* CONFIG_NFS_V4_1 */
2033 return 0; 2033 return 0;
2034} 2034}
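The payoff of the new slot structure shows up here: the sequence arguments carry only a slot pointer, and both the table and the session are reached through it, so a NULL slot is now the "no SEQUENCE op" signal. A sketch of the lookup chain with stand-in types (not the kernel definitions):

	#include <stdint.h>
	#include <stddef.h>

	struct sk_session { int minor_version; };
	struct sk_table   { struct sk_session *session; };
	struct sk_slot    { struct sk_table *table; uint32_t slot_nr; };

	/* Mirrors the reworked nfs4_xdr_minorversion(): everything hangs off
	 * the slot, and a NULL slot falls back to minor version 0. */
	static int minorversion(const struct sk_slot *sa_slot)
	{
		if (sa_slot)
			return sa_slot->table->session->minor_version;
		return 0;
	}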
@@ -5509,12 +5509,13 @@ static int decode_sequence(struct xdr_stream *xdr,
5509 struct rpc_rqst *rqstp) 5509 struct rpc_rqst *rqstp)
5510{ 5510{
5511#if defined(CONFIG_NFS_V4_1) 5511#if defined(CONFIG_NFS_V4_1)
5512 struct nfs4_session *session;
5512 struct nfs4_sessionid id; 5513 struct nfs4_sessionid id;
5513 u32 dummy; 5514 u32 dummy;
5514 int status; 5515 int status;
5515 __be32 *p; 5516 __be32 *p;
5516 5517
5517 if (!res->sr_session) 5518 if (res->sr_slot == NULL)
5518 return 0; 5519 return 0;
5519 5520
5520 status = decode_op_hdr(xdr, OP_SEQUENCE); 5521 status = decode_op_hdr(xdr, OP_SEQUENCE);
@@ -5528,8 +5529,9 @@ static int decode_sequence(struct xdr_stream *xdr,
5528 * sequence number, the server is looney tunes. 5529 * sequence number, the server is looney tunes.
5529 */ 5530 */
5530 status = -EREMOTEIO; 5531 status = -EREMOTEIO;
5532 session = res->sr_slot->table->session;
5531 5533
5532 if (memcmp(id.data, res->sr_session->sess_id.data, 5534 if (memcmp(id.data, session->sess_id.data,
5533 NFS4_MAX_SESSIONID_LEN)) { 5535 NFS4_MAX_SESSIONID_LEN)) {
5534 dprintk("%s Invalid session id\n", __func__); 5536 dprintk("%s Invalid session id\n", __func__);
5535 goto out_err; 5537 goto out_err;
@@ -5547,14 +5549,14 @@ static int decode_sequence(struct xdr_stream *xdr,
5547 } 5549 }
5548 /* slot id */ 5550 /* slot id */
5549 dummy = be32_to_cpup(p++); 5551 dummy = be32_to_cpup(p++);
5550 if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { 5552 if (dummy != res->sr_slot->slot_nr) {
5551 dprintk("%s Invalid slot id\n", __func__); 5553 dprintk("%s Invalid slot id\n", __func__);
5552 goto out_err; 5554 goto out_err;
5553 } 5555 }
5554 /* highest slot id - currently not processed */ 5556 /* highest slot id */
5555 dummy = be32_to_cpup(p++); 5557 res->sr_highest_slotid = be32_to_cpup(p++);
5556 /* target highest slot id - currently not processed */ 5558 /* target highest slot id */
5557 dummy = be32_to_cpup(p++); 5559 res->sr_target_highest_slotid = be32_to_cpup(p++);
5558 /* result flags */ 5560 /* result flags */
5559 res->sr_status_flags = be32_to_cpup(p); 5561 res->sr_status_flags = be32_to_cpup(p);
5560 status = 0; 5562 status = 0;
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 874613545301..a9ebd817278b 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -148,17 +148,6 @@ end_offset(u64 start, u64 len)
148 return end >= start ? end : NFS4_MAX_UINT64; 148 return end >= start ? end : NFS4_MAX_UINT64;
149} 149}
150 150
151/* last octet in a range */
152static inline u64
153last_byte_offset(u64 start, u64 len)
154{
155 u64 end;
156
157 BUG_ON(!len);
158 end = start + len;
159 return end > start ? end - 1 : NFS4_MAX_UINT64;
160}
161
162static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, 151static void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
163 struct page ***p_pages, unsigned *p_pgbase, 152 struct page ***p_pages, unsigned *p_pgbase,
164 u64 offset, unsigned long count) 153 u64 offset, unsigned long count)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2878f97bd78d..e7165d915362 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -369,17 +369,6 @@ end_offset(u64 start, u64 len)
369 return end >= start ? end : NFS4_MAX_UINT64; 369 return end >= start ? end : NFS4_MAX_UINT64;
370} 370}
371 371
372/* last octet in a range */
373static inline u64
374last_byte_offset(u64 start, u64 len)
375{
376 u64 end;
377
378 BUG_ON(!len);
379 end = start + len;
380 return end > start ? end - 1 : NFS4_MAX_UINT64;
381}
382
383/* 372/*
384 * is l2 fully contained in l1? 373 * is l2 fully contained in l1?
385 * start1 end1 374 * start1 end1
@@ -645,7 +634,6 @@ send_layoutget(struct pnfs_layout_hdr *lo,
645 634
646 dprintk("--> %s\n", __func__); 635 dprintk("--> %s\n", __func__);
647 636
648 BUG_ON(ctx == NULL);
649 lgp = kzalloc(sizeof(*lgp), gfp_flags); 637 lgp = kzalloc(sizeof(*lgp), gfp_flags);
650 if (lgp == NULL) 638 if (lgp == NULL)
651 return NULL; 639 return NULL;
@@ -1126,7 +1114,6 @@ pnfs_update_layout(struct inode *ino,
1126 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1114 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1127 */ 1115 */
1128 spin_lock(&clp->cl_lock); 1116 spin_lock(&clp->cl_lock);
1129 BUG_ON(!list_empty(&lo->plh_layouts));
1130 list_add_tail(&lo->plh_layouts, &server->layouts); 1117 list_add_tail(&lo->plh_layouts, &server->layouts);
1131 spin_unlock(&clp->cl_lock); 1118 spin_unlock(&clp->cl_lock);
1132 } 1119 }
@@ -1222,7 +1209,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
1222{ 1209{
1223 u64 rd_size = req->wb_bytes; 1210 u64 rd_size = req->wb_bytes;
1224 1211
1225 BUG_ON(pgio->pg_lseg != NULL); 1212 WARN_ON_ONCE(pgio->pg_lseg != NULL);
1226 1213
1227 if (req->wb_offset != req->wb_pgbase) { 1214 if (req->wb_offset != req->wb_pgbase) {
1228 nfs_pageio_reset_read_mds(pgio); 1215 nfs_pageio_reset_read_mds(pgio);
@@ -1251,7 +1238,7 @@ void
1251pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1238pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1252 struct nfs_page *req, u64 wb_size) 1239 struct nfs_page *req, u64 wb_size)
1253{ 1240{
1254 BUG_ON(pgio->pg_lseg != NULL); 1241 WARN_ON_ONCE(pgio->pg_lseg != NULL);
1255 1242
1256 if (req->wb_offset != req->wb_pgbase) { 1243 if (req->wb_offset != req->wb_pgbase) {
1257 nfs_pageio_reset_write_mds(pgio); 1244 nfs_pageio_reset_write_mds(pgio);
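The objlayout.c and pnfs.c hunks above each delete a private copy of last_byte_offset() (and relax several BUG_ON()s to WARN_ON_ONCE(), so a bad state logs once instead of killing the machine). For reference, the removed helper mapped a (start, len) range to its last byte, saturating to NFS4_MAX_UINT64 when start + len wraps around. Its semantics, reproduced standalone:

#include <stdint.h>
#include <stdio.h>

#define NFS4_MAX_UINT64 UINT64_MAX

/* Same logic as the helper removed above: last octet of [start, start+len),
 * treating unsigned wraparound as "range runs to the end of the file". */
static uint64_t last_byte_offset(uint64_t start, uint64_t len)
{
	uint64_t end = start + len;	/* len must be non-zero */

	return end > start ? end - 1 : NFS4_MAX_UINT64;
}

int main(void)
{
	printf("%llu\n",
	       (unsigned long long)last_byte_offset(4096, 512));	/* 4607 */
	printf("%llu\n",
	       (unsigned long long)last_byte_offset(UINT64_MAX - 1, 2)); /* saturates */
	return 0;
}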
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 50a88c3546ed..f084dac948e1 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -47,39 +47,6 @@
47#define NFSDBG_FACILITY NFSDBG_PROC 47#define NFSDBG_FACILITY NFSDBG_PROC
48 48
49/* 49/*
50 * wrapper to handle the -EKEYEXPIRED error message. This should generally
51 * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't
52 * support the NFSERR_JUKEBOX error code, but we handle this situation in the
53 * same way that we handle that error with NFSv3.
54 */
55static int
56nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
57{
58 int res;
59 do {
60 res = rpc_call_sync(clnt, msg, flags);
61 if (res != -EKEYEXPIRED)
62 break;
63 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
64 res = -ERESTARTSYS;
65 } while (!fatal_signal_pending(current));
66 return res;
67}
68
69#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags)
70
71static int
72nfs_async_handle_expired_key(struct rpc_task *task)
73{
74 if (task->tk_status != -EKEYEXPIRED)
75 return 0;
76 task->tk_status = 0;
77 rpc_restart_call(task);
78 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
79 return 1;
80}
81
82/*
83 * Bare-bones access to getattr: this is for nfs_read_super. 50 * Bare-bones access to getattr: this is for nfs_read_super.
84 */ 51 */
85static int 52static int
@@ -364,8 +331,6 @@ static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlink
364 331
365static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) 332static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
366{ 333{
367 if (nfs_async_handle_expired_key(task))
368 return 0;
369 nfs_mark_for_revalidate(dir); 334 nfs_mark_for_revalidate(dir);
370 return 1; 335 return 1;
371} 336}
@@ -385,8 +350,6 @@ static int
385nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, 350nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
386 struct inode *new_dir) 351 struct inode *new_dir)
387{ 352{
388 if (nfs_async_handle_expired_key(task))
389 return 0;
390 nfs_mark_for_revalidate(old_dir); 353 nfs_mark_for_revalidate(old_dir);
391 nfs_mark_for_revalidate(new_dir); 354 nfs_mark_for_revalidate(new_dir);
392 return 1; 355 return 1;
@@ -642,9 +605,6 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
642{ 605{
643 struct inode *inode = data->header->inode; 606 struct inode *inode = data->header->inode;
644 607
645 if (nfs_async_handle_expired_key(task))
646 return -EAGAIN;
647
648 nfs_invalidate_atime(inode); 608 nfs_invalidate_atime(inode);
649 if (task->tk_status >= 0) { 609 if (task->tk_status >= 0) {
650 nfs_refresh_inode(inode, data->res.fattr); 610 nfs_refresh_inode(inode, data->res.fattr);
@@ -671,9 +631,6 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
671{ 631{
672 struct inode *inode = data->header->inode; 632 struct inode *inode = data->header->inode;
673 633
674 if (nfs_async_handle_expired_key(task))
675 return -EAGAIN;
676
677 if (task->tk_status >= 0) 634 if (task->tk_status >= 0)
678 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); 635 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
679 return 0; 636 return 0;
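The deleted proc.c wrapper retried a synchronous RPC whenever it failed with -EKEYEXPIRED (typically an expired krb5 TGT), sleeping between attempts until a fatal signal arrived; with this patch that handling presumably lives in shared RPC client code rather than per-operation NFSv2 wrappers. The shape of the removed loop, as a generic hedged sketch with stand-in helpers:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int attempts;

/* Stand-in for rpc_call_sync(): fail twice with -EKEYEXPIRED, then succeed. */
static int do_call(void)
{
	return ++attempts <= 2 ? -EKEYEXPIRED : 0;
}

static bool fatal_signal_pending(void) { return false; }

/* The pattern the deleted wrapper used: retry a transient "key expired"
 * failure until it clears or the caller is being killed. */
static int call_with_retry(void)
{
	int res;

	do {
		res = do_call();
		if (res != -EKEYEXPIRED)
			break;
		/* the kernel slept NFS_JUKEBOX_RETRY_TIME here */
		res = -ERESTARTSYS;	/* reported if a signal ends the loop */
	} while (!fatal_signal_pending());
	return res;
}

int main(void)
{
	printf("result=%d after %d attempts\n", call_with_retry(), attempts);
	return 0;
}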
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 652d3f7176a9..aa5315bb3666 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -64,6 +64,7 @@
64#include "iostat.h" 64#include "iostat.h"
65#include "internal.h" 65#include "internal.h"
66#include "fscache.h" 66#include "fscache.h"
67#include "nfs4session.h"
67#include "pnfs.h" 68#include "pnfs.h"
68#include "nfs.h" 69#include "nfs.h"
69 70
@@ -307,6 +308,7 @@ const struct super_operations nfs_sops = {
307 .alloc_inode = nfs_alloc_inode, 308 .alloc_inode = nfs_alloc_inode,
308 .destroy_inode = nfs_destroy_inode, 309 .destroy_inode = nfs_destroy_inode,
309 .write_inode = nfs_write_inode, 310 .write_inode = nfs_write_inode,
311 .drop_inode = nfs_drop_inode,
310 .put_super = nfs_put_super, 312 .put_super = nfs_put_super,
311 .statfs = nfs_statfs, 313 .statfs = nfs_statfs,
312 .evict_inode = nfs_evict_inode, 314 .evict_inode = nfs_evict_inode,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9347ab7c9574..5209916e1222 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -202,7 +202,6 @@ out:
202/* A writeback failed: mark the page as bad, and invalidate the page cache */ 202/* A writeback failed: mark the page as bad, and invalidate the page cache */
203static void nfs_set_pageerror(struct page *page) 203static void nfs_set_pageerror(struct page *page)
204{ 204{
205 SetPageError(page);
206 nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); 205 nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
207} 206}
208 207
@@ -239,21 +238,18 @@ int nfs_congestion_kb;
239#define NFS_CONGESTION_OFF_THRESH \ 238#define NFS_CONGESTION_OFF_THRESH \
240 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) 239 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
241 240
242static int nfs_set_page_writeback(struct page *page) 241static void nfs_set_page_writeback(struct page *page)
243{ 242{
243 struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);
244 int ret = test_set_page_writeback(page); 244 int ret = test_set_page_writeback(page);
245 245
246 if (!ret) { 246 WARN_ON_ONCE(ret != 0);
247 struct inode *inode = page_file_mapping(page)->host;
248 struct nfs_server *nfss = NFS_SERVER(inode);
249 247
250 if (atomic_long_inc_return(&nfss->writeback) > 248 if (atomic_long_inc_return(&nfss->writeback) >
251 NFS_CONGESTION_ON_THRESH) { 249 NFS_CONGESTION_ON_THRESH) {
252 set_bdi_congested(&nfss->backing_dev_info, 250 set_bdi_congested(&nfss->backing_dev_info,
253 BLK_RW_ASYNC); 251 BLK_RW_ASYNC);
254 }
255 } 252 }
256 return ret;
257} 253}
258 254
259static void nfs_end_page_writeback(struct page *page) 255static void nfs_end_page_writeback(struct page *page)
@@ -315,10 +311,10 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
315 if (IS_ERR(req)) 311 if (IS_ERR(req))
316 goto out; 312 goto out;
317 313
318 ret = nfs_set_page_writeback(page); 314 nfs_set_page_writeback(page);
319 BUG_ON(ret != 0); 315 WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
320 BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
321 316
317 ret = 0;
322 if (!nfs_pageio_add_request(pgio, req)) { 318 if (!nfs_pageio_add_request(pgio, req)) {
323 nfs_redirty_request(req); 319 nfs_redirty_request(req);
324 ret = pgio->pg_error; 320 ret = pgio->pg_error;
@@ -451,8 +447,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
451 struct inode *inode = req->wb_context->dentry->d_inode; 447 struct inode *inode = req->wb_context->dentry->d_inode;
452 struct nfs_inode *nfsi = NFS_I(inode); 448 struct nfs_inode *nfsi = NFS_I(inode);
453 449
454 BUG_ON (!NFS_WBACK_BUSY(req));
455
456 spin_lock(&inode->i_lock); 450 spin_lock(&inode->i_lock);
457 if (likely(!PageSwapCache(req->wb_page))) { 451 if (likely(!PageSwapCache(req->wb_page))) {
458 set_page_private(req->wb_page, 0); 452 set_page_private(req->wb_page, 0);
@@ -884,7 +878,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
884{ 878{
885 if (nfs_have_delegated_attributes(inode)) 879 if (nfs_have_delegated_attributes(inode))
886 goto out; 880 goto out;
887 if (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) 881 if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
888 return false; 882 return false;
889out: 883out:
890 return PageUptodate(page) != 0; 884 return PageUptodate(page) != 0;
@@ -1727,7 +1721,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1727 struct nfs_page *req; 1721 struct nfs_page *req;
1728 int ret = 0; 1722 int ret = 0;
1729 1723
1730 BUG_ON(!PageLocked(page));
1731 for (;;) { 1724 for (;;) {
1732 wait_on_page_writeback(page); 1725 wait_on_page_writeback(page);
1733 req = nfs_page_find_request(page); 1726 req = nfs_page_find_request(page);
@@ -1829,7 +1822,7 @@ int __init nfs_init_writepagecache(void)
1829 goto out_destroy_write_mempool; 1822 goto out_destroy_write_mempool;
1830 1823
1831 nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, 1824 nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
1832 nfs_wdata_cachep); 1825 nfs_cdata_cachep);
1833 if (nfs_commit_mempool == NULL) 1826 if (nfs_commit_mempool == NULL)
1834 goto out_destroy_commit_cache; 1827 goto out_destroy_commit_cache;
1835 1828
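Two behavioral notes on the write.c hunks above: nfs_set_page_writeback() can no longer report failure (an unexpected test_set_page_writeback() result is now just warned about once), and nfs_commit_mempool is carved from the correct slab (nfs_cdata_cachep rather than nfs_wdata_cachep). The congestion accounting that nfs_set_page_writeback() feeds uses a classic hysteresis pair, with OFF_THRESH set to ON_THRESH minus a quarter, so the bdi does not flap congested/uncongested around a single boundary. The threshold math, checked in isolation:

#include <stdio.h>

/* Mirrors NFS_CONGESTION_OFF_THRESH: back off only after writeback
 * drops 25% below the level that marked the device congested. */
int main(void)
{
	long on_thresh = 64;			/* pages; illustrative value */
	long off_thresh = on_thresh - (on_thresh >> 2);

	printf("congest at >%ld pages, clear at <=%ld pages\n",
	       on_thresh, off_thresh);		/* 64 / 48 */
	return 0;
}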
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 3e7b2a0dc0c8..07f76db04ec7 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
431 mapping->host = inode; 431 mapping->host = inode;
432 mapping->flags = 0; 432 mapping->flags = 0;
433 mapping_set_gfp_mask(mapping, GFP_NOFS); 433 mapping_set_gfp_mask(mapping, GFP_NOFS);
434 mapping->assoc_mapping = NULL; 434 mapping->private_data = NULL;
435 mapping->backing_dev_info = bdi; 435 mapping->backing_dev_info = bdi;
436 mapping->a_ops = &empty_aops; 436 mapping->a_ops = &empty_aops;
437} 437}
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index ae5f33a6d868..96d3420d0242 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,5 +1,5 @@
1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ 1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
2 mark.o vfsmount_mark.o 2 mark.o vfsmount_mark.o fdinfo.o
3 3
4obj-y += dnotify/ 4obj-y += dnotify/
5obj-y += inotify/ 5obj-y += inotify/
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 7dceff005a67..e5f911bd80d2 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -4,7 +4,7 @@ config FANOTIFY
4 select ANON_INODES 4 select ANON_INODES
5 default n 5 default n
6 ---help--- 6 ---help---
7 Say Y here to enable fanotify suport. fanotify is a file access 7 Say Y here to enable fanotify support. fanotify is a file access
8 notification system which differs from inotify in that it sends 8 notification system which differs from inotify in that it sends
9 an open file descriptor to the userspace listener along with 9 an open file descriptor to the userspace listener along with
10 the event. 10 the event.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f35794b97e8e..a50636025364 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -21,6 +21,7 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
21 if ((old->path.mnt == new->path.mnt) && 21 if ((old->path.mnt == new->path.mnt) &&
22 (old->path.dentry == new->path.dentry)) 22 (old->path.dentry == new->path.dentry))
23 return true; 23 return true;
24 break;
24 case (FSNOTIFY_EVENT_NONE): 25 case (FSNOTIFY_EVENT_NONE):
25 return true; 26 return true;
26 default: 27 default:
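The one-line fanotify.c fix above adds a break so that a path-event comparison that does not match no longer falls through into the FSNOTIFY_EVENT_NONE case, which returns true ("merge these events"). A minimal demonstration of the bug class, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

enum kind { KIND_PATH, KIND_NONE, KIND_OTHER };

static bool should_merge(enum kind k, bool paths_equal)
{
	switch (k) {
	case KIND_PATH:
		if (paths_equal)
			return true;
		break;		/* without this, falls into KIND_NONE: true */
	case KIND_NONE:
		return true;
	default:
		return false;
	}
	return false;
}

int main(void)
{
	/* Unequal paths must not merge; the fallthrough said they did. */
	printf("merge=%d\n", should_merge(KIND_PATH, false));	/* prints 0 */
	return 0;
}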
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 721d692fa8d4..a5cd9bba022f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,7 @@
17#include <asm/ioctls.h> 17#include <asm/ioctls.h>
18 18
19#include "../../mount.h" 19#include "../../mount.h"
20#include "../fdinfo.h"
20 21
21#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 22#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
22#define FANOTIFY_DEFAULT_MAX_MARKS 8192 23#define FANOTIFY_DEFAULT_MAX_MARKS 8192
@@ -258,7 +259,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
258 if (ret) 259 if (ret)
259 goto out_close_fd; 260 goto out_close_fd;
260 261
261 fd_install(fd, f); 262 if (fd != FAN_NOFD)
263 fd_install(fd, f);
262 return fanotify_event_metadata.event_len; 264 return fanotify_event_metadata.event_len;
263 265
264out_close_fd: 266out_close_fd:
@@ -427,6 +429,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
427} 429}
428 430
429static const struct file_operations fanotify_fops = { 431static const struct file_operations fanotify_fops = {
432 .show_fdinfo = fanotify_show_fdinfo,
430 .poll = fanotify_poll, 433 .poll = fanotify_poll,
431 .read = fanotify_read, 434 .read = fanotify_read,
432 .write = fanotify_write, 435 .write = fanotify_write,
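On the listener side, the FAN_NOFD guard added above matters because some events carry no open descriptor: metadata->fd is FAN_NOFD and must not be installed or closed as a real fd. A hedged userspace reading loop (needs CAP_SYS_ADMIN; error handling trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	int fan = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);

	if (fan < 0)
		return 1;
	fanotify_mark(fan, FAN_MARK_ADD | FAN_MARK_MOUNT,
		      FAN_OPEN | FAN_CLOSE, AT_FDCWD, "/");

	len = read(fan, buf, sizeof(buf));
	for (struct fanotify_event_metadata *md = (void *)buf;
	     FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		if (md->fd == FAN_NOFD) {	/* e.g. a queue-overflow event */
			printf("event without fd, mask=%llx\n",
			       (unsigned long long)md->mask);
			continue;
		}
		printf("event on fd %d, mask=%llx\n", md->fd,
		       (unsigned long long)md->mask);
		close(md->fd);	/* the listener owns the descriptor */
	}
	close(fan);
	return 0;
}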
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
new file mode 100644
index 000000000000..514c4b81483d
--- /dev/null
+++ b/fs/notify/fdinfo.c
@@ -0,0 +1,179 @@
1#include <linux/file.h>
2#include <linux/fs.h>
3#include <linux/fsnotify_backend.h>
4#include <linux/idr.h>
5#include <linux/init.h>
6#include <linux/inotify.h>
7#include <linux/fanotify.h>
8#include <linux/kernel.h>
9#include <linux/namei.h>
10#include <linux/sched.h>
11#include <linux/types.h>
12#include <linux/seq_file.h>
13#include <linux/proc_fs.h>
14#include <linux/exportfs.h>
15
16#include "inotify/inotify.h"
17#include "../fs/mount.h"
18
19#if defined(CONFIG_PROC_FS)
20
21#if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY)
22
23static int show_fdinfo(struct seq_file *m, struct file *f,
24 int (*show)(struct seq_file *m, struct fsnotify_mark *mark))
25{
26 struct fsnotify_group *group = f->private_data;
27 struct fsnotify_mark *mark;
28 int ret = 0;
29
30 spin_lock(&group->mark_lock);
31 list_for_each_entry(mark, &group->marks_list, g_list) {
32 ret = show(m, mark);
33 if (ret)
34 break;
35 }
36 spin_unlock(&group->mark_lock);
37 return ret;
38}
39
40#if defined(CONFIG_EXPORTFS)
41static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
42{
43 struct {
44 struct file_handle handle;
45 u8 pad[64];
46 } f;
47 int size, ret, i;
48
49 f.handle.handle_bytes = sizeof(f.pad);
50 size = f.handle.handle_bytes >> 2;
51
52 ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
53 if ((ret == 255) || (ret == -ENOSPC)) {
54 WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
55 return 0;
56 }
57
58 f.handle.handle_type = ret;
59 f.handle.handle_bytes = size * sizeof(u32);
60
61 ret = seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
62 f.handle.handle_bytes, f.handle.handle_type);
63
64 for (i = 0; i < f.handle.handle_bytes; i++)
65 ret |= seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
66
67 return ret;
68}
69#else
70static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
71{
72 return 0;
73}
74#endif
75
76#ifdef CONFIG_INOTIFY_USER
77
78static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
79{
80 struct inotify_inode_mark *inode_mark;
81 struct inode *inode;
82 int ret = 0;
83
84 if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
85 return 0;
86
87 inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
88 inode = igrab(mark->i.inode);
89 if (inode) {
90 ret = seq_printf(m, "inotify wd:%x ino:%lx sdev:%x "
91 "mask:%x ignored_mask:%x ",
92 inode_mark->wd, inode->i_ino,
93 inode->i_sb->s_dev,
94 mark->mask, mark->ignored_mask);
95 ret |= show_mark_fhandle(m, inode);
96 ret |= seq_putc(m, '\n');
97 iput(inode);
98 }
99
100 return ret;
101}
102
103int inotify_show_fdinfo(struct seq_file *m, struct file *f)
104{
105 return show_fdinfo(m, f, inotify_fdinfo);
106}
107
108#endif /* CONFIG_INOTIFY_USER */
109
110#ifdef CONFIG_FANOTIFY
111
112static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
113{
114 unsigned int mflags = 0;
115 struct inode *inode;
116 int ret = 0;
117
118 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE))
119 return 0;
120
121 if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
122 mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
123
124 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
125 inode = igrab(mark->i.inode);
126 if (!inode)
127 goto out;
128 ret = seq_printf(m, "fanotify ino:%lx sdev:%x "
129 "mflags:%x mask:%x ignored_mask:%x ",
130 inode->i_ino, inode->i_sb->s_dev,
131 mflags, mark->mask, mark->ignored_mask);
132 ret |= show_mark_fhandle(m, inode);
133 ret |= seq_putc(m, '\n');
134 iput(inode);
135 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
136 struct mount *mnt = real_mount(mark->m.mnt);
137
138 ret = seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x "
139 "ignored_mask:%x\n", mnt->mnt_id, mflags,
140 mark->mask, mark->ignored_mask);
141 }
142out:
143 return ret;
144}
145
146int fanotify_show_fdinfo(struct seq_file *m, struct file *f)
147{
148 struct fsnotify_group *group = f->private_data;
149 unsigned int flags = 0;
150
151 switch (group->priority) {
152 case FS_PRIO_0:
153 flags |= FAN_CLASS_NOTIF;
154 break;
155 case FS_PRIO_1:
156 flags |= FAN_CLASS_CONTENT;
157 break;
158 case FS_PRIO_2:
159 flags |= FAN_CLASS_PRE_CONTENT;
160 break;
161 }
162
163 if (group->max_events == UINT_MAX)
164 flags |= FAN_UNLIMITED_QUEUE;
165
166 if (group->fanotify_data.max_marks == UINT_MAX)
167 flags |= FAN_UNLIMITED_MARKS;
168
169 seq_printf(m, "fanotify flags:%x event-flags:%x\n",
170 flags, group->fanotify_data.f_flags);
171
172 return show_fdinfo(m, f, fanotify_fdinfo);
173}
174
175#endif /* CONFIG_FANOTIFY */
176
177#endif /* CONFIG_INOTIFY_USER || CONFIG_FANOTIFY */
178
179#endif /* CONFIG_PROC_FS */
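The new fdinfo.c above is what makes notification-descriptor state visible through procfs: reading /proc/<pid>/fdinfo/<fd> on an inotify descriptor now reports each watch with its wd, inode, device and masks, plus an opaque file handle. A small hedged userspace check (the exact output fields follow the seq_printf() formats above, so this only works on a kernel carrying this patch):

#include <limits.h>
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	char path[PATH_MAX], line[256];
	int fd = inotify_init();
	FILE *f;

	if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_CREATE) < 0)
		return 1;

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
	f = fopen(path, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* pos/flags, then one line per mark */
	fclose(f);
	close(fd);
	return 0;
}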
diff --git a/fs/notify/fdinfo.h b/fs/notify/fdinfo.h
new file mode 100644
index 000000000000..556afda990e9
--- /dev/null
+++ b/fs/notify/fdinfo.h
@@ -0,0 +1,27 @@
1#ifndef __FSNOTIFY_FDINFO_H__
2#define __FSNOTIFY_FDINFO_H__
3
4#include <linux/errno.h>
5#include <linux/proc_fs.h>
6
7struct seq_file;
8struct file;
9
10#ifdef CONFIG_PROC_FS
11
12#ifdef CONFIG_INOTIFY_USER
13extern int inotify_show_fdinfo(struct seq_file *m, struct file *f);
14#endif
15
16#ifdef CONFIG_FANOTIFY
17extern int fanotify_show_fdinfo(struct seq_file *m, struct file *f);
18#endif
19
20#else /* CONFIG_PROC_FS */
21
22#define inotify_show_fdinfo NULL
23#define fanotify_show_fdinfo NULL
24
25#endif /* CONFIG_PROC_FS */
26
27#endif /* __FSNOTIFY_FDINFO_H__ */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index b13c00ac48eb..f3035691f528 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -116,8 +116,9 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
116 * given a group and inode, find the mark associated with that combination. 116 * given a group and inode, find the mark associated with that combination.
117 * if found take a reference to that mark and return it, else return NULL 117 * if found take a reference to that mark and return it, else return NULL
118 */ 118 */
119struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group, 119static struct fsnotify_mark *fsnotify_find_inode_mark_locked(
120 struct inode *inode) 120 struct fsnotify_group *group,
121 struct inode *inode)
121{ 122{
122 struct fsnotify_mark *mark; 123 struct fsnotify_mark *mark;
123 struct hlist_node *pos; 124 struct hlist_node *pos;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index c311dda054a3..36cb013c7c13 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -40,6 +40,7 @@
40#include <linux/wait.h> 40#include <linux/wait.h>
41 41
42#include "inotify.h" 42#include "inotify.h"
43#include "../fdinfo.h"
43 44
44#include <asm/ioctls.h> 45#include <asm/ioctls.h>
45 46
@@ -335,6 +336,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
335} 336}
336 337
337static const struct file_operations inotify_fops = { 338static const struct file_operations inotify_fops = {
339 .show_fdinfo = inotify_show_fdinfo,
338 .poll = inotify_poll, 340 .poll = inotify_poll,
339 .read = inotify_read, 341 .read = inotify_read,
340 .fasync = inotify_fasync, 342 .fasync = inotify_fasync,
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index c887b1378f7e..48cb994e4922 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -18,7 +18,7 @@
18 18
19/* 19/*
20 * Basic idea behind the notification queue: An fsnotify group (like inotify) 20 * Basic idea behind the notification queue: An fsnotify group (like inotify)
21 * sends the userspace notification about events asyncronously some time after 21 * sends the userspace notification about events asynchronously some time after
22 * the event happened. When inotify gets an event it will need to add that 22 * the event happened. When inotify gets an event it will need to add that
23 * event to the group notify queue. Since a single event might need to be on 23 * event to the group notify queue. Since a single event might need to be on
24 * multiple group's notification queues we can't add the event directly to each 24 * multiple group's notification queues we can't add the event directly to each
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 70b5863a2d64..f487aa343442 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -832,7 +832,7 @@ out:
832 return ret; 832 return ret;
833} 833}
834 834
835int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) 835int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
836{ 836{
837 struct inode *inode = file->f_mapping->host; 837 struct inode *inode = file->f_mapping->host;
838 int ret; 838 int ret;
@@ -843,7 +843,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
843 struct buffer_head *di_bh = NULL; 843 struct buffer_head *di_bh = NULL;
844 struct ocfs2_extent_rec rec; 844 struct ocfs2_extent_rec rec;
845 845
846 BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); 846 BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
847 847
848 ret = ocfs2_inode_lock(inode, &di_bh, 0); 848 ret = ocfs2_inode_lock(inode, &di_bh, 0);
849 if (ret) { 849 if (ret) {
@@ -859,7 +859,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
859 } 859 }
860 860
861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
862 if (origin == SEEK_HOLE) 862 if (whence == SEEK_HOLE)
863 *offset = inode->i_size; 863 *offset = inode->i_size;
864 goto out_unlock; 864 goto out_unlock;
865 } 865 }
@@ -888,8 +888,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1; 888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
889 } 889 }
890 890
891 if ((!is_data && origin == SEEK_HOLE) || 891 if ((!is_data && whence == SEEK_HOLE) ||
892 (is_data && origin == SEEK_DATA)) { 892 (is_data && whence == SEEK_DATA)) {
893 if (extoff > *offset) 893 if (extoff > *offset)
894 *offset = extoff; 894 *offset = extoff;
895 goto out_unlock; 895 goto out_unlock;
@@ -899,7 +899,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
899 cpos += clen; 899 cpos += clen;
900 } 900 }
901 901
902 if (origin == SEEK_HOLE) { 902 if (whence == SEEK_HOLE) {
903 extoff = cpos; 903 extoff = cpos;
904 extoff <<= cs_bits; 904 extoff <<= cs_bits;
905 extlen = clen; 905 extlen = clen;
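The ocfs2 hunks above are a mechanical rename of the llseek argument from origin to whence (matching VFS terminology), not a behavior change. The SEEK_DATA/SEEK_HOLE cases they implement are exercised from userspace like this (a hedged sketch; SEEK_DATA and SEEK_HOLE need _GNU_SOURCE and a filesystem that supports them):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "file", O_RDONLY);
	off_t data, hole;

	if (fd < 0)
		return 1;
	data = lseek(fd, 0, SEEK_DATA);	/* first data at/after offset 0 */
	hole = lseek(fd, 0, SEEK_HOLE);	/* first hole at/after offset 0 */
	printf("data@%lld hole@%lld\n", (long long)data, (long long)hole);
	close(fd);
	return 0;
}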
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5a4ee77cec51..fe492e1a3cfc 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2513,18 +2513,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2513 ret = sd.num_spliced; 2513 ret = sd.num_spliced;
2514 2514
2515 if (ret > 0) { 2515 if (ret > 0) {
2516 unsigned long nr_pages;
2517 int err; 2516 int err;
2518 2517
2519 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2520
2521 err = generic_write_sync(out, *ppos, ret); 2518 err = generic_write_sync(out, *ppos, ret);
2522 if (err) 2519 if (err)
2523 ret = err; 2520 ret = err;
2524 else 2521 else
2525 *ppos += ret; 2522 *ppos += ret;
2526 2523
2527 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2524 balance_dirty_pages_ratelimited(mapping);
2528 } 2525 }
2529 2526
2530 return ret; 2527 return ret;
@@ -2640,14 +2637,14 @@ bail:
2640} 2637}
2641 2638
2642/* Refer generic_file_llseek_unlocked() */ 2639/* Refer generic_file_llseek_unlocked() */
2643static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) 2640static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2644{ 2641{
2645 struct inode *inode = file->f_mapping->host; 2642 struct inode *inode = file->f_mapping->host;
2646 int ret = 0; 2643 int ret = 0;
2647 2644
2648 mutex_lock(&inode->i_mutex); 2645 mutex_lock(&inode->i_mutex);
2649 2646
2650 switch (origin) { 2647 switch (whence) {
2651 case SEEK_SET: 2648 case SEEK_SET:
2652 break; 2649 break;
2653 case SEEK_END: 2650 case SEEK_END:
@@ -2662,7 +2659,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
2662 break; 2659 break;
2663 case SEEK_DATA: 2660 case SEEK_DATA:
2664 case SEEK_HOLE: 2661 case SEEK_HOLE:
2665 ret = ocfs2_seek_data_hole_offset(file, &offset, origin); 2662 ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
2666 if (ret) 2663 if (ret)
2667 goto out; 2664 goto out;
2668 break; 2665 break;
diff --git a/fs/open.c b/fs/open.c
index 59071f55bf7f..182d8667b7bd 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -435,7 +435,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
435 goto dput_and_out; 435 goto dput_and_out;
436 436
437 error = -EPERM; 437 error = -EPERM;
438 if (!capable(CAP_SYS_CHROOT)) 438 if (!nsown_capable(CAP_SYS_CHROOT))
439 goto dput_and_out; 439 goto dput_and_out;
440 error = security_path_chroot(&path); 440 error = security_path_chroot(&path);
441 if (error) 441 if (error)
diff --git a/fs/pnode.h b/fs/pnode.h
index 65c60979d541..19b853a3445c 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -22,6 +22,7 @@
22#define CL_COPY_ALL 0x04 22#define CL_COPY_ALL 0x04
23#define CL_MAKE_SHARED 0x08 23#define CL_MAKE_SHARED 0x08
24#define CL_PRIVATE 0x10 24#define CL_PRIVATE 0x10
25#define CL_SHARED_TO_SLAVE 0x20
25 26
26static inline void set_mnt_shared(struct mount *mnt) 27static inline void set_mnt_shared(struct mount *mnt)
27{ 28{
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 99349efbbc2b..981b05601931 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -21,6 +21,7 @@ proc-y += uptime.o
21proc-y += version.o 21proc-y += version.o
22proc-y += softirqs.o 22proc-y += softirqs.o
23proc-y += namespaces.o 23proc-y += namespaces.o
24proc-y += self.o
24proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 25proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
25proc-$(CONFIG_NET) += proc_net.o 26proc-$(CONFIG_NET) += proc_net.o
26proc-$(CONFIG_PROC_KCORE) += kcore.o 27proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c1c207c36cae..6a91e6ffbcbd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk)
162static inline void task_state(struct seq_file *m, struct pid_namespace *ns, 162static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
163 struct pid *pid, struct task_struct *p) 163 struct pid *pid, struct task_struct *p)
164{ 164{
165 struct user_namespace *user_ns = current_user_ns(); 165 struct user_namespace *user_ns = seq_user_ns(m);
166 struct group_info *group_info; 166 struct group_info *group_info;
167 int g; 167 int g;
168 struct fdtable *fdt = NULL; 168 struct fdtable *fdt = NULL;
@@ -212,7 +212,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
212 group_info = cred->group_info; 212 group_info = cred->group_info;
213 task_unlock(p); 213 task_unlock(p);
214 214
215 for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) 215 for (g = 0; g < group_info->ngroups; g++)
216 seq_printf(m, "%d ", 216 seq_printf(m, "%d ",
217 from_kgid_munged(user_ns, GROUP_AT(group_info, g))); 217 from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
218 put_cred(cred); 218 put_cred(cred);
@@ -220,7 +220,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
220 seq_putc(m, '\n'); 220 seq_putc(m, '\n');
221} 221}
222 222
223static void render_sigset_t(struct seq_file *m, const char *header, 223void render_sigset_t(struct seq_file *m, const char *header,
224 sigset_t *set) 224 sigset_t *set)
225{ 225{
226 int i; 226 int i;
@@ -308,6 +308,10 @@ static void render_cap_t(struct seq_file *m, const char *header,
308 seq_putc(m, '\n'); 308 seq_putc(m, '\n');
309} 309}
310 310
311/* Remove non-existent capabilities */
312#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \
313 CAP_TO_MASK(CAP_LAST_CAP + 1) - 1)
314
311static inline void task_cap(struct seq_file *m, struct task_struct *p) 315static inline void task_cap(struct seq_file *m, struct task_struct *p)
312{ 316{
313 const struct cred *cred; 317 const struct cred *cred;
@@ -321,12 +325,24 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
321 cap_bset = cred->cap_bset; 325 cap_bset = cred->cap_bset;
322 rcu_read_unlock(); 326 rcu_read_unlock();
323 327
328 NORM_CAPS(cap_inheritable);
329 NORM_CAPS(cap_permitted);
330 NORM_CAPS(cap_effective);
331 NORM_CAPS(cap_bset);
332
324 render_cap_t(m, "CapInh:\t", &cap_inheritable); 333 render_cap_t(m, "CapInh:\t", &cap_inheritable);
325 render_cap_t(m, "CapPrm:\t", &cap_permitted); 334 render_cap_t(m, "CapPrm:\t", &cap_permitted);
326 render_cap_t(m, "CapEff:\t", &cap_effective); 335 render_cap_t(m, "CapEff:\t", &cap_effective);
327 render_cap_t(m, "CapBnd:\t", &cap_bset); 336 render_cap_t(m, "CapBnd:\t", &cap_bset);
328} 337}
329 338
339static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
340{
341#ifdef CONFIG_SECCOMP
342 seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
343#endif
344}
345
330static inline void task_context_switch_counts(struct seq_file *m, 346static inline void task_context_switch_counts(struct seq_file *m,
331 struct task_struct *p) 347 struct task_struct *p)
332{ 348{
@@ -360,6 +376,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
360 } 376 }
361 task_sig(m, task); 377 task_sig(m, task);
362 task_cap(m, task); 378 task_cap(m, task);
379 task_seccomp(m, task);
363 task_cpus_allowed(m, task); 380 task_cpus_allowed(m, task);
364 cpuset_task_status_allowed(m, task); 381 cpuset_task_status_allowed(m, task);
365 task_context_switch_counts(m, task); 382 task_context_switch_counts(m, task);
@@ -438,7 +455,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
438 455
439 min_flt += sig->min_flt; 456 min_flt += sig->min_flt;
440 maj_flt += sig->maj_flt; 457 maj_flt += sig->maj_flt;
441 thread_group_times(task, &utime, &stime); 458 thread_group_cputime_adjusted(task, &utime, &stime);
442 gtime += sig->gtime; 459 gtime += sig->gtime;
443 } 460 }
444 461
@@ -454,7 +471,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
454 if (!whole) { 471 if (!whole) {
455 min_flt = task->min_flt; 472 min_flt = task->min_flt;
456 maj_flt = task->maj_flt; 473 maj_flt = task->maj_flt;
457 task_times(task, &utime, &stime); 474 task_cputime_adjusted(task, &utime, &stime);
458 gtime = task->gtime; 475 gtime = task->gtime;
459 } 476 }
460 477
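The NORM_CAPS() addition in array.c masks off capability bits above CAP_LAST_CAP in the top word before printing, so /proc/<pid>/status never advertises capabilities the running kernel does not define. The mask arithmetic, checked standalone (the CAP_LAST_CAP value here is illustrative, not tied to any particular kernel):

#include <stdint.h>
#include <stdio.h>

#define CAP_LAST_CAP	35			/* illustrative value */
#define CAP_TO_INDEX(x)	((x) >> 5)		/* which 32-bit word */
#define CAP_TO_MASK(x)	(1u << ((x) & 31))	/* bit within the word */

int main(void)
{
	/* Keep bits 0..CAP_LAST_CAP of the top word, clear the rest. */
	uint32_t keep = CAP_TO_MASK(CAP_LAST_CAP + 1) - 1;
	uint32_t word = 0xffffffffu;		/* pretend all bits set */

	printf("index=%d mask=%#x normalized=%#x\n",
	       CAP_TO_INDEX(CAP_LAST_CAP), keep, word & keep);
	/* index=1 mask=0xf normalized=0xf: only caps 32..35 survive */
	return 0;
}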
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 144a96732dd7..5a5a0be40e40 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -873,12 +873,119 @@ static const struct file_operations proc_environ_operations = {
873 .release = mem_release, 873 .release = mem_release,
874}; 874};
875 875
876static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
877 loff_t *ppos)
878{
879 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
880 char buffer[PROC_NUMBUF];
881 int oom_adj = OOM_ADJUST_MIN;
882 size_t len;
883 unsigned long flags;
884
885 if (!task)
886 return -ESRCH;
887 if (lock_task_sighand(task, &flags)) {
888 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
889 oom_adj = OOM_ADJUST_MAX;
890 else
891 oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
892 OOM_SCORE_ADJ_MAX;
893 unlock_task_sighand(task, &flags);
894 }
895 put_task_struct(task);
896 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
897 return simple_read_from_buffer(buf, count, ppos, buffer, len);
898}
899
900static ssize_t oom_adj_write(struct file *file, const char __user *buf,
901 size_t count, loff_t *ppos)
902{
903 struct task_struct *task;
904 char buffer[PROC_NUMBUF];
905 int oom_adj;
906 unsigned long flags;
907 int err;
908
909 memset(buffer, 0, sizeof(buffer));
910 if (count > sizeof(buffer) - 1)
911 count = sizeof(buffer) - 1;
912 if (copy_from_user(buffer, buf, count)) {
913 err = -EFAULT;
914 goto out;
915 }
916
917 err = kstrtoint(strstrip(buffer), 0, &oom_adj);
918 if (err)
919 goto out;
920 if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
921 oom_adj != OOM_DISABLE) {
922 err = -EINVAL;
923 goto out;
924 }
925
926 task = get_proc_task(file->f_path.dentry->d_inode);
927 if (!task) {
928 err = -ESRCH;
929 goto out;
930 }
931
932 task_lock(task);
933 if (!task->mm) {
934 err = -EINVAL;
935 goto err_task_lock;
936 }
937
938 if (!lock_task_sighand(task, &flags)) {
939 err = -ESRCH;
940 goto err_task_lock;
941 }
942
943 /*
944 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
945 * value is always attainable.
946 */
947 if (oom_adj == OOM_ADJUST_MAX)
948 oom_adj = OOM_SCORE_ADJ_MAX;
949 else
950 oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
951
952 if (oom_adj < task->signal->oom_score_adj &&
953 !capable(CAP_SYS_RESOURCE)) {
954 err = -EACCES;
955 goto err_sighand;
956 }
957
958 /*
959 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
960 * /proc/pid/oom_score_adj instead.
961 */
962 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
963 current->comm, task_pid_nr(current), task_pid_nr(task),
964 task_pid_nr(task));
965
966 task->signal->oom_score_adj = oom_adj;
967 trace_oom_score_adj_update(task);
968err_sighand:
969 unlock_task_sighand(task, &flags);
970err_task_lock:
971 task_unlock(task);
972 put_task_struct(task);
973out:
974 return err < 0 ? err : count;
975}
976
977static const struct file_operations proc_oom_adj_operations = {
978 .read = oom_adj_read,
979 .write = oom_adj_write,
980 .llseek = generic_file_llseek,
981};
982
876static ssize_t oom_score_adj_read(struct file *file, char __user *buf, 983static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
877 size_t count, loff_t *ppos) 984 size_t count, loff_t *ppos)
878{ 985{
879 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 986 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
880 char buffer[PROC_NUMBUF]; 987 char buffer[PROC_NUMBUF];
881 int oom_score_adj = OOM_SCORE_ADJ_MIN; 988 short oom_score_adj = OOM_SCORE_ADJ_MIN;
882 unsigned long flags; 989 unsigned long flags;
883 size_t len; 990 size_t len;
884 991
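The resurrected oom_adj file above is a compatibility shim: writes are scaled from the legacy [-17, 15] oom_adj range into the [-1000, 1000] oom_score_adj range and reads are scaled back, with -17 (OOM_DISABLE) pinned to -1000 and 15 pinned to +1000 so both extremes stay attainable. The round-trip, worked through in isolation (constants as in the uapi oom.h):

#include <stdio.h>

#define OOM_DISABLE		(-17)
#define OOM_ADJUST_MAX		15
#define OOM_SCORE_ADJ_MAX	1000

/* Write path above: legacy value -> oom_score_adj. */
static int adj_to_score(int oom_adj)
{
	if (oom_adj == OOM_ADJUST_MAX)
		return OOM_SCORE_ADJ_MAX;	/* keep the maximum attainable */
	return (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
}

/* Read path above: oom_score_adj -> legacy value. */
static int score_to_adj(int score)
{
	if (score == OOM_SCORE_ADJ_MAX)
		return OOM_ADJUST_MAX;
	return (score * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX;
}

int main(void)
{
	/* Note C truncation makes small positive values lossy (1 -> 58 -> 0). */
	for (int adj = OOM_DISABLE; adj <= OOM_ADJUST_MAX; adj++)
		printf("oom_adj %3d -> score %5d -> oom_adj %3d\n",
		       adj, adj_to_score(adj), score_to_adj(adj_to_score(adj)));
	return 0;
}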
@@ -889,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
889 unlock_task_sighand(task, &flags); 996 unlock_task_sighand(task, &flags);
890 } 997 }
891 put_task_struct(task); 998 put_task_struct(task);
892 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); 999 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
893 return simple_read_from_buffer(buf, count, ppos, buffer, len); 1000 return simple_read_from_buffer(buf, count, ppos, buffer, len);
894} 1001}
895 1002
@@ -936,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
936 goto err_task_lock; 1043 goto err_task_lock;
937 } 1044 }
938 1045
939 if (oom_score_adj < task->signal->oom_score_adj_min && 1046 if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
940 !capable(CAP_SYS_RESOURCE)) { 1047 !capable(CAP_SYS_RESOURCE)) {
941 err = -EACCES; 1048 err = -EACCES;
942 goto err_sighand; 1049 goto err_sighand;
943 } 1050 }
944 1051
945 task->signal->oom_score_adj = oom_score_adj; 1052 task->signal->oom_score_adj = (short)oom_score_adj;
946 if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) 1053 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
947 task->signal->oom_score_adj_min = oom_score_adj; 1054 task->signal->oom_score_adj_min = (short)oom_score_adj;
948 trace_oom_score_adj_update(task); 1055 trace_oom_score_adj_update(task);
949 1056
950err_sighand: 1057err_sighand:
@@ -1770,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
1770 if (!vma) 1877 if (!vma)
1771 goto out_no_vma; 1878 goto out_no_vma;
1772 1879
1773 result = proc_map_files_instantiate(dir, dentry, task, 1880 if (vma->vm_file)
1774 (void *)(unsigned long)vma->vm_file->f_mode); 1881 result = proc_map_files_instantiate(dir, dentry, task,
1882 (void *)(unsigned long)vma->vm_file->f_mode);
1775 1883
1776out_no_vma: 1884out_no_vma:
1777 up_read(&mm->mmap_sem); 1885 up_read(&mm->mmap_sem);
@@ -2237,146 +2345,6 @@ static const struct file_operations proc_coredump_filter_operations = {
2237}; 2345};
2238#endif 2346#endif
2239 2347
2240/*
2241 * /proc/self:
2242 */
2243static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
2244 int buflen)
2245{
2246 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2247 pid_t tgid = task_tgid_nr_ns(current, ns);
2248 char tmp[PROC_NUMBUF];
2249 if (!tgid)
2250 return -ENOENT;
2251 sprintf(tmp, "%d", tgid);
2252 return vfs_readlink(dentry,buffer,buflen,tmp);
2253}
2254
2255static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2256{
2257 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2258 pid_t tgid = task_tgid_nr_ns(current, ns);
2259 char *name = ERR_PTR(-ENOENT);
2260 if (tgid) {
2261 /* 11 for max length of signed int in decimal + NULL term */
2262 name = kmalloc(12, GFP_KERNEL);
2263 if (!name)
2264 name = ERR_PTR(-ENOMEM);
2265 else
2266 sprintf(name, "%d", tgid);
2267 }
2268 nd_set_link(nd, name);
2269 return NULL;
2270}
2271
2272static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2273 void *cookie)
2274{
2275 char *s = nd_get_link(nd);
2276 if (!IS_ERR(s))
2277 kfree(s);
2278}
2279
2280static const struct inode_operations proc_self_inode_operations = {
2281 .readlink = proc_self_readlink,
2282 .follow_link = proc_self_follow_link,
2283 .put_link = proc_self_put_link,
2284};
2285
2286/*
2287 * proc base
2288 *
2289 * These are the directory entries in the root directory of /proc
2290 * that properly belong to the /proc filesystem, as they describe
2291 * describe something that is process related.
2292 */
2293static const struct pid_entry proc_base_stuff[] = {
2294 NOD("self", S_IFLNK|S_IRWXUGO,
2295 &proc_self_inode_operations, NULL, {}),
2296};
2297
2298static struct dentry *proc_base_instantiate(struct inode *dir,
2299 struct dentry *dentry, struct task_struct *task, const void *ptr)
2300{
2301 const struct pid_entry *p = ptr;
2302 struct inode *inode;
2303 struct proc_inode *ei;
2304 struct dentry *error;
2305
2306 /* Allocate the inode */
2307 error = ERR_PTR(-ENOMEM);
2308 inode = new_inode(dir->i_sb);
2309 if (!inode)
2310 goto out;
2311
2312 /* Initialize the inode */
2313 ei = PROC_I(inode);
2314 inode->i_ino = get_next_ino();
2315 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2316
2317 /*
2318 * grab the reference to the task.
2319 */
2320 ei->pid = get_task_pid(task, PIDTYPE_PID);
2321 if (!ei->pid)
2322 goto out_iput;
2323
2324 inode->i_mode = p->mode;
2325 if (S_ISDIR(inode->i_mode))
2326 set_nlink(inode, 2);
2327 if (S_ISLNK(inode->i_mode))
2328 inode->i_size = 64;
2329 if (p->iop)
2330 inode->i_op = p->iop;
2331 if (p->fop)
2332 inode->i_fop = p->fop;
2333 ei->op = p->op;
2334 d_add(dentry, inode);
2335 error = NULL;
2336out:
2337 return error;
2338out_iput:
2339 iput(inode);
2340 goto out;
2341}
2342
2343static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
2344{
2345 struct dentry *error;
2346 struct task_struct *task = get_proc_task(dir);
2347 const struct pid_entry *p, *last;
2348
2349 error = ERR_PTR(-ENOENT);
2350
2351 if (!task)
2352 goto out_no_task;
2353
2354 /* Lookup the directory entry */
2355 last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
2356 for (p = proc_base_stuff; p <= last; p++) {
2357 if (p->len != dentry->d_name.len)
2358 continue;
2359 if (!memcmp(dentry->d_name.name, p->name, p->len))
2360 break;
2361 }
2362 if (p > last)
2363 goto out;
2364
2365 error = proc_base_instantiate(dir, dentry, task, p);
2366
2367out:
2368 put_task_struct(task);
2369out_no_task:
2370 return error;
2371}
2372
2373static int proc_base_fill_cache(struct file *filp, void *dirent,
2374 filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2375{
2376 return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2377 proc_base_instantiate, task, p);
2378}
2379
2380#ifdef CONFIG_TASK_IO_ACCOUNTING 2348#ifdef CONFIG_TASK_IO_ACCOUNTING
2381static int do_io_accounting(struct task_struct *task, char *buffer, int whole) 2349static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2382{ 2350{
@@ -2598,6 +2566,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2598 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2566 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2599#endif 2567#endif
2600 INF("oom_score", S_IRUGO, proc_oom_score), 2568 INF("oom_score", S_IRUGO, proc_oom_score),
2569 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2601 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2570 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2602#ifdef CONFIG_AUDITSYSCALL 2571#ifdef CONFIG_AUDITSYSCALL
2603 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2572 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -2730,10 +2699,6 @@ void proc_flush_task(struct task_struct *task)
2730 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, 2699 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2731 tgid->numbers[i].nr); 2700 tgid->numbers[i].nr);
2732 } 2701 }
2733
2734 upid = &pid->numbers[pid->level];
2735 if (upid->nr == 1)
2736 pid_ns_release_proc(upid->ns);
2737} 2702}
2738 2703
2739static struct dentry *proc_pid_instantiate(struct inode *dir, 2704static struct dentry *proc_pid_instantiate(struct inode *dir,
@@ -2767,15 +2732,11 @@ out:
2767 2732
2768struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2733struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2769{ 2734{
2770 struct dentry *result; 2735 struct dentry *result = NULL;
2771 struct task_struct *task; 2736 struct task_struct *task;
2772 unsigned tgid; 2737 unsigned tgid;
2773 struct pid_namespace *ns; 2738 struct pid_namespace *ns;
2774 2739
2775 result = proc_base_lookup(dir, dentry);
2776 if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
2777 goto out;
2778
2779 tgid = name_to_int(dentry); 2740 tgid = name_to_int(dentry);
2780 if (tgid == ~0U) 2741 if (tgid == ~0U)
2781 goto out; 2742 goto out;
@@ -2838,7 +2799,7 @@ retry:
2838 return iter; 2799 return iter;
2839} 2800}
2840 2801
2841#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) 2802#define TGID_OFFSET (FIRST_PROCESS_ENTRY)
2842 2803
2843static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 2804static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2844 struct tgid_iter iter) 2805 struct tgid_iter iter)
@@ -2858,25 +2819,12 @@ static int fake_filldir(void *buf, const char *name, int namelen,
2858/* for the /proc/ directory itself, after non-process stuff has been done */ 2819/* for the /proc/ directory itself, after non-process stuff has been done */
2859int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2820int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2860{ 2821{
2861 unsigned int nr;
2862 struct task_struct *reaper;
2863 struct tgid_iter iter; 2822 struct tgid_iter iter;
2864 struct pid_namespace *ns; 2823 struct pid_namespace *ns;
2865 filldir_t __filldir; 2824 filldir_t __filldir;
2866 2825
2867 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) 2826 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
2868 goto out_no_task; 2827 goto out;
2869 nr = filp->f_pos - FIRST_PROCESS_ENTRY;
2870
2871 reaper = get_proc_task(filp->f_path.dentry->d_inode);
2872 if (!reaper)
2873 goto out_no_task;
2874
2875 for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
2876 const struct pid_entry *p = &proc_base_stuff[nr];
2877 if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
2878 goto out;
2879 }
2880 2828
2881 ns = filp->f_dentry->d_sb->s_fs_info; 2829 ns = filp->f_dentry->d_sb->s_fs_info;
2882 iter.task = NULL; 2830 iter.task = NULL;
@@ -2897,8 +2845,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2897 } 2845 }
2898 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 2846 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
2899out: 2847out:
2900 put_task_struct(reaper);
2901out_no_task:
2902 return 0; 2848 return 0;
2903} 2849}
2904 2850
@@ -2964,6 +2910,7 @@ static const struct pid_entry tid_base_stuff[] = {
2964 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2910 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2965#endif 2911#endif
2966 INF("oom_score", S_IRUGO, proc_oom_score), 2912 INF("oom_score", S_IRUGO, proc_oom_score),
2913 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2967 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2914 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2968#ifdef CONFIG_AUDITSYSCALL 2915#ifdef CONFIG_AUDITSYSCALL
2969 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2916 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index f28a875f8779..d7a4a28ef630 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -50,6 +50,8 @@ static int seq_show(struct seq_file *m, void *v)
50 if (!ret) { 50 if (!ret) {
51 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", 51 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n",
52 (long long)file->f_pos, f_flags); 52 (long long)file->f_pos, f_flags);
53 if (file->f_op->show_fdinfo)
54 ret = file->f_op->show_fdinfo(m, file);
53 fput(file); 55 fput(file);
54 } 56 }
55 57
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 0d80cef4cfb9..7b3ae3cc0ef9 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
350 * Return an inode number between PROC_DYNAMIC_FIRST and 350 * Return an inode number between PROC_DYNAMIC_FIRST and
351 * 0xffffffff, or zero on failure. 351 * 0xffffffff, or zero on failure.
352 */ 352 */
353static unsigned int get_inode_number(void) 353int proc_alloc_inum(unsigned int *inum)
354{ 354{
355 unsigned int i; 355 unsigned int i;
356 int error; 356 int error;
357 357
358retry: 358retry:
359 if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) 359 if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
360 return 0; 360 return -ENOMEM;
361 361
362 spin_lock(&proc_inum_lock); 362 spin_lock(&proc_inum_lock);
363 error = ida_get_new(&proc_inum_ida, &i); 363 error = ida_get_new(&proc_inum_ida, &i);
@@ -365,18 +365,19 @@ retry:
365 if (error == -EAGAIN) 365 if (error == -EAGAIN)
366 goto retry; 366 goto retry;
367 else if (error) 367 else if (error)
368 return 0; 368 return error;
369 369
370 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { 370 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
371 spin_lock(&proc_inum_lock); 371 spin_lock(&proc_inum_lock);
372 ida_remove(&proc_inum_ida, i); 372 ida_remove(&proc_inum_ida, i);
373 spin_unlock(&proc_inum_lock); 373 spin_unlock(&proc_inum_lock);
374 return 0; 374 return -ENOSPC;
375 } 375 }
376 return PROC_DYNAMIC_FIRST + i; 376 *inum = PROC_DYNAMIC_FIRST + i;
377 return 0;
377} 378}
378 379
379static void release_inode_number(unsigned int inum) 380void proc_free_inum(unsigned int inum)
380{ 381{
381 spin_lock(&proc_inum_lock); 382 spin_lock(&proc_inum_lock);
382 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); 383 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
@@ -554,13 +555,12 @@ static const struct inode_operations proc_dir_inode_operations = {
554 555
555static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) 556static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
556{ 557{
557 unsigned int i;
558 struct proc_dir_entry *tmp; 558 struct proc_dir_entry *tmp;
559 int ret;
559 560
560 i = get_inode_number(); 561 ret = proc_alloc_inum(&dp->low_ino);
561 if (i == 0) 562 if (ret)
562 return -EAGAIN; 563 return ret;
563 dp->low_ino = i;
564 564
565 if (S_ISDIR(dp->mode)) { 565 if (S_ISDIR(dp->mode)) {
566 if (dp->proc_iops == NULL) { 566 if (dp->proc_iops == NULL) {
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data);
764 764
765static void free_proc_entry(struct proc_dir_entry *de) 765static void free_proc_entry(struct proc_dir_entry *de)
766{ 766{
767 release_inode_number(de->low_ino); 767 proc_free_inum(de->low_ino);
768 768
769 if (S_ISLNK(de->mode)) 769 if (S_ISLNK(de->mode))
770 kfree(de->data); 770 kfree(de->data);
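The generic.c change above exports the inode-number allocator and converts it from "0 means failure" to ordinary errno returns (-ENOMEM, -ENOSPC), which lets proc_register() forward the real cause instead of a blanket -EAGAIN. The caller pattern after the conversion, sketched with a hypothetical allocator:

#include <errno.h>
#include <stdio.h>

/* Hypothetical allocator mirroring the new proc_alloc_inum() contract:
 * fill *inum on success and return 0, or return a negative errno. */
static int alloc_inum(unsigned int *inum)
{
	static unsigned int next = 0xf0000000u;	/* cf. PROC_DYNAMIC_FIRST */

	if (next == 0)
		return -ENOSPC;		/* the 32-bit space is exhausted */
	*inum = next++;
	return 0;
}

int main(void)
{
	unsigned int ino;
	int ret = alloc_inum(&ino);

	if (ret) {	/* propagate the specific error, not a sentinel */
		fprintf(stderr, "alloc failed: %d\n", ret);
		return 1;
	}
	printf("inode %#x\n", ino);
	return 0;
}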
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3b22bbdee9ec..439ae6886507 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode)
31 struct proc_dir_entry *de; 31 struct proc_dir_entry *de;
32 struct ctl_table_header *head; 32 struct ctl_table_header *head;
33 const struct proc_ns_operations *ns_ops; 33 const struct proc_ns_operations *ns_ops;
34 void *ns;
34 35
35 truncate_inode_pages(&inode->i_data, 0); 36 truncate_inode_pages(&inode->i_data, 0);
36 clear_inode(inode); 37 clear_inode(inode);
@@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode)
49 } 50 }
50 /* Release any associated namespace */ 51 /* Release any associated namespace */
51 ns_ops = PROC_I(inode)->ns_ops; 52 ns_ops = PROC_I(inode)->ns_ops;
52 if (ns_ops && ns_ops->put) 53 ns = PROC_I(inode)->ns;
53 ns_ops->put(PROC_I(inode)->ns); 54 if (ns_ops && ns)
55 ns_ops->put(ns);
54} 56}
55 57
56static struct kmem_cache * proc_inode_cachep; 58static struct kmem_cache * proc_inode_cachep;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 43973b084abf..252544c05207 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -15,6 +15,7 @@ struct ctl_table_header;
15struct mempolicy; 15struct mempolicy;
16 16
17extern struct proc_dir_entry proc_root; 17extern struct proc_dir_entry proc_root;
18extern void proc_self_init(void);
18#ifdef CONFIG_PROC_SYSCTL 19#ifdef CONFIG_PROC_SYSCTL
19extern int proc_sys_init(void); 20extern int proc_sys_init(void);
20extern void sysctl_head_put(struct ctl_table_header *head); 21extern void sysctl_head_put(struct ctl_table_header *head);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 86c67eee439f..e96d4f18ca3a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -249,7 +249,7 @@ static int kcore_update_ram(void)
249 /* Not inialized....update now */ 249 /* Not inialized....update now */
250 /* find out "max pfn" */ 250 /* find out "max pfn" */
251 end_pfn = 0; 251 end_pfn = 0;
252 for_each_node_state(nid, N_HIGH_MEMORY) { 252 for_each_node_state(nid, N_MEMORY) {
253 unsigned long node_end; 253 unsigned long node_end;
254 node_end = NODE_DATA(nid)->node_start_pfn + 254 node_end = NODE_DATA(nid)->node_start_pfn +
255 NODE_DATA(nid)->node_spanned_pages; 255 NODE_DATA(nid)->node_spanned_pages;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index b178ed733c36..b7a47196c8c3 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -11,6 +11,7 @@
11#include <net/net_namespace.h> 11#include <net/net_namespace.h>
12#include <linux/ipc_namespace.h> 12#include <linux/ipc_namespace.h>
13#include <linux/pid_namespace.h> 13#include <linux/pid_namespace.h>
14#include <linux/user_namespace.h>
14#include "internal.h" 15#include "internal.h"
15 16
16 17
@@ -24,12 +25,168 @@ static const struct proc_ns_operations *ns_entries[] = {
24#ifdef CONFIG_IPC_NS 25#ifdef CONFIG_IPC_NS
25 &ipcns_operations, 26 &ipcns_operations,
26#endif 27#endif
28#ifdef CONFIG_PID_NS
29 &pidns_operations,
30#endif
31#ifdef CONFIG_USER_NS
32 &userns_operations,
33#endif
34 &mntns_operations,
27}; 35};
28 36
29static const struct file_operations ns_file_operations = { 37static const struct file_operations ns_file_operations = {
30 .llseek = no_llseek, 38 .llseek = no_llseek,
31}; 39};
32 40
41static const struct inode_operations ns_inode_operations = {
42 .setattr = proc_setattr,
43};
44
45static int ns_delete_dentry(const struct dentry *dentry)
46{
47 /* Don't cache namespace inodes when not in use */
48 return 1;
49}
50
51static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
52{
53 struct inode *inode = dentry->d_inode;
54 const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
55
56 return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
57 ns_ops->name, inode->i_ino);
58}
59
60const struct dentry_operations ns_dentry_operations =
61{
62 .d_delete = ns_delete_dentry,
63 .d_dname = ns_dname,
64};
65
66static struct dentry *proc_ns_get_dentry(struct super_block *sb,
+		struct task_struct *task, const struct proc_ns_operations *ns_ops)
+{
+	struct dentry *dentry, *result;
+	struct inode *inode;
+	struct proc_inode *ei;
+	struct qstr qname = { .name = "", };
+	void *ns;
+
+	ns = ns_ops->get(task);
+	if (!ns)
+		return ERR_PTR(-ENOENT);
+
+	dentry = d_alloc_pseudo(sb, &qname);
+	if (!dentry) {
+		ns_ops->put(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	inode = iget_locked(sb, ns_ops->inum(ns));
+	if (!inode) {
+		dput(dentry);
+		ns_ops->put(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ei = PROC_I(inode);
+	if (inode->i_state & I_NEW) {
+		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		inode->i_op = &ns_inode_operations;
+		inode->i_mode = S_IFREG | S_IRUGO;
+		inode->i_fop = &ns_file_operations;
+		ei->ns_ops = ns_ops;
+		ei->ns = ns;
+		unlock_new_inode(inode);
+	} else {
+		ns_ops->put(ns);
+	}
+
+	d_set_d_op(dentry, &ns_dentry_operations);
+	result = d_instantiate_unique(dentry, inode);
+	if (result) {
+		dput(dentry);
+		dentry = result;
+	}
+
+	return dentry;
+}
+
+static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct proc_inode *ei = PROC_I(inode);
+	struct task_struct *task;
+	struct dentry *ns_dentry;
+	void *error = ERR_PTR(-EACCES);
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_put_task;
+
+	ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops);
+	if (IS_ERR(ns_dentry)) {
+		error = ERR_CAST(ns_dentry);
+		goto out_put_task;
+	}
+
+	dput(nd->path.dentry);
+	nd->path.dentry = ns_dentry;
+	error = NULL;
+
+out_put_task:
+	put_task_struct(task);
+out:
+	return error;
+}
+
+static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+	struct inode *inode = dentry->d_inode;
+	struct proc_inode *ei = PROC_I(inode);
+	const struct proc_ns_operations *ns_ops = ei->ns_ops;
+	struct task_struct *task;
+	void *ns;
+	char name[50];
+	int len = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_put_task;
+
+	len = -ENOENT;
+	ns = ns_ops->get(task);
+	if (!ns)
+		goto out_put_task;
+
+	snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
+	len = strlen(name);
+
+	if (len > buflen)
+		len = buflen;
+	if (copy_to_user(buffer, name, len))
+		len = -EFAULT;
+
+	ns_ops->put(ns);
+out_put_task:
+	put_task_struct(task);
+out:
+	return len;
+}
+
+static const struct inode_operations proc_ns_link_inode_operations = {
+	.readlink	= proc_ns_readlink,
+	.follow_link	= proc_ns_follow_link,
+	.setattr	= proc_setattr,
+};
+
 static struct dentry *proc_ns_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
@@ -37,21 +194,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
 	struct inode *inode;
 	struct proc_inode *ei;
 	struct dentry *error = ERR_PTR(-ENOENT);
-	void *ns;
 
 	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
 		goto out;
 
-	ns = ns_ops->get(task);
-	if (!ns)
-		goto out_iput;
-
 	ei = PROC_I(inode);
-	inode->i_mode = S_IFREG|S_IRUSR;
-	inode->i_fop = &ns_file_operations;
+	inode->i_mode = S_IFLNK|S_IRWXUGO;
+	inode->i_op = &proc_ns_link_inode_operations;
 	ei->ns_ops = ns_ops;
-	ei->ns = ns;
 
 	d_set_d_op(dentry, &pid_dentry_operations);
 	d_add(dentry, inode);
@@ -60,9 +211,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
 	error = NULL;
 out:
 	return error;
-out_iput:
-	iput(inode);
-	goto out;
 }
 
 static int proc_ns_fill_cache(struct file *filp, void *dirent,
@@ -89,10 +237,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent,
 	if (!task)
 		goto out_no_task;
 
-	ret = -EPERM;
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto out;
-
 	ret = 0;
 	i = filp->f_pos;
 	switch (i) {
@@ -152,10 +296,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 	if (!task)
 		goto out_no_task;
 
-	error = ERR_PTR(-EPERM);
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto out;
-
 	last = &ns_entries[ARRAY_SIZE(ns_entries)];
 	for (entry = ns_entries; entry < last; entry++) {
 		if (strlen((*entry)->name) != len)
@@ -163,7 +303,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 		if (!memcmp(dentry->d_name.name, (*entry)->name, len))
 			break;
 	}
-	error = ERR_PTR(-ENOENT);
 	if (entry == last)
 		goto out;
 
@@ -198,3 +337,7 @@ out_invalid:
 	return ERR_PTR(-EINVAL);
 }
 
+bool proc_ns_inode(struct inode *inode)
+{
+	return inode->i_fop == &ns_file_operations;
+}
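
The block above turns each /proc/<pid>/ns/* entry into a symlink-like object backed by one inode per namespace (keyed by ns_ops->inum()), so every task in a given namespace resolves to the same inode and userspace can compare namespaces by inode identity. A hypothetical userspace sketch of that comparison (not part of the patch; same_ns() is an illustrative name):

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/stat.h>

	/* returns 1 if tasks a and b share the namespace, 0 if not, -1 on error */
	int same_ns(pid_t a, pid_t b, const char *ns)	/* e.g. ns = "net" */
	{
		char pa[64], pb[64];
		struct stat sa, sb;

		snprintf(pa, sizeof(pa), "/proc/%d/ns/%s", a, ns);
		snprintf(pb, sizeof(pb), "/proc/%d/ns/%s", b, ns);
		if (stat(pa, &sa) || stat(pb, &sb))
			return -1;
		/* same namespace iff both resolve to the same proc inode */
		return sa.st_dev == sb.st_dev && sa.st_ino == sb.st_ino;
	}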
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index df7dd08d4391..de20ec480fa0 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -195,11 +195,7 @@ void proc_device_tree_add_node(struct device_node *np,
 	set_node_proc_entry(np, de);
 	for (child = NULL; (child = of_get_next_child(np, child));) {
 		/* Use everything after the last slash, or the full name */
-		p = strrchr(child->full_name, '/');
-		if (!p)
-			p = child->full_name;
-		else
-			++p;
+		p = kbasename(child->full_name);
 
 		if (duplicate_name(de, p))
 			p = fixup_name(np, de, p);
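
kbasename() collapses the open-coded strrchr() dance into one call. From memory, the helper in <linux/string.h> is essentially the following; treat this as a sketch rather than a quote:

	static inline const char *kbasename(const char *path)
	{
		const char *tail = strrchr(path, '/');
		/* component after the final '/', or the whole string */
		return tail ? tail + 1 : path;
	}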
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index a781bdf06694..701580ddfcc3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -378,12 +378,13 @@ static int test_perm(int mode, int op)
 	return -EACCES;
 }
 
-static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
+static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
 {
+	struct ctl_table_root *root = head->root;
 	int mode;
 
 	if (root->permissions)
-		mode = root->permissions(root, current->nsproxy, table);
+		mode = root->permissions(head, table);
 	else
 		mode = table->mode;
 
@@ -491,7 +492,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 	 * and won't be until we finish.
 	 */
 	error = -EPERM;
-	if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
+	if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
 		goto out;
 
 	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
@@ -717,7 +718,7 @@ static int proc_sys_permission(struct inode *inode, int mask)
 	if (!table) /* global root - r-xr-xr-x */
 		error = mask & MAY_WRITE ? -EACCES : 0;
 	else /* Use the permissions on the sysctl table entry */
-		error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
+		error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
 		sysctl_head_finish(head);
 	return error;
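
Passing the ctl_table_header instead of the bare root lets a ->permissions() callback recover the owning context from the header itself rather than peeking at current->nsproxy, which is what makes the check meaningful when the caller is in a different namespace. A sketch of what a namespace-aware callback can now look like (illustrative, not part of this hunk; ns_writable() is a hypothetical helper):

	static int example_permissions(struct ctl_table_header *head,
				       struct ctl_table *table)
	{
		/* derive the owning namespace from head->set, not from current */
		if (ns_writable(head))		/* hypothetical check */
			return table->mode | S_IWUSR;
		return table->mode;
	}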
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9889a92d2e01..c6e9fac26bac 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -100,14 +100,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	int err;
 	struct super_block *sb;
 	struct pid_namespace *ns;
-	struct proc_inode *ei;
 	char *options;
 
 	if (flags & MS_KERNMOUNT) {
 		ns = (struct pid_namespace *)data;
 		options = NULL;
 	} else {
-		ns = current->nsproxy->pid_ns;
+		ns = task_active_pid_ns(current);
 		options = data;
 	}
 
@@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		sb->s_flags |= MS_ACTIVE;
 	}
 
-	ei = PROC_I(sb->s_root->d_inode);
-	if (!ei->pid) {
-		rcu_read_lock();
-		ei->pid = get_pid(find_pid_ns(1, ns));
-		rcu_read_unlock();
-	}
-
 	return dget(sb->s_root);
 }
 
@@ -153,6 +145,7 @@ static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 void __init proc_root_init(void)
@@ -163,12 +156,8 @@ void __init proc_root_init(void)
 	err = register_filesystem(&proc_fs_type);
 	if (err)
 		return;
-	err = pid_ns_prepare_proc(&init_pid_ns);
-	if (err) {
-		unregister_filesystem(&proc_fs_type);
-		return;
-	}
 
+	proc_self_init();
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
diff --git a/fs/proc/self.c b/fs/proc/self.c
new file mode 100644
index 000000000000..aa5cc3bff140
--- /dev/null
+++ b/fs/proc/self.c
@@ -0,0 +1,59 @@
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+
+/*
+ * /proc/self:
+ */
+static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
+			      int buflen)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	char tmp[PROC_NUMBUF];
+	if (!tgid)
+		return -ENOENT;
+	sprintf(tmp, "%d", tgid);
+	return vfs_readlink(dentry,buffer,buflen,tmp);
+}
+
+static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	char *name = ERR_PTR(-ENOENT);
+	if (tgid) {
+		/* 11 for max length of signed int in decimal + NULL term */
+		name = kmalloc(12, GFP_KERNEL);
+		if (!name)
+			name = ERR_PTR(-ENOMEM);
+		else
+			sprintf(name, "%d", tgid);
+	}
+	nd_set_link(nd, name);
+	return NULL;
+}
+
+static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		kfree(s);
+}
+
+static const struct inode_operations proc_self_inode_operations = {
+	.readlink	= proc_self_readlink,
+	.follow_link	= proc_self_follow_link,
+	.put_link	= proc_self_put_link,
+};
+
+void __init proc_self_init(void)
+{
+	struct proc_dir_entry *proc_self_symlink;
+	mode_t mode;
+
+	mode = S_IFLNK | S_IRWXUGO;
+	proc_self_symlink = proc_create("self", mode, NULL, NULL );
+	proc_self_symlink->proc_iops = &proc_self_inode_operations;
+}
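
The follow_link/put_link pair above is the stock VFS idiom for a symlink whose target is computed on the fly: ->follow_link allocates the target string and parks it in the nameidata with nd_set_link(), and the VFS calls ->put_link after the walk so the allocation can be freed. A minimal sketch of the contract (illustrative names, assuming the same kernel interfaces as the file above):

	static void *example_follow_link(struct dentry *d, struct nameidata *nd)
	{
		char *s = kmalloc(12, GFP_KERNEL);

		if (s)
			sprintf(s, "%d", 42);	/* compute the target here */
		nd_set_link(nd, s ? s : ERR_PTR(-ENOMEM));
		return NULL;			/* cookie handed to ->put_link */
	}

	static void example_put_link(struct dentry *d, struct nameidata *nd,
				     void *cookie)
	{
		char *s = nd_get_link(nd);

		if (!IS_ERR(s))
			kfree(s);		/* free what follow_link allocated */
	}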
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9392a5..448455b7fd91 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -526,6 +526,57 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return 0;
 }
 
+static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
+{
+	/*
+	 * Don't forget to update Documentation/ on changes.
+	 */
+	static const char mnemonics[BITS_PER_LONG][2] = {
+		/*
+		 * In case if we meet a flag we don't know about.
+		 */
+		[0 ... (BITS_PER_LONG-1)] = "??",
+
+		[ilog2(VM_READ)]	= "rd",
+		[ilog2(VM_WRITE)]	= "wr",
+		[ilog2(VM_EXEC)]	= "ex",
+		[ilog2(VM_SHARED)]	= "sh",
+		[ilog2(VM_MAYREAD)]	= "mr",
+		[ilog2(VM_MAYWRITE)]	= "mw",
+		[ilog2(VM_MAYEXEC)]	= "me",
+		[ilog2(VM_MAYSHARE)]	= "ms",
+		[ilog2(VM_GROWSDOWN)]	= "gd",
+		[ilog2(VM_PFNMAP)]	= "pf",
+		[ilog2(VM_DENYWRITE)]	= "dw",
+		[ilog2(VM_LOCKED)]	= "lo",
+		[ilog2(VM_IO)]		= "io",
+		[ilog2(VM_SEQ_READ)]	= "sr",
+		[ilog2(VM_RAND_READ)]	= "rr",
+		[ilog2(VM_DONTCOPY)]	= "dc",
+		[ilog2(VM_DONTEXPAND)]	= "de",
+		[ilog2(VM_ACCOUNT)]	= "ac",
+		[ilog2(VM_NORESERVE)]	= "nr",
+		[ilog2(VM_HUGETLB)]	= "ht",
+		[ilog2(VM_NONLINEAR)]	= "nl",
+		[ilog2(VM_ARCH_1)]	= "ar",
+		[ilog2(VM_DONTDUMP)]	= "dd",
+		[ilog2(VM_MIXEDMAP)]	= "mm",
+		[ilog2(VM_HUGEPAGE)]	= "hg",
+		[ilog2(VM_NOHUGEPAGE)]	= "nh",
+		[ilog2(VM_MERGEABLE)]	= "mg",
+	};
+	size_t i;
+
+	seq_puts(m, "VmFlags: ");
+	for (i = 0; i < BITS_PER_LONG; i++) {
+		if (vma->vm_flags & (1UL << i)) {
+			seq_printf(m, "%c%c ",
+				   mnemonics[i][0], mnemonics[i][1]);
+		}
+	}
+	seq_putc(m, '\n');
+}
+
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
 	struct proc_maps_private *priv = m->private;
@@ -581,6 +632,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		seq_printf(m, "Nonlinear:      %8lu kB\n",
 			   mss.nonlinear >> 10);
 
+	show_smap_vma_flags(m, vma);
+
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task->mm))
 			? vma->vm_start : 0;
@@ -643,7 +696,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	spinlock_t *ptl;
 	struct page *page;
 
-	split_huge_page_pmd(walk->mm, pmd);
+	split_huge_page_pmd(vma, addr, pmd);
 	if (pmd_trans_unstable(pmd))
 		return 0;
 
@@ -1126,7 +1179,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 		return NULL;
 
 	nid = page_to_nid(page);
-	if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+	if (!node_isset(nid, node_states[N_MEMORY]))
 		return NULL;
 
 	return page;
@@ -1279,7 +1332,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	if (md->writeback)
 		seq_printf(m, " writeback=%lu", md->writeback);
 
-	for_each_node_state(n, N_HIGH_MEMORY)
+	for_each_node_state(n, N_MEMORY)
 		if (md->node[n])
 			seq_printf(m, " N%d=%lu", n, md->node[n]);
 out:
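
With this change each /proc/<pid>/smaps entry grows a trailing line of two-letter mnemonics, one per set VM_* bit. Illustrative output (values made up for the example):

	VmFlags: rd ex mr mw me de

which decodes via the table above to VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_DONTEXPAND, so tools no longer have to infer VMA flags from the permission column alone.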
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 2d57e1ac0115..43b12807a51d 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -28,7 +28,9 @@
 #include "internal.h"
 
 static void notrace pstore_ftrace_call(unsigned long ip,
-				       unsigned long parent_ip)
+				       unsigned long parent_ip,
+				       struct ftrace_ops *op,
+				       struct pt_regs *regs)
 {
 	unsigned long flags;
 	struct pstore_ftrace_record rec = {};
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 4ab572e6d277..67de74ca85f4 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -49,6 +49,7 @@ struct pstore_private {
 	struct pstore_info *psi;
 	enum pstore_type_id type;
 	u64	id;
+	int	count;
 	ssize_t	size;
 	char	data[];
 };
@@ -150,13 +151,13 @@ static int pstore_file_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static loff_t pstore_file_llseek(struct file *file, loff_t off, int origin)
+static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
 {
 	struct seq_file *sf = file->private_data;
 
 	if (sf->op)
-		return seq_lseek(file, off, origin);
-	return default_llseek(file, off, origin);
+		return seq_lseek(file, off, whence);
+	return default_llseek(file, off, whence);
 }
 
 static const struct file_operations pstore_file_operations = {
@@ -175,7 +176,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
 	struct pstore_private *p = dentry->d_inode->i_private;
 
 	if (p->psi->erase)
-		p->psi->erase(p->type, p->id, p->psi);
+		p->psi->erase(p->type, p->id, p->count,
+			      dentry->d_inode->i_ctime, p->psi);
 
 	return simple_unlink(dir, dentry);
 }
@@ -270,7 +272,7 @@ int pstore_is_mounted(void)
 * Load it up with "size" bytes of data from "buf".
 * Set the mtime & ctime to the date that this record was originally stored.
 */
-int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
+int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 		  char *data, size_t size, struct timespec time,
 		  struct pstore_info *psi)
 {
@@ -306,6 +308,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
 		goto fail_alloc;
 	private->type = type;
 	private->id = id;
+	private->count = count;
 	private->psi = psi;
 
 	switch (type) {
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 4847f588b7d5..937d820f273c 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -50,7 +50,7 @@ extern struct pstore_info *psinfo;
 extern void	pstore_set_kmsg_bytes(int);
 extern void	pstore_get_records(int);
 extern int	pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
-			      char *data, size_t size,
+			      int count, char *data, size_t size,
			      struct timespec time, struct pstore_info *psi);
 extern int	pstore_is_mounted(void);
 
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index a40da07e93d6..5ea2e77ff023 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -136,7 +136,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 			break;
 
 		ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
-				    hsize + len, psinfo);
+				    oopscount, hsize + len, psinfo);
 		if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
 			pstore_new_entry = 1;
 
@@ -161,6 +161,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 
 	while (s < e) {
 		unsigned long flags;
+		u64 id;
 
 		if (c > psinfo->bufsize)
 			c = psinfo->bufsize;
@@ -172,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 			spin_lock_irqsave(&psinfo->buf_lock, flags);
 		}
 		memcpy(psinfo->buf, s, c);
-		psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo);
+		psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo);
 		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 		s += c;
 		c = e - s;
@@ -196,7 +197,7 @@ static void pstore_register_console(void) {}
 
 static int pstore_write_compat(enum pstore_type_id type,
 			       enum kmsg_dump_reason reason,
-			       u64 *id, unsigned int part,
+			       u64 *id, unsigned int part, int count,
 			       size_t size, struct pstore_info *psi)
 {
 	return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi);
@@ -266,6 +267,7 @@ void pstore_get_records(int quiet)
 	char			*buf = NULL;
 	ssize_t			size;
 	u64			id;
+	int			count;
 	enum pstore_type_id	type;
 	struct timespec		time;
 	int			failed = 0, rc;
@@ -277,9 +279,9 @@ void pstore_get_records(int quiet)
 	if (psi->open && psi->open(psi))
 		goto out;
 
-	while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) {
-		rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size,
-				   time, psi);
+	while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) {
+		rc = pstore_mkfile(type, psi->name, id, count, buf,
+				   (size_t)size, time, psi);
 		kfree(buf);
 		buf = NULL;
 		if (rc && (rc != -EEXIST || !quiet))
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1a4f6da58eab..f883e7e74305 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -132,9 +132,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
 }
 
 static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
-				   struct timespec *time,
-				   char **buf,
-				   struct pstore_info *psi)
+				   int *count, struct timespec *time,
+				   char **buf, struct pstore_info *psi)
 {
 	ssize_t size;
 	struct ramoops_context *cxt = psi->data;
@@ -189,7 +188,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 				  struct pstore_info *psi)
 {
 	struct ramoops_context *cxt = psi->data;
-	struct persistent_ram_zone *prz = cxt->przs[cxt->dump_write_cnt];
+	struct persistent_ram_zone *prz;
 	size_t hlen;
 
 	if (type == PSTORE_TYPE_CONSOLE) {
@@ -226,6 +225,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 	if (part != 1)
 		return -ENOSPC;
 
+	if (!cxt->przs)
+		return -ENOSPC;
+
+	prz = cxt->przs[cxt->dump_write_cnt];
+
 	hlen = ramoops_write_kmsg_hdr(prz);
 	if (size + hlen > prz->buffer_size)
 		size = prz->buffer_size - hlen;
@@ -236,8 +240,8 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 	return 0;
 }
 
-static int ramoops_pstore_erase(enum pstore_type_id type, u64 id,
-				struct pstore_info *psi)
+static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
+				struct timespec time, struct pstore_info *psi)
 {
 	struct ramoops_context *cxt = psi->data;
 	struct persistent_ram_zone *prz;
@@ -287,8 +291,9 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
 	kfree(cxt->przs);
 }
 
-static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
-			     phys_addr_t *paddr, size_t dump_mem_sz)
+static int __devinit ramoops_init_przs(struct device *dev,
+				       struct ramoops_context *cxt,
+				       phys_addr_t *paddr, size_t dump_mem_sz)
 {
 	int err = -ENOMEM;
 	int i;
@@ -296,6 +301,11 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
 	if (!cxt->record_size)
 		return 0;
 
+	if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) {
+		dev_err(dev, "no room for dumps\n");
+		return -ENOMEM;
+	}
+
 	cxt->max_dump_cnt = dump_mem_sz / cxt->record_size;
 	if (!cxt->max_dump_cnt)
 		return -ENOMEM;
@@ -326,15 +336,20 @@ fail_prz:
 	return err;
 }
 
-static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
-			    struct persistent_ram_zone **prz,
-			    phys_addr_t *paddr, size_t sz, u32 sig)
+static int __devinit ramoops_init_prz(struct device *dev,
+				      struct ramoops_context *cxt,
+				      struct persistent_ram_zone **prz,
+				      phys_addr_t *paddr, size_t sz, u32 sig)
 {
 	if (!sz)
 		return 0;
 
-	if (*paddr + sz > *paddr + cxt->size)
+	if (*paddr + sz - cxt->phys_addr > cxt->size) {
+		dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
+			sz, (unsigned long long)*paddr,
+			cxt->size, (unsigned long long)cxt->phys_addr);
 		return -ENOMEM;
+	}
 
 	*prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size);
 	if (IS_ERR(*prz)) {
@@ -374,10 +389,14 @@ static int __devinit ramoops_probe(struct platform_device *pdev)
 		goto fail_out;
 	}
 
-	pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
-	pdata->record_size = rounddown_pow_of_two(pdata->record_size);
-	pdata->console_size = rounddown_pow_of_two(pdata->console_size);
-	pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
+	if (!is_power_of_2(pdata->mem_size))
+		pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
+	if (!is_power_of_2(pdata->record_size))
+		pdata->record_size = rounddown_pow_of_two(pdata->record_size);
+	if (!is_power_of_2(pdata->console_size))
+		pdata->console_size = rounddown_pow_of_two(pdata->console_size);
+	if (!is_power_of_2(pdata->ftrace_size))
+		pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
 
 	cxt->dump_read_cnt = 0;
 	cxt->size = pdata->mem_size;
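
The bounds-check rewrite in ramoops_init_prz() matters: the old guard compared against a bound that moved with *paddr, so it could only trip on integer overflow. Rewriting it relative to the fixed region start makes the test meaningful:

	/* region is [phys_addr, phys_addr + size); candidate is [*paddr, *paddr + sz) */
	if (*paddr + sz - cxt->phys_addr > cxt->size)
		return -ENOMEM;	/* candidate runs past the end of the region */

Worked example (made-up numbers): with phys_addr = 0x1000, size = 0x1000 and *paddr = 0x1800, a request of sz = 0x1000 gives 0x1800 + 0x1000 - 0x1000 = 0x1800 > 0x1000, so it is correctly rejected, whereas the old test (0x2800 > 0x2800) let it through.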
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index af1661f7a54f..c7314f1771f5 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -307,6 +307,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 	}
 }
 
+#ifdef CONFIG_BLOCK
+
 /* Return 1 if 'cmd' will block on frozen filesystem */
 static int quotactl_cmd_write(int cmd)
 {
@@ -322,6 +324,8 @@ static int quotactl_cmd_write(int cmd)
 	return 1;
 }
 
+#endif /* CONFIG_BLOCK */
+
 /*
 * look up a superblock on which quota ops will be performed
 * - use the name of a block device to find the superblock thereon
diff --git a/fs/read_write.c b/fs/read_write.c
index d06534857e9e..1edaf099ddd7 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -54,7 +54,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
- * @origin:	type of seek
+ * @whence:	type of seek
 * @size:	max size of this file in file system
 * @eof:	offset used for SEEK_END position
 *
@@ -67,12 +67,12 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
 * read/writes behave like SEEK_SET against seeks.
 */
 loff_t
-generic_file_llseek_size(struct file *file, loff_t offset, int origin,
+generic_file_llseek_size(struct file *file, loff_t offset, int whence,
		loff_t maxsize, loff_t eof)
 {
 	struct inode *inode = file->f_mapping->host;
 
-	switch (origin) {
+	switch (whence) {
 	case SEEK_END:
 		offset += eof;
 		break;
@@ -122,17 +122,17 @@ EXPORT_SYMBOL(generic_file_llseek_size);
 * generic_file_llseek - generic llseek implementation for regular files
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
- * @origin:	type of seek
+ * @whence:	type of seek
 *
 * This is a generic implemenation of ->llseek useable for all normal local
 * filesystems.  It just updates the file offset to the value specified by
- * @offset and @origin under i_mutex.
+ * @offset and @whence under i_mutex.
 */
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;
 
-	return generic_file_llseek_size(file, offset, origin,
+	return generic_file_llseek_size(file, offset, whence,
					inode->i_sb->s_maxbytes,
					i_size_read(inode));
 }
@@ -142,32 +142,32 @@ EXPORT_SYMBOL(generic_file_llseek);
 * noop_llseek - No Operation Performed llseek implementation
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
- * @origin:	type of seek
+ * @whence:	type of seek
 *
 * This is an implementation of ->llseek useable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 */
-loff_t noop_llseek(struct file *file, loff_t offset, int origin)
+loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 {
 	return file->f_pos;
 }
 EXPORT_SYMBOL(noop_llseek);
 
-loff_t no_llseek(struct file *file, loff_t offset, int origin)
+loff_t no_llseek(struct file *file, loff_t offset, int whence)
 {
 	return -ESPIPE;
 }
 EXPORT_SYMBOL(no_llseek);
 
-loff_t default_llseek(struct file *file, loff_t offset, int origin)
+loff_t default_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	loff_t retval;
 
 	mutex_lock(&inode->i_mutex);
-	switch (origin) {
+	switch (whence) {
 	case SEEK_END:
 		offset += i_size_read(inode);
 		break;
@@ -216,7 +216,7 @@ out:
 }
 EXPORT_SYMBOL(default_llseek);
 
-loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
+loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 {
 	loff_t (*fn)(struct file *, loff_t, int);
 
@@ -225,11 +225,11 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 		if (file->f_op && file->f_op->llseek)
			fn = file->f_op->llseek;
 	}
-	return fn(file, offset, origin);
+	return fn(file, offset, whence);
 }
 EXPORT_SYMBOL(vfs_llseek);
 
-SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
+SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 {
 	off_t retval;
 	struct fd f = fdget(fd);
@@ -237,8 +237,8 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
		return -EBADF;
 
 	retval = -EINVAL;
-	if (origin <= SEEK_MAX) {
-		loff_t res = vfs_llseek(f.file, offset, origin);
+	if (whence <= SEEK_MAX) {
+		loff_t res = vfs_llseek(f.file, offset, whence);
 		retval = res;
 		if (res != (loff_t)retval)
			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
@@ -250,7 +250,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 #ifdef __ARCH_WANT_SYS_LLSEEK
 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
-		unsigned int, origin)
+		unsigned int, whence)
 {
 	int retval;
 	struct fd f = fdget(fd);
@@ -260,11 +260,11 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		return -EBADF;
 
 	retval = -EINVAL;
-	if (origin > SEEK_MAX)
+	if (whence > SEEK_MAX)
		goto out_putf;
 
 	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
-			origin);
+			whence);
 
 	retval = (int)offset;
 	if (offset >= 0) {
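
The rename is purely cosmetic: "whence" is the name POSIX and the lseek(2) man page use for the third argument, so the kernel-internal naming now matches what callers see. Call sites are unchanged, e.g. from userspace:

	off_t end = lseek(fd, 0, SEEK_END);	/* whence = SEEK_END: file size */
	lseek(fd, 0, SEEK_SET);			/* rewind to the start */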
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f27f01a98aa2..d83736fbc26c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1782,8 +1782,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
 	BUG_ON(!th->t_trans_id);
 
-	dquot_initialize(inode);
+	reiserfs_write_unlock(inode->i_sb);
 	err = dquot_alloc_inode(inode);
+	reiserfs_write_lock(inode->i_sb);
 	if (err)
 		goto out_end_trans;
 	if (!dir->i_nlink) {
@@ -1979,8 +1980,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
       out_end_trans:
 	journal_end(th, th->t_super, th->t_blocks_allocated);
+	reiserfs_write_unlock(inode->i_sb);
 	/* Drop can be outside and it needs more credits so it's better to have it outside */
 	dquot_drop(inode);
+	reiserfs_write_lock(inode->i_sb);
 	inode->i_flags |= S_NOQUOTA;
 	make_bad_inode(inode);
 
@@ -3103,10 +3106,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	/* must be turned off for recursive notify_change calls */
 	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
 
-	depth = reiserfs_write_lock_once(inode->i_sb);
 	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
-
+	depth = reiserfs_write_lock_once(inode->i_sb);
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* version 2 items will be caught by the s_maxbytes check
 		** done for us in vmtruncate
@@ -3170,7 +3172,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	error = journal_begin(&th, inode->i_sb, jbegin_count);
 	if (error)
 		goto out;
+	reiserfs_write_unlock_once(inode->i_sb, depth);
 	error = dquot_transfer(inode, attr);
+	depth = reiserfs_write_lock_once(inode->i_sb);
 	if (error) {
 		journal_end(&th, inode->i_sb, jbegin_count);
 		goto out;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index f8afa4b162b8..2f40a4c70a4d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1968,7 +1968,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 		       key2type(&(key->on_disk_key)));
 #endif
 
+	reiserfs_write_unlock(inode->i_sb);
 	retval = dquot_alloc_space_nodirty(inode, pasted_size);
+	reiserfs_write_lock(inode->i_sb);
 	if (retval) {
 		pathrelse(search_path);
 		return retval;
@@ -2061,9 +2063,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
 		       "reiserquota insert_item(): allocating %u id=%u type=%c",
 		       quota_bytes, inode->i_uid, head2type(ih));
 #endif
+	reiserfs_write_unlock(inode->i_sb);
 	/* We can't dirty inode here. It would be immediately written but
 	 * appropriate stat item isn't inserted yet... */
 	retval = dquot_alloc_space_nodirty(inode, quota_bytes);
+	reiserfs_write_lock(inode->i_sb);
 	if (retval) {
 		pathrelse(path);
 		return retval;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1078ae179993..418bdc3a57da 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -298,7 +298,9 @@ static int finish_unfinished(struct super_block *s)
 			retval = remove_save_link_only(s, &save_link_key, 0);
 			continue;
 		}
+		reiserfs_write_unlock(s);
 		dquot_initialize(inode);
+		reiserfs_write_lock(s);
 
 		if (truncate && S_ISDIR(inode->i_mode)) {
 			/* We got a truncate request for a dir which is impossible.
@@ -1335,7 +1337,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 				kfree(qf_names[i]);
 #endif
 		err = -EINVAL;
-		goto out_err;
+		goto out_unlock;
 	}
 #ifdef CONFIG_QUOTA
 	handle_quota_files(s, qf_names, &qfmt);
@@ -1379,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	if (blocks) {
 		err = reiserfs_resize(s, blocks);
 		if (err != 0)
-			goto out_err;
+			goto out_unlock;
 	}
 
 	if (*mount_flags & MS_RDONLY) {
@@ -1389,9 +1391,15 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 			/* it is read-only already */
 			goto out_ok;
 
+		/*
+		 * Drop write lock. Quota will retake it when needed and lock
+		 * ordering requires calling dquot_suspend() without it.
+		 */
+		reiserfs_write_unlock(s);
 		err = dquot_suspend(s, -1);
 		if (err < 0)
 			goto out_err;
+		reiserfs_write_lock(s);
 
 		/* try to remount file system with read-only permissions */
 		if (sb_umount_state(rs) == REISERFS_VALID_FS
@@ -1401,7 +1409,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 		err = journal_begin(&th, s, 10);
 		if (err)
-			goto out_err;
+			goto out_unlock;
 
 		/* Mounting a rw partition read-only. */
 		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1416,7 +1424,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 		if (reiserfs_is_journal_aborted(journal)) {
 			err = journal->j_errno;
-			goto out_err;
+			goto out_unlock;
 		}
 
 		handle_data_mode(s, mount_options);
@@ -1425,7 +1433,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		s->s_flags &= ~MS_RDONLY;	/* now it is safe to call journal_begin */
 		err = journal_begin(&th, s, 10);
 		if (err)
-			goto out_err;
+			goto out_unlock;
 
 		/* Mount a partition which is read-only, read-write */
 		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1442,10 +1450,16 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	SB_JOURNAL(s)->j_must_wait = 1;
 	err = journal_end(&th, s, 10);
 	if (err)
-		goto out_err;
+		goto out_unlock;
 
 	if (!(*mount_flags & MS_RDONLY)) {
+		/*
+		 * Drop write lock. Quota will retake it when needed and lock
+		 * ordering requires calling dquot_resume() without it.
+		 */
+		reiserfs_write_unlock(s);
 		dquot_resume(s, -1);
+		reiserfs_write_lock(s);
 		finish_unfinished(s);
 		reiserfs_xattr_init(s, *mount_flags);
 	}
@@ -1455,9 +1469,10 @@ out_ok:
 	reiserfs_write_unlock(s);
 	return 0;
 
+out_unlock:
+	reiserfs_write_unlock(s);
 out_err:
 	kfree(new_opts);
-	reiserfs_write_unlock(s);
 	return err;
 }
 
@@ -2095,13 +2110,15 @@ static int reiserfs_write_dquot(struct dquot *dquot)
 				  REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
 	if (ret)
 		goto out;
+	reiserfs_write_unlock(dquot->dq_sb);
 	ret = dquot_commit(dquot);
+	reiserfs_write_lock(dquot->dq_sb);
 	err =
 	    journal_end(&th, dquot->dq_sb,
 			REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
 	if (!ret && err)
 		ret = err;
-      out:
+out:
 	reiserfs_write_unlock(dquot->dq_sb);
 	return ret;
 }
@@ -2117,13 +2134,15 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
 				  REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
 	if (ret)
 		goto out;
+	reiserfs_write_unlock(dquot->dq_sb);
 	ret = dquot_acquire(dquot);
+	reiserfs_write_lock(dquot->dq_sb);
 	err =
 	    journal_end(&th, dquot->dq_sb,
 			REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
 	if (!ret && err)
 		ret = err;
-      out:
+out:
 	reiserfs_write_unlock(dquot->dq_sb);
 	return ret;
 }
@@ -2137,19 +2156,21 @@ static int reiserfs_release_dquot(struct dquot *dquot)
 	ret =
 	    journal_begin(&th, dquot->dq_sb,
 			  REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+	reiserfs_write_unlock(dquot->dq_sb);
 	if (ret) {
 		/* Release dquot anyway to avoid endless cycle in dqput() */
 		dquot_release(dquot);
 		goto out;
 	}
 	ret = dquot_release(dquot);
+	reiserfs_write_lock(dquot->dq_sb);
 	err =
 	    journal_end(&th, dquot->dq_sb,
 			REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
 	if (!ret && err)
 		ret = err;
-      out:
 	reiserfs_write_unlock(dquot->dq_sb);
+out:
 	return ret;
 }
 
@@ -2174,11 +2195,13 @@ static int reiserfs_write_info(struct super_block *sb, int type)
 	ret = journal_begin(&th, sb, 2);
 	if (ret)
 		goto out;
+	reiserfs_write_unlock(sb);
 	ret = dquot_commit_info(sb, type);
+	reiserfs_write_lock(sb);
 	err = journal_end(&th, sb, 2);
 	if (!ret && err)
 		ret = err;
-      out:
+out:
 	reiserfs_write_unlock(sb);
 	return ret;
 }
@@ -2203,8 +2226,11 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	struct reiserfs_transaction_handle th;
 	int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
 
-	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt)))
-		return -EINVAL;
+	reiserfs_write_lock(sb);
+	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	/* Quotafile not on the same filesystem? */
 	if (path->dentry->d_sb != sb) {
@@ -2246,8 +2272,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = dquot_quota_on(sb, type, format_id, path);
+	reiserfs_write_unlock(sb);
+	return dquot_quota_on(sb, type, format_id, path);
 out:
+	reiserfs_write_unlock(sb);
 	return err;
 }
 
@@ -2320,7 +2348,9 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 		tocopy = sb->s_blocksize - offset < towrite ?
 		    sb->s_blocksize - offset : towrite;
 		tmp_bh.b_state = 0;
+		reiserfs_write_lock(sb);
 		err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
+		reiserfs_write_unlock(sb);
 		if (err)
 			goto out;
 		if (offset || tocopy != sb->s_blocksize)
@@ -2336,10 +2366,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 		flush_dcache_page(bh->b_page);
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
+		reiserfs_write_lock(sb);
 		reiserfs_prepare_for_journal(sb, bh, 1);
 		journal_mark_dirty(current->journal_info, sb, bh);
 		if (!journal_quota)
 			reiserfs_add_ordered_list(inode, bh);
+		reiserfs_write_unlock(sb);
 		brelse(bh);
 		offset = 0;
 		towrite -= tocopy;
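
All of the reiserfs unlock/relock pairs in this patch follow one rule: the per-superblock reiserfs write lock ranks below the quota locks, so any dquot_*() call that may take quota locks has to run with the write lock dropped, and the lock is retaken before touching the journal again. Schematically (a sketch of the pattern, not a new API):

	reiserfs_write_unlock(sb);	/* drop FS lock: quota locks rank above it */
	ret = dquot_commit(dquot);	/* may take dq_lock, dqio_mutex, ... */
	reiserfs_write_lock(sb);	/* retake before journal_end() */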
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 99dffab4c4e4..9d863fb501f9 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -300,14 +300,14 @@ EXPORT_SYMBOL(seq_read);
 *
 *	Ready-made ->f_op->llseek()
 */
-loff_t seq_lseek(struct file *file, loff_t offset, int origin)
+loff_t seq_lseek(struct file *file, loff_t offset, int whence)
 {
 	struct seq_file *m = file->private_data;
 	loff_t retval = -EINVAL;
 
 	mutex_lock(&m->lock);
 	m->version = file->f_version;
-	switch (origin) {
+	switch (whence) {
 	case 1:
 		offset += file->f_pos;
 	case 0:
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 8bee4e570911..b53486961735 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -29,6 +29,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/signalfd.h>
 #include <linux/syscalls.h>
+#include <linux/proc_fs.h>
 
 void signalfd_cleanup(struct sighand_struct *sighand)
 {
@@ -227,7 +228,24 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
 	return total ? total: ret;
 }
 
+#ifdef CONFIG_PROC_FS
+static int signalfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct signalfd_ctx *ctx = f->private_data;
+	sigset_t sigmask;
+
+	sigmask = ctx->sigmask;
+	signotset(&sigmask);
+	render_sigset_t(m, "sigmask:\t", &sigmask);
+
+	return 0;
+}
+#endif
+
 static const struct file_operations signalfd_fops = {
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= signalfd_show_fdinfo,
+#endif
 	.release	= signalfd_release,
 	.poll		= signalfd_poll,
 	.read		= signalfd_read,
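
With ->show_fdinfo wired up, the signal mask of a signalfd becomes visible through procfs, which is the kind of per-fd state checkpoint/restore tooling needs. Illustrative read (pos/flags come from the generic fdinfo code; the mask value is made up for the example):

	$ cat /proc/$PID/fdinfo/$FD
	pos:	0
	flags:	02
	sigmask:	0000000000000200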
diff --git a/fs/splice.c b/fs/splice.c
index 13e5b4776e7a..8890604e3fcd 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1024,17 +1024,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	ret = sd.num_spliced;
 
 	if (ret > 0) {
-		unsigned long nr_pages;
 		int err;
 
-		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
 		err = generic_write_sync(out, *ppos, ret);
 		if (err)
 			ret = err;
 		else
 			*ppos += ret;
-		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+		balance_dirty_pages_ratelimited(mapping);
 	}
 	sb_end_write(inode->i_sb);
 
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 00012e31829d..602f56db0442 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -485,8 +485,8 @@ const struct file_operations sysfs_file_operations = {
 	.poll		= sysfs_poll,
 };
 
-int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
-		  const void **pns)
+static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
+			 const void **pns)
 {
 	struct sysfs_dirent *dir_sd = kobj->sd;
 	const struct sysfs_ops *ops;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 71eb7e253927..db940a9be045 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -149,6 +149,7 @@ static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 int __init sysfs_init(void)
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 62911637e12f..12817ffc7345 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2560,7 +2560,7 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
 static int corrupt_data(const struct ubifs_info *c, const void *buf,
			unsigned int len)
 {
-	unsigned int from, to, i, ffs = chance(1, 2);
+	unsigned int from, to, ffs = chance(1, 2);
 	unsigned char *p = (void *)buf;
 
 	from = random32() % (len + 1);
@@ -2571,11 +2571,9 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
		 ffs ? "0xFFs" : "random data");
 
 	if (ffs)
-		for (i = from; i < to; i++)
-			p[i] = 0xFF;
+		memset(p + from, 0xFF, to - from);
 	else
-		for (i = from; i < to; i++)
-			p[i] = random32() % 0x100;
+		prandom_bytes(p + from, to - from);
 
 	return to;
 }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e271fba1651b..8a574776a493 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -453,11 +453,11 @@ out:
 }
 
 /* If a directory is seeked, we have to free saved readdir() state */
-static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin)
+static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
 {
 	kfree(file->private_data);
 	file->private_data = NULL;
-	return generic_file_llseek(file, offset, origin);
+	return generic_file_llseek(file, offset, whence);
 }
 
 /* Free saved readdir() state when the directory is closed */
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 28ec13af28d9..2dcf3d473fec 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -681,8 +681,16 @@ int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
 	if (!lprops) {
 		lprops = ubifs_fast_find_freeable(c);
 		if (!lprops) {
-			ubifs_assert(c->freeable_cnt == 0);
-			if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+			/*
+			 * The first condition means the following: go scan the
+			 * LPT if there are uncategorized lprops, which means
+			 * there may be freeable LEBs there (UBIFS does not
+			 * store the information about freeable LEBs in the
+			 * master node).
+			 */
+			if (c->in_a_category_cnt != c->main_lebs ||
+			    c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+				ubifs_assert(c->freeable_cnt == 0);
				lprops = scan_for_leb_for_idx(c);
				if (IS_ERR(lprops)) {
					err = PTR_ERR(lprops);
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index e5a2a35a46dc..46190a7c42a6 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -300,8 +300,11 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
 	default:
 		ubifs_assert(0);
 	}
+
 	lprops->flags &= ~LPROPS_CAT_MASK;
 	lprops->flags |= cat;
+	c->in_a_category_cnt += 1;
+	ubifs_assert(c->in_a_category_cnt <= c->main_lebs);
 }
 
 /**
@@ -334,6 +337,9 @@ static void ubifs_remove_from_cat(struct ubifs_info *c,
 	default:
 		ubifs_assert(0);
 	}
+
+	c->in_a_category_cnt -= 1;
+	ubifs_assert(c->in_a_category_cnt >= 0);
 }
 
 /**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 5486346d0a3f..d133c276fe05 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1183,6 +1183,8 @@ struct ubifs_debug_info;
1183 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) 1183 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1184 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) 1184 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1185 * @freeable_cnt: number of freeable LEBs in @freeable_list 1185 * @freeable_cnt: number of freeable LEBs in @freeable_list
1186 * @in_a_category_cnt: count of lprops which are in a certain category, which
1187 * basically means that they were loaded from the flash
1186 * 1188 *
1187 * @ltab_lnum: LEB number of LPT's own lprops table 1189 * @ltab_lnum: LEB number of LPT's own lprops table
1188 * @ltab_offs: offset of LPT's own lprops table 1190 * @ltab_offs: offset of LPT's own lprops table
@@ -1412,6 +1414,7 @@ struct ubifs_info {
1412 struct list_head freeable_list; 1414 struct list_head freeable_list;
1413 struct list_head frdi_idx_list; 1415 struct list_head frdi_idx_list;
1414 int freeable_cnt; 1416 int freeable_cnt;
1417 int in_a_category_cnt;
1415 1418
1416 int ltab_lnum; 1419 int ltab_lnum;
1417 int ltab_offs; 1420 int ltab_offs;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index df88b957ccf0..cbae1ed0b7c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -587,7 +587,6 @@ out:
587static sector_t inode_getblk(struct inode *inode, sector_t block, 587static sector_t inode_getblk(struct inode *inode, sector_t block,
588 int *err, int *new) 588 int *err, int *new)
589{ 589{
590 static sector_t last_block;
591 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; 590 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
592 struct extent_position prev_epos, cur_epos, next_epos; 591 struct extent_position prev_epos, cur_epos, next_epos;
593 int count = 0, startnum = 0, endnum = 0; 592 int count = 0, startnum = 0, endnum = 0;
@@ -601,6 +600,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
601 struct udf_inode_info *iinfo = UDF_I(inode); 600 struct udf_inode_info *iinfo = UDF_I(inode);
602 int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; 601 int goal = 0, pgoal = iinfo->i_location.logicalBlockNum;
603 int lastblock = 0; 602 int lastblock = 0;
603 bool isBeyondEOF;
604 604
605 *err = 0; 605 *err = 0;
606 *new = 0; 606 *new = 0;
@@ -676,11 +676,10 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
676 return newblock; 676 return newblock;
677 } 677 }
678 678
679 last_block = block;
680 /* Are we beyond EOF? */ 679 /* Are we beyond EOF? */
681 if (etype == -1) { 680 if (etype == -1) {
682 int ret; 681 int ret;
683 682 isBeyondEOF = 1;
684 if (count) { 683 if (count) {
685 if (c) 684 if (c)
686 laarr[0] = laarr[1]; 685 laarr[0] = laarr[1];
@@ -718,11 +717,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
718 memset(&laarr[c].extLocation, 0x00, 717 memset(&laarr[c].extLocation, 0x00,
719 sizeof(struct kernel_lb_addr)); 718 sizeof(struct kernel_lb_addr));
720 count++; 719 count++;
721 endnum++;
722 } 720 }
723 endnum = c + 1; 721 endnum = c + 1;
724 lastblock = 1; 722 lastblock = 1;
725 } else { 723 } else {
724 isBeyondEOF = 0;
726 endnum = startnum = ((count > 2) ? 2 : count); 725 endnum = startnum = ((count > 2) ? 2 : count);
727 726
728 /* if the current extent is in position 0, 727 /* if the current extent is in position 0,
@@ -765,10 +764,13 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
765 goal, err); 764 goal, err);
766 if (!newblocknum) { 765 if (!newblocknum) {
767 brelse(prev_epos.bh); 766 brelse(prev_epos.bh);
767 brelse(cur_epos.bh);
768 brelse(next_epos.bh);
768 *err = -ENOSPC; 769 *err = -ENOSPC;
769 return 0; 770 return 0;
770 } 771 }
771 iinfo->i_lenExtents += inode->i_sb->s_blocksize; 772 if (isBeyondEOF)
773 iinfo->i_lenExtents += inode->i_sb->s_blocksize;
772 } 774 }
773 775
 774 /* if the extent the requested block is located in contains multiple 776 /* if the extent the requested block is located in contains multiple
@@ -795,6 +797,8 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
795 udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); 797 udf_update_extents(inode, laarr, startnum, endnum, &prev_epos);
796 798
797 brelse(prev_epos.bh); 799 brelse(prev_epos.bh);
800 brelse(cur_epos.bh);
801 brelse(next_epos.bh);
798 802
799 newblock = udf_get_pblock(inode->i_sb, newblocknum, 803 newblock = udf_get_pblock(inode->i_sb, newblocknum,
800 iinfo->i_location.partitionReferenceNum, 0); 804 iinfo->i_location.partitionReferenceNum, 0);
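
Two separate bugs are fixed in this file: cur_epos.bh and next_epos.bh leaked on both the -ENOSPC path and the normal return, and i_lenExtents grew even when the mapped block fell inside the existing extents (the new isBeyondEOF flag gates that). The release discipline, reduced to a standalone sketch where release() models the kernel's NULL-tolerant brelse():

struct bh { int refcount; };

static void release(struct bh *b)       /* brelse(): no-op on NULL */
{
        if (b)
                b->refcount--;
}

/* Every exit path must drop all three pinned extent positions. */
static int getblk_sketch(struct bh *prev, struct bh *cur, struct bh *next,
                         int alloc_failed)
{
        if (alloc_failed) {
                release(prev);
                release(cur);
                release(next);
                return -1;              /* -ENOSPC in the real function */
        }
        /* ... map or allocate the block ... */
        release(prev);
        release(cur);
        release(next);
        return 0;
}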
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 6100ec0fa1d4..5a7ffe54f5d5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -2,6 +2,7 @@ config XFS_FS
2 tristate "XFS filesystem support" 2 tristate "XFS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 select EXPORTFS 4 select EXPORTFS
5 select LIBCRC32C
5 help 6 help
6 XFS is a high performance journaling filesystem which originated 7 XFS is a high performance journaling filesystem which originated
7 on the SGI IRIX platform. It is completely multi-threaded, can 8 on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974b1a2f..d02201df855b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \
37 xfs_file.o \ 37 xfs_file.o \
38 xfs_filestream.o \ 38 xfs_filestream.o \
39 xfs_fsops.o \ 39 xfs_fsops.o \
40 xfs_fs_subr.o \
41 xfs_globals.o \ 40 xfs_globals.o \
42 xfs_iget.o \ 41 xfs_icache.o \
43 xfs_ioctl.o \ 42 xfs_ioctl.o \
44 xfs_iomap.o \ 43 xfs_iomap.o \
45 xfs_iops.o \ 44 xfs_iops.o \
@@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \
47 xfs_message.o \ 46 xfs_message.o \
48 xfs_mru_cache.o \ 47 xfs_mru_cache.o \
49 xfs_super.o \ 48 xfs_super.o \
50 xfs_sync.o \
51 xfs_xattr.o \ 49 xfs_xattr.o \
52 xfs_rename.o \ 50 xfs_rename.o \
53 xfs_utils.o \ 51 xfs_utils.o \
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..104db0f3bed6 100644
--- a/fs/xfs/uuid.h
+++ b/fs/xfs/uuid.h
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid);
26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); 26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); 27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
28 28
29static inline void
30uuid_copy(uuid_t *dst, uuid_t *src)
31{
32 memcpy(dst, src, sizeof(uuid_t));
33}
34
29#endif /* __XFS_SUPPORT_UUID_H__ */ 35#endif /* __XFS_SUPPORT_UUID_H__ */
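
A hypothetical caller, only to show the intended use of the new inline (variable names invented):

        uuid_t fs_uuid, snap_uuid;

        uuid_copy(&snap_uuid, &fs_uuid);        /* byte-wise uuid_t copy */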
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 44d65c1533c0..f2aeedb6a579 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -108,6 +108,8 @@ typedef struct xfs_agf {
108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, 108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); 109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
110 110
111extern const struct xfs_buf_ops xfs_agf_buf_ops;
112
111/* 113/*
112 * Size of the unlinked inode hash table in the agi. 114 * Size of the unlinked inode hash table in the agi.
113 */ 115 */
@@ -161,6 +163,8 @@ typedef struct xfs_agi {
161extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, 163extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
162 xfs_agnumber_t agno, struct xfs_buf **bpp); 164 xfs_agnumber_t agno, struct xfs_buf **bpp);
163 165
166extern const struct xfs_buf_ops xfs_agi_buf_ops;
167
164/* 168/*
165 * The third a.g. block contains the a.g. freelist, an array 169 * The third a.g. block contains the a.g. freelist, an array
166 * of block pointers to blocks owned by the allocation btree code. 170 * of block pointers to blocks owned by the allocation btree code.
@@ -233,6 +237,7 @@ typedef struct xfs_perag {
233#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup 237#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
234 in xfs_inode_ag_iterator */ 238 in xfs_inode_ag_iterator */
235#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ 239#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
240#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
236 241
237#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) 242#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
238#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ 243#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 4f33c32affe3..393055fe3aef 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -430,6 +430,60 @@ xfs_alloc_fixup_trees(
430 return 0; 430 return 0;
431} 431}
432 432
433static void
434xfs_agfl_verify(
435 struct xfs_buf *bp)
436{
437#ifdef WHEN_CRCS_COME_ALONG
438 /*
439 * we cannot actually do any verification of the AGFL because mkfs does
440 * not initialise the AGFL to zero or NULL. Hence the only valid part of
441 * the AGFL is what the AGF says is active. We can't get to the AGF, so
442 * we can't verify just those entries are valid.
443 *
444 * This problem goes away when the CRC format change comes along as that
445 * requires the AGFL to be initialised by mkfs. At that point, we can
446 * verify the blocks in the agfl -active or not- lie within the bounds
447 * of the AG. Until then, just leave this check ifdef'd out.
448 */
449 struct xfs_mount *mp = bp->b_target->bt_mount;
450 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
451 int agfl_ok = 1;
452
453 int i;
454
455 for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
456 if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
457 be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
458 agfl_ok = 0;
459 }
460
461 if (!agfl_ok) {
462 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
463 xfs_buf_ioerror(bp, EFSCORRUPTED);
464 }
465#endif
466}
467
468static void
469xfs_agfl_write_verify(
470 struct xfs_buf *bp)
471{
472 xfs_agfl_verify(bp);
473}
474
475static void
476xfs_agfl_read_verify(
477 struct xfs_buf *bp)
478{
479 xfs_agfl_verify(bp);
480}
481
482const struct xfs_buf_ops xfs_agfl_buf_ops = {
483 .verify_read = xfs_agfl_read_verify,
484 .verify_write = xfs_agfl_write_verify,
485};
486
433/* 487/*
434 * Read in the allocation group free block array. 488 * Read in the allocation group free block array.
435 */ 489 */
@@ -447,7 +501,7 @@ xfs_alloc_read_agfl(
447 error = xfs_trans_read_buf( 501 error = xfs_trans_read_buf(
448 mp, tp, mp->m_ddev_targp, 502 mp, tp, mp->m_ddev_targp,
449 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), 503 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
450 XFS_FSS_TO_BB(mp, 1), 0, &bp); 504 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
451 if (error) 505 if (error)
452 return error; 506 return error;
453 ASSERT(!xfs_buf_geterror(bp)); 507 ASSERT(!xfs_buf_geterror(bp));
@@ -1866,6 +1920,7 @@ xfs_alloc_fix_freelist(
1866 /* 1920 /*
1867 * Initialize the args structure. 1921 * Initialize the args structure.
1868 */ 1922 */
1923 memset(&targs, 0, sizeof(targs));
1869 targs.tp = tp; 1924 targs.tp = tp;
1870 targs.mp = mp; 1925 targs.mp = mp;
1871 targs.agbp = agbp; 1926 targs.agbp = agbp;
@@ -2090,6 +2145,63 @@ xfs_alloc_put_freelist(
2090 return 0; 2145 return 0;
2091} 2146}
2092 2147
2148static void
2149xfs_agf_verify(
2150 struct xfs_buf *bp)
2151 {
2152 struct xfs_mount *mp = bp->b_target->bt_mount;
2153 struct xfs_agf *agf;
2154 int agf_ok;
2155
2156 agf = XFS_BUF_TO_AGF(bp);
2157
2158 agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
2159 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2160 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2161 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2162 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2163 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
2164
2165 /*
2166 * during growfs operations, the perag is not fully initialised,
2167 * so we can't use it for any useful checking. growfs ensures we can't
2168 * use it by using uncached buffers that don't have the perag attached
2169 * so we can detect and avoid this problem.
2170 */
2171 if (bp->b_pag)
2172 agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
2173 bp->b_pag->pag_agno;
2174
2175 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2176 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2177 be32_to_cpu(agf->agf_length);
2178
2179 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2180 XFS_RANDOM_ALLOC_READ_AGF))) {
2181 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
2182 xfs_buf_ioerror(bp, EFSCORRUPTED);
2183 }
2184}
2185
2186static void
2187xfs_agf_read_verify(
2188 struct xfs_buf *bp)
2189{
2190 xfs_agf_verify(bp);
2191}
2192
2193static void
2194xfs_agf_write_verify(
2195 struct xfs_buf *bp)
2196{
2197 xfs_agf_verify(bp);
2198}
2199
2200const struct xfs_buf_ops xfs_agf_buf_ops = {
2201 .verify_read = xfs_agf_read_verify,
2202 .verify_write = xfs_agf_write_verify,
2203};
2204
2093/* 2205/*
2094 * Read in the allocation group header (free/alloc section). 2206 * Read in the allocation group header (free/alloc section).
2095 */ 2207 */
@@ -2101,44 +2213,19 @@ xfs_read_agf(
2101 int flags, /* XFS_BUF_ */ 2213 int flags, /* XFS_BUF_ */
2102 struct xfs_buf **bpp) /* buffer for the ag freelist header */ 2214 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2103{ 2215{
2104 struct xfs_agf *agf; /* ag freelist header */
2105 int agf_ok; /* set if agf is consistent */
2106 int error; 2216 int error;
2107 2217
2108 ASSERT(agno != NULLAGNUMBER); 2218 ASSERT(agno != NULLAGNUMBER);
2109 error = xfs_trans_read_buf( 2219 error = xfs_trans_read_buf(
2110 mp, tp, mp->m_ddev_targp, 2220 mp, tp, mp->m_ddev_targp,
2111 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 2221 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2112 XFS_FSS_TO_BB(mp, 1), flags, bpp); 2222 XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
2113 if (error) 2223 if (error)
2114 return error; 2224 return error;
2115 if (!*bpp) 2225 if (!*bpp)
2116 return 0; 2226 return 0;
2117 2227
2118 ASSERT(!(*bpp)->b_error); 2228 ASSERT(!(*bpp)->b_error);
2119 agf = XFS_BUF_TO_AGF(*bpp);
2120
2121 /*
2122 * Validate the magic number of the agf block.
2123 */
2124 agf_ok =
2125 agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
2126 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2127 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2128 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2129 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2130 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
2131 be32_to_cpu(agf->agf_seqno) == agno;
2132 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2133 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2134 be32_to_cpu(agf->agf_length);
2135 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2136 XFS_RANDOM_ALLOC_READ_AGF))) {
2137 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
2138 XFS_ERRLEVEL_LOW, mp, agf);
2139 xfs_trans_brelse(tp, *bpp);
2140 return XFS_ERROR(EFSCORRUPTED);
2141 }
2142 xfs_buf_set_ref(*bpp, XFS_AGF_REF); 2229 xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2143 return 0; 2230 return 0;
2144} 2231}
@@ -2207,7 +2294,7 @@ xfs_alloc_read_agf(
2207 * group or loop over the allocation groups to find the result. 2294 * group or loop over the allocation groups to find the result.
2208 */ 2295 */
2209int /* error */ 2296int /* error */
2210__xfs_alloc_vextent( 2297xfs_alloc_vextent(
2211 xfs_alloc_arg_t *args) /* allocation argument structure */ 2298 xfs_alloc_arg_t *args) /* allocation argument structure */
2212{ 2299{
2213 xfs_agblock_t agsize; /* allocation group size */ 2300 xfs_agblock_t agsize; /* allocation group size */
@@ -2417,46 +2504,6 @@ error0:
2417 return error; 2504 return error;
2418} 2505}
2419 2506
2420static void
2421xfs_alloc_vextent_worker(
2422 struct work_struct *work)
2423{
2424 struct xfs_alloc_arg *args = container_of(work,
2425 struct xfs_alloc_arg, work);
2426 unsigned long pflags;
2427
2428 /* we are in a transaction context here */
2429 current_set_flags_nested(&pflags, PF_FSTRANS);
2430
2431 args->result = __xfs_alloc_vextent(args);
2432 complete(args->done);
2433
2434 current_restore_flags_nested(&pflags, PF_FSTRANS);
2435}
2436
2437/*
2438 * Data allocation requests often come in with little stack to work on. Push
2439 * them off to a worker thread so there is lots of stack to use. Metadata
2440 * requests, OTOH, are generally from low stack usage paths, so avoid the
2441 * context switch overhead here.
2442 */
2443int
2444xfs_alloc_vextent(
2445 struct xfs_alloc_arg *args)
2446{
2447 DECLARE_COMPLETION_ONSTACK(done);
2448
2449 if (!args->userdata)
2450 return __xfs_alloc_vextent(args);
2451
2452
2453 args->done = &done;
2454 INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker);
2455 queue_work(xfs_alloc_wq, &args->work);
2456 wait_for_completion(&done);
2457 return args->result;
2458}
2459
2460/* 2507/*
2461 * Free an extent. 2508 * Free an extent.
2462 * Just break up the extent address and hand off to xfs_free_ag_extent 2509 * Just break up the extent address and hand off to xfs_free_ag_extent
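
Both the AGFL and AGF changes follow one shape that recurs through the rest of the series: a shared structural check, thin read and write wrappers, and a const ops table that xfs_trans_read_buf() now takes as its final argument so the check runs at I/O completion instead of in each caller (xfs_read_agf loses its inline validation accordingly). Boiled down with stand-in types:

struct buf;

struct buf_ops {
        void (*verify_read)(struct buf *bp);    /* after the read completes */
        void (*verify_write)(struct buf *bp);   /* before the write is issued */
};

static void hdr_verify(struct buf *bp)        { /* structural checks here */ }
static void hdr_read_verify(struct buf *bp)   { hdr_verify(bp); }
static void hdr_write_verify(struct buf *bp)  { hdr_verify(bp); }

const struct buf_ops hdr_buf_ops = {
        .verify_read    = hdr_read_verify,
        .verify_write   = hdr_write_verify,
};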
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 93be4a667ca1..99d0a6101558 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -120,9 +120,6 @@ typedef struct xfs_alloc_arg {
120 char isfl; /* set if is freelist blocks - !acctg */ 120 char isfl; /* set if is freelist blocks - !acctg */
121 char userdata; /* set if this is user data */ 121 char userdata; /* set if this is user data */
122 xfs_fsblock_t firstblock; /* io first block allocated */ 122 xfs_fsblock_t firstblock; /* io first block allocated */
123 struct completion *done;
124 struct work_struct work;
125 int result;
126} xfs_alloc_arg_t; 123} xfs_alloc_arg_t;
127 124
128/* 125/*
@@ -234,4 +231,7 @@ xfs_alloc_get_rec(
234 xfs_extlen_t *len, /* output: length of extent */ 231 xfs_extlen_t *len, /* output: length of extent */
235 int *stat); /* output: success/failure */ 232 int *stat); /* output: success/failure */
236 233
234extern const struct xfs_buf_ops xfs_agf_buf_ops;
235extern const struct xfs_buf_ops xfs_agfl_buf_ops;
236
237#endif /* __XFS_ALLOC_H__ */ 237#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f1647caace8f..b1ddef6b2689 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -121,6 +121,8 @@ xfs_allocbt_free_block(
121 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, 121 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
122 XFS_EXTENT_BUSY_SKIP_DISCARD); 122 XFS_EXTENT_BUSY_SKIP_DISCARD);
123 xfs_trans_agbtree_delta(cur->bc_tp, -1); 123 xfs_trans_agbtree_delta(cur->bc_tp, -1);
124
125 xfs_trans_binval(cur->bc_tp, bp);
124 return 0; 126 return 0;
125} 127}
126 128
@@ -270,6 +272,82 @@ xfs_allocbt_key_diff(
270 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; 272 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
271} 273}
272 274
275static void
276xfs_allocbt_verify(
277 struct xfs_buf *bp)
278{
279 struct xfs_mount *mp = bp->b_target->bt_mount;
280 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
281 struct xfs_perag *pag = bp->b_pag;
282 unsigned int level;
283 int sblock_ok; /* block passes checks */
284
285 /*
286 * magic number and level verification
287 *
288 * During growfs operations, we can't verify the exact level as the
289 * perag is not fully initialised and hence not attached to the buffer.
290 * In this case, check against the maximum tree depth.
291 */
292 level = be16_to_cpu(block->bb_level);
293 switch (block->bb_magic) {
294 case cpu_to_be32(XFS_ABTB_MAGIC):
295 if (pag)
296 sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
297 else
298 sblock_ok = level < mp->m_ag_maxlevels;
299 break;
300 case cpu_to_be32(XFS_ABTC_MAGIC):
301 if (pag)
302 sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
303 else
304 sblock_ok = level < mp->m_ag_maxlevels;
305 break;
306 default:
307 sblock_ok = 0;
308 break;
309 }
310
311 /* numrecs verification */
312 sblock_ok = sblock_ok &&
313 be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
314
315 /* sibling pointer verification */
316 sblock_ok = sblock_ok &&
317 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
318 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
319 block->bb_u.s.bb_leftsib &&
320 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
321 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
322 block->bb_u.s.bb_rightsib;
323
324 if (!sblock_ok) {
325 trace_xfs_btree_corrupt(bp, _RET_IP_);
326 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
327 xfs_buf_ioerror(bp, EFSCORRUPTED);
328 }
329}
330
331static void
332xfs_allocbt_read_verify(
333 struct xfs_buf *bp)
334{
335 xfs_allocbt_verify(bp);
336}
337
338static void
339xfs_allocbt_write_verify(
340 struct xfs_buf *bp)
341{
342 xfs_allocbt_verify(bp);
343}
344
345const struct xfs_buf_ops xfs_allocbt_buf_ops = {
346 .verify_read = xfs_allocbt_read_verify,
347 .verify_write = xfs_allocbt_write_verify,
348};
349
350
273#ifdef DEBUG 351#ifdef DEBUG
274STATIC int 352STATIC int
275xfs_allocbt_keys_inorder( 353xfs_allocbt_keys_inorder(
@@ -325,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
325 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, 403 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
326 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, 404 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
327 .key_diff = xfs_allocbt_key_diff, 405 .key_diff = xfs_allocbt_key_diff,
406 .buf_ops = &xfs_allocbt_buf_ops,
328#ifdef DEBUG 407#ifdef DEBUG
329 .keys_inorder = xfs_allocbt_keys_inorder, 408 .keys_inorder = xfs_allocbt_keys_inorder,
330 .recs_inorder = xfs_allocbt_recs_inorder, 409 .recs_inorder = xfs_allocbt_recs_inorder,
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 359fb86ed876..7e89a2b429dd 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
93 xfs_agnumber_t, xfs_btnum_t); 93 xfs_agnumber_t, xfs_btnum_t);
94extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); 94extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
95 95
96extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
97
96#endif /* __XFS_ALLOC_BTREE_H__ */ 98#endif /* __XFS_ALLOC_BTREE_H__ */
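
The .buf_ops member added to xfs_allocbt_ops shows how these verifiers reach generic code: the btree cursor carries the ops table, so every buffer read through the btree layer inherits the right checks. As a sketch with stand-in types:

struct buf_ops;                         /* as in the earlier sketch */

struct btree_ops_sketch {
        /* comparison/init callbacks elided */
        const struct buf_ops *buf_ops;  /* verifier for this btree's blocks */
};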
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e562dd43f41f..4111a40ebe1a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc(
124 ioend->io_append_trans = tp; 124 ioend->io_append_trans = tp;
125 125
126 /* 126 /*
127 * We will pass freeze protection with a transaction. So tell lockdep 127 * We may pass freeze protection with a transaction. So tell lockdep
128 * we released it. 128 * we released it.
129 */ 129 */
130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
@@ -149,11 +149,13 @@ xfs_setfilesize(
149 xfs_fsize_t isize; 149 xfs_fsize_t isize;
150 150
151 /* 151 /*
152 * The transaction was allocated in the I/O submission thread, 152 * The transaction may have been allocated in the I/O submission thread,
153 * thus we need to mark ourselves as beeing in a transaction 153 * thus we need to mark ourselves as beeing in a transaction manually.
154 * manually. 154 * Similarly for freeze protection.
155 */ 155 */
156 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); 156 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
157 rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
158 0, 1, _THIS_IP_);
157 159
158 xfs_ilock(ip, XFS_ILOCK_EXCL); 160 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); 161 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
187 189
188 if (ioend->io_type == XFS_IO_UNWRITTEN) 190 if (ioend->io_type == XFS_IO_UNWRITTEN)
189 queue_work(mp->m_unwritten_workqueue, &ioend->io_work); 191 queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
190 else if (ioend->io_append_trans) 192 else if (ioend->io_append_trans ||
193 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
191 queue_work(mp->m_data_workqueue, &ioend->io_work); 194 queue_work(mp->m_data_workqueue, &ioend->io_work);
192 else 195 else
193 xfs_destroy_ioend(ioend); 196 xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
205 struct xfs_inode *ip = XFS_I(ioend->io_inode); 208 struct xfs_inode *ip = XFS_I(ioend->io_inode);
206 int error = 0; 209 int error = 0;
207 210
208 if (ioend->io_append_trans) {
209 /*
210 * We've got freeze protection passed with the transaction.
211 * Tell lockdep about it.
212 */
213 rwsem_acquire_read(
214 &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
215 0, 1, _THIS_IP_);
216 }
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 211 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
218 ioend->io_error = -EIO; 212 ioend->io_error = -EIO;
219 goto done; 213 goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
 226 * range to normal written extents after the data I/O has finished. 220 * range to normal written extents after the data I/O has finished.
227 */ 221 */
228 if (ioend->io_type == XFS_IO_UNWRITTEN) { 222 if (ioend->io_type == XFS_IO_UNWRITTEN) {
223 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
224 ioend->io_size);
225 } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
229 /* 226 /*
230 * For buffered I/O we never preallocate a transaction when 227 * For direct I/O we do not know if we need to allocate blocks
231 * doing the unwritten extent conversion, but for direct I/O 228 * or not so we can't preallocate an append transaction as that
232 * we do not know if we are converting an unwritten extent 229 * results in nested reservations and log space deadlocks. Hence
233 * or not at the point where we preallocate the transaction. 230 * allocate the transaction here. While this is sub-optimal and
231 * can block IO completion for some time, we're stuck with doing
232 * it this way until we can pass the ioend to the direct IO
233 * allocation callbacks and avoid nesting that way.
234 */ 234 */
235 if (ioend->io_append_trans) { 235 error = xfs_setfilesize_trans_alloc(ioend);
236 ASSERT(ioend->io_isdirect); 236 if (error)
237
238 current_set_flags_nested(
239 &ioend->io_append_trans->t_pflags, PF_FSTRANS);
240 xfs_trans_cancel(ioend->io_append_trans, 0);
241 }
242
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
244 ioend->io_size);
245 if (error) {
246 ioend->io_error = -error;
247 goto done; 237 goto done;
248 } 238 error = xfs_setfilesize(ioend);
249 } else if (ioend->io_append_trans) { 239 } else if (ioend->io_append_trans) {
250 error = xfs_setfilesize(ioend); 240 error = xfs_setfilesize(ioend);
251 if (error)
252 ioend->io_error = -error;
253 } else { 241 } else {
254 ASSERT(!xfs_ioend_is_append(ioend)); 242 ASSERT(!xfs_ioend_is_append(ioend));
255 } 243 }
256 244
257done: 245done:
246 if (error)
247 ioend->io_error = -error;
258 xfs_destroy_ioend(ioend); 248 xfs_destroy_ioend(ioend);
259} 249}
260 250
@@ -481,11 +471,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
481 * 471 *
482 * The fix is two passes across the ioend list - one to start writeback on the 472 * The fix is two passes across the ioend list - one to start writeback on the
483 * buffer_heads, and then submit them for I/O on the second pass. 473 * buffer_heads, and then submit them for I/O on the second pass.
474 *
475 * If @fail is non-zero, it means that we have a situation where some part of
 476 * the submission process has failed after we have marked pages for writeback
477 * and unlocked them. In this situation, we need to fail the ioend chain rather
478 * than submit it to IO. This typically only happens on a filesystem shutdown.
484 */ 479 */
485STATIC void 480STATIC void
486xfs_submit_ioend( 481xfs_submit_ioend(
487 struct writeback_control *wbc, 482 struct writeback_control *wbc,
488 xfs_ioend_t *ioend) 483 xfs_ioend_t *ioend,
484 int fail)
489{ 485{
490 xfs_ioend_t *head = ioend; 486 xfs_ioend_t *head = ioend;
491 xfs_ioend_t *next; 487 xfs_ioend_t *next;
@@ -506,6 +502,18 @@ xfs_submit_ioend(
506 next = ioend->io_list; 502 next = ioend->io_list;
507 bio = NULL; 503 bio = NULL;
508 504
505 /*
506 * If we are failing the IO now, just mark the ioend with an
507 * error and finish it. This will run IO completion immediately
508 * as there is only one reference to the ioend at this point in
509 * time.
510 */
511 if (fail) {
512 ioend->io_error = -fail;
513 xfs_finish_ioend(ioend);
514 continue;
515 }
516
509 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 517 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
510 518
511 if (!bio) { 519 if (!bio) {
@@ -1060,7 +1068,18 @@ xfs_vm_writepage(
1060 1068
1061 xfs_start_page_writeback(page, 1, count); 1069 xfs_start_page_writeback(page, 1, count);
1062 1070
1063 if (ioend && imap_valid) { 1071 /* if there is no IO to be submitted for this page, we are done */
1072 if (!ioend)
1073 return 0;
1074
1075 ASSERT(iohead);
1076
1077 /*
 1078 * Any errors from this point onwards need to be reported through the IO
1079 * completion path as we have marked the initial page as under writeback
1080 * and unlocked it.
1081 */
1082 if (imap_valid) {
1064 xfs_off_t end_index; 1083 xfs_off_t end_index;
1065 1084
1066 end_index = imap.br_startoff + imap.br_blockcount; 1085 end_index = imap.br_startoff + imap.br_blockcount;
@@ -1079,20 +1098,15 @@ xfs_vm_writepage(
1079 wbc, end_index); 1098 wbc, end_index);
1080 } 1099 }
1081 1100
1082 if (iohead) {
1083 /*
1084 * Reserve log space if we might write beyond the on-disk
1085 * inode size.
1086 */
1087 if (ioend->io_type != XFS_IO_UNWRITTEN &&
1088 xfs_ioend_is_append(ioend)) {
1089 err = xfs_setfilesize_trans_alloc(ioend);
1090 if (err)
1091 goto error;
1092 }
1093 1101
1094 xfs_submit_ioend(wbc, iohead); 1102 /*
1095 } 1103 * Reserve log space if we might write beyond the on-disk inode size.
1104 */
1105 err = 0;
1106 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
1107 err = xfs_setfilesize_trans_alloc(ioend);
1108
1109 xfs_submit_ioend(wbc, iohead, err);
1096 1110
1097 return 0; 1111 return 0;
1098 1112
@@ -1408,25 +1422,21 @@ xfs_vm_direct_IO(
1408 size_t size = iov_length(iov, nr_segs); 1422 size_t size = iov_length(iov, nr_segs);
1409 1423
1410 /* 1424 /*
1411 * We need to preallocate a transaction for a size update 1425 * We cannot preallocate a size update transaction here as we
1412 * here. In the case that this write both updates the size 1426 * don't know whether allocation is necessary or not. Hence we
1413 * and converts at least on unwritten extent we will cancel 1427 * can only tell IO completion that one is necessary if we are
1414 * the still clean transaction after the I/O has finished. 1428 * not doing unwritten extent conversion.
1415 */ 1429 */
1416 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); 1430 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
1417 if (offset + size > XFS_I(inode)->i_d.di_size) { 1431 if (offset + size > XFS_I(inode)->i_d.di_size)
1418 ret = xfs_setfilesize_trans_alloc(ioend);
1419 if (ret)
1420 goto out_destroy_ioend;
1421 ioend->io_isdirect = 1; 1432 ioend->io_isdirect = 1;
1422 }
1423 1433
1424 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1434 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1425 offset, nr_segs, 1435 offset, nr_segs,
1426 xfs_get_blocks_direct, 1436 xfs_get_blocks_direct,
1427 xfs_end_io_direct_write, NULL, 0); 1437 xfs_end_io_direct_write, NULL, 0);
1428 if (ret != -EIOCBQUEUED && iocb->private) 1438 if (ret != -EIOCBQUEUED && iocb->private)
1429 goto out_trans_cancel; 1439 goto out_destroy_ioend;
1430 } else { 1440 } else {
1431 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1441 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1432 offset, nr_segs, 1442 offset, nr_segs,
@@ -1436,15 +1446,6 @@ xfs_vm_direct_IO(
1436 1446
1437 return ret; 1447 return ret;
1438 1448
1439out_trans_cancel:
1440 if (ioend->io_append_trans) {
1441 current_set_flags_nested(&ioend->io_append_trans->t_pflags,
1442 PF_FSTRANS);
1443 rwsem_acquire_read(
1444 &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
1445 0, 1, _THIS_IP_);
1446 xfs_trans_cancel(ioend->io_append_trans, 0);
1447 }
1448out_destroy_ioend: 1449out_destroy_ioend:
1449 xfs_destroy_ioend(ioend); 1450 xfs_destroy_ioend(ioend);
1450 return ret; 1451 return ret;
@@ -1617,7 +1618,7 @@ xfs_vm_bmap(
1617 1618
1618 trace_xfs_vm_bmap(XFS_I(inode)); 1619 trace_xfs_vm_bmap(XFS_I(inode));
1619 xfs_ilock(ip, XFS_IOLOCK_SHARED); 1620 xfs_ilock(ip, XFS_IOLOCK_SHARED);
1620 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); 1621 filemap_write_and_wait(mapping);
1621 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 1622 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1622 return generic_block_bmap(mapping, block, xfs_get_blocks); 1623 return generic_block_bmap(mapping, block, xfs_get_blocks);
1623} 1624}
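
After this rework, completion decides how the on-disk size update happens: unwritten conversion comes first; an appending direct write allocates its transaction here, in process context, so the nested log reservations that the old submission-time preallocation caused are gone; buffered appends keep their preallocated transaction. Condensed from the hunks, with the error funneled through one exit:

        if (ioend->io_type == XFS_IO_UNWRITTEN) {
                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
                                                  ioend->io_size);
        } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
                error = xfs_setfilesize_trans_alloc(ioend); /* alloc now */
                if (!error)
                        error = xfs_setfilesize(ioend);
        } else if (ioend->io_append_trans) {
                error = xfs_setfilesize(ioend); /* trans from submission */
        }
        if (error)
                ioend->io_error = -error;
        xfs_destroy_ioend(ioend);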
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 0ca1f0be62d2..aaf472532b3c 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
903 */ 903 */
904 dp = args->dp; 904 dp = args->dp;
905 args->blkno = 0; 905 args->blkno = 0;
906 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 906 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
907 XFS_ATTR_FORK);
908 if (error) 907 if (error)
909 return(error); 908 return error;
910 ASSERT(bp != NULL);
911 909
912 /* 910 /*
913 * Look up the given attribute in the leaf block. Figure out if 911 * Look up the given attribute in the leaf block. Figure out if
@@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1031 * Read in the block containing the "old" attr, then 1029 * Read in the block containing the "old" attr, then
1032 * remove the "old" attr from that block (neat, huh!) 1030 * remove the "old" attr from that block (neat, huh!)
1033 */ 1031 */
1034 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, 1032 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
1035 &bp, XFS_ATTR_FORK); 1033 -1, &bp);
1036 if (error) 1034 if (error)
1037 return(error); 1035 return error;
1038 ASSERT(bp != NULL); 1036
1039 (void)xfs_attr_leaf_remove(bp, args); 1037 xfs_attr_leaf_remove(bp, args);
1040 1038
1041 /* 1039 /*
1042 * If the result is small enough, shrink it all into the inode. 1040 * If the result is small enough, shrink it all into the inode.
@@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1100 */ 1098 */
1101 dp = args->dp; 1099 dp = args->dp;
1102 args->blkno = 0; 1100 args->blkno = 0;
1103 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 1101 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
1104 XFS_ATTR_FORK); 1102 if (error)
1105 if (error) { 1103 return error;
1106 return(error);
1107 }
1108 1104
1109 ASSERT(bp != NULL);
1110 error = xfs_attr_leaf_lookup_int(bp, args); 1105 error = xfs_attr_leaf_lookup_int(bp, args);
1111 if (error == ENOATTR) { 1106 if (error == ENOATTR) {
1112 xfs_trans_brelse(args->trans, bp); 1107 xfs_trans_brelse(args->trans, bp);
1113 return(error); 1108 return(error);
1114 } 1109 }
1115 1110
1116 (void)xfs_attr_leaf_remove(bp, args); 1111 xfs_attr_leaf_remove(bp, args);
1117 1112
1118 /* 1113 /*
1119 * If the result is small enough, shrink it all into the inode. 1114 * If the result is small enough, shrink it all into the inode.
@@ -1155,12 +1150,12 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
1155 struct xfs_buf *bp; 1150 struct xfs_buf *bp;
1156 int error; 1151 int error;
1157 1152
1153 trace_xfs_attr_leaf_get(args);
1154
1158 args->blkno = 0; 1155 args->blkno = 0;
1159 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 1156 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
1160 XFS_ATTR_FORK);
1161 if (error) 1157 if (error)
1162 return(error); 1158 return error;
1163 ASSERT(bp != NULL);
1164 1159
1165 error = xfs_attr_leaf_lookup_int(bp, args); 1160 error = xfs_attr_leaf_lookup_int(bp, args);
1166 if (error != EEXIST) { 1161 if (error != EEXIST) {
@@ -1181,22 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
1181STATIC int 1176STATIC int
1182xfs_attr_leaf_list(xfs_attr_list_context_t *context) 1177xfs_attr_leaf_list(xfs_attr_list_context_t *context)
1183{ 1178{
1184 xfs_attr_leafblock_t *leaf;
1185 int error; 1179 int error;
1186 struct xfs_buf *bp; 1180 struct xfs_buf *bp;
1187 1181
1182 trace_xfs_attr_leaf_list(context);
1183
1188 context->cursor->blkno = 0; 1184 context->cursor->blkno = 0;
1189 error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); 1185 error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
1190 if (error) 1186 if (error)
1191 return XFS_ERROR(error); 1187 return XFS_ERROR(error);
1192 ASSERT(bp != NULL);
1193 leaf = bp->b_addr;
1194 if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
1195 XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
1196 context->dp->i_mount, leaf);
1197 xfs_trans_brelse(NULL, bp);
1198 return XFS_ERROR(EFSCORRUPTED);
1199 }
1200 1188
1201 error = xfs_attr_leaf_list_int(bp, context); 1189 error = xfs_attr_leaf_list_int(bp, context);
1202 xfs_trans_brelse(NULL, bp); 1190 xfs_trans_brelse(NULL, bp);
@@ -1600,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1600 ASSERT(state->path.blk[0].bp); 1588 ASSERT(state->path.blk[0].bp);
1601 state->path.blk[0].bp = NULL; 1589 state->path.blk[0].bp = NULL;
1602 1590
1603 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, 1591 error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
1604 XFS_ATTR_FORK);
1605 if (error) 1592 if (error)
1606 goto out; 1593 goto out;
1607 ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) ==
1608 cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1609 1594
1610 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1595 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1611 xfs_bmap_init(args->flist, args->firstblock); 1596 xfs_bmap_init(args->flist, args->firstblock);
@@ -1653,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state)
1653 xfs_da_state_blk_t *blk; 1638 xfs_da_state_blk_t *blk;
1654 int level; 1639 int level;
1655 1640
1641 trace_xfs_attr_fillstate(state->args);
1642
1656 /* 1643 /*
1657 * Roll down the "path" in the state structure, storing the on-disk 1644 * Roll down the "path" in the state structure, storing the on-disk
1658 * block number for those buffers in the "path". 1645 * block number for those buffers in the "path".
@@ -1699,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1699 xfs_da_state_blk_t *blk; 1686 xfs_da_state_blk_t *blk;
1700 int level, error; 1687 int level, error;
1701 1688
1689 trace_xfs_attr_refillstate(state->args);
1690
1702 /* 1691 /*
1703 * Roll down the "path" in the state structure, storing the on-disk 1692 * Roll down the "path" in the state structure, storing the on-disk
1704 * block number for those buffers in the "path". 1693 * block number for those buffers in the "path".
@@ -1707,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1707 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); 1696 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1708 for (blk = path->blk, level = 0; level < path->active; blk++, level++) { 1697 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1709 if (blk->disk_blkno) { 1698 if (blk->disk_blkno) {
1710 error = xfs_da_read_buf(state->args->trans, 1699 error = xfs_da_node_read(state->args->trans,
1711 state->args->dp, 1700 state->args->dp,
1712 blk->blkno, blk->disk_blkno, 1701 blk->blkno, blk->disk_blkno,
1713 &blk->bp, XFS_ATTR_FORK); 1702 &blk->bp, XFS_ATTR_FORK);
@@ -1726,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1726 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); 1715 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1727 for (blk = path->blk, level = 0; level < path->active; blk++, level++) { 1716 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1728 if (blk->disk_blkno) { 1717 if (blk->disk_blkno) {
1729 error = xfs_da_read_buf(state->args->trans, 1718 error = xfs_da_node_read(state->args->trans,
1730 state->args->dp, 1719 state->args->dp,
1731 blk->blkno, blk->disk_blkno, 1720 blk->blkno, blk->disk_blkno,
1732 &blk->bp, XFS_ATTR_FORK); 1721 &blk->bp, XFS_ATTR_FORK);
@@ -1755,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args)
1755 int error, retval; 1744 int error, retval;
1756 int i; 1745 int i;
1757 1746
1747 trace_xfs_attr_node_get(args);
1748
1758 state = xfs_da_state_alloc(); 1749 state = xfs_da_state_alloc();
1759 state->args = args; 1750 state->args = args;
1760 state->mp = args->dp->i_mount; 1751 state->mp = args->dp->i_mount;
@@ -1804,6 +1795,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1804 int error, i; 1795 int error, i;
1805 struct xfs_buf *bp; 1796 struct xfs_buf *bp;
1806 1797
1798 trace_xfs_attr_node_list(context);
1799
1807 cursor = context->cursor; 1800 cursor = context->cursor;
1808 cursor->initted = 1; 1801 cursor->initted = 1;
1809 1802
@@ -1814,7 +1807,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1814 */ 1807 */
1815 bp = NULL; 1808 bp = NULL;
1816 if (cursor->blkno > 0) { 1809 if (cursor->blkno > 0) {
1817 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, 1810 error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
1818 &bp, XFS_ATTR_FORK); 1811 &bp, XFS_ATTR_FORK);
1819 if ((error != 0) && (error != EFSCORRUPTED)) 1812 if ((error != 0) && (error != EFSCORRUPTED))
1820 return(error); 1813 return(error);
@@ -1856,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1856 if (bp == NULL) { 1849 if (bp == NULL) {
1857 cursor->blkno = 0; 1850 cursor->blkno = 0;
1858 for (;;) { 1851 for (;;) {
1859 error = xfs_da_read_buf(NULL, context->dp, 1852 error = xfs_da_node_read(NULL, context->dp,
1860 cursor->blkno, -1, &bp, 1853 cursor->blkno, -1, &bp,
1861 XFS_ATTR_FORK); 1854 XFS_ATTR_FORK);
1862 if (error) 1855 if (error)
1863 return(error); 1856 return(error);
1864 if (unlikely(bp == NULL)) {
1865 XFS_ERROR_REPORT("xfs_attr_node_list(2)",
1866 XFS_ERRLEVEL_LOW,
1867 context->dp->i_mount);
1868 return(XFS_ERROR(EFSCORRUPTED));
1869 }
1870 node = bp->b_addr; 1857 node = bp->b_addr;
1871 if (node->hdr.info.magic == 1858 if (node->hdr.info.magic ==
1872 cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) 1859 cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
@@ -1907,14 +1894,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1907 */ 1894 */
1908 for (;;) { 1895 for (;;) {
1909 leaf = bp->b_addr; 1896 leaf = bp->b_addr;
1910 if (unlikely(leaf->hdr.info.magic !=
1911 cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
1912 XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
1913 XFS_ERRLEVEL_LOW,
1914 context->dp->i_mount, leaf);
1915 xfs_trans_brelse(NULL, bp);
1916 return(XFS_ERROR(EFSCORRUPTED));
1917 }
1918 error = xfs_attr_leaf_list_int(bp, context); 1897 error = xfs_attr_leaf_list_int(bp, context);
1919 if (error) { 1898 if (error) {
1920 xfs_trans_brelse(NULL, bp); 1899 xfs_trans_brelse(NULL, bp);
@@ -1924,16 +1903,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1924 break; 1903 break;
1925 cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); 1904 cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
1926 xfs_trans_brelse(NULL, bp); 1905 xfs_trans_brelse(NULL, bp);
1927 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, 1906 error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
1928 &bp, XFS_ATTR_FORK); 1907 &bp);
1929 if (error) 1908 if (error)
1930 return(error); 1909 return error;
1931 if (unlikely((bp == NULL))) {
1932 XFS_ERROR_REPORT("xfs_attr_node_list(5)",
1933 XFS_ERRLEVEL_LOW,
1934 context->dp->i_mount);
1935 return(XFS_ERROR(EFSCORRUPTED));
1936 }
1937 } 1910 }
1938 xfs_trans_brelse(NULL, bp); 1911 xfs_trans_brelse(NULL, bp);
1939 return(0); 1912 return(0);
@@ -1959,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1959 int nmap, error, tmp, valuelen, blkcnt, i; 1932 int nmap, error, tmp, valuelen, blkcnt, i;
1960 xfs_dablk_t lblkno; 1933 xfs_dablk_t lblkno;
1961 1934
1935 trace_xfs_attr_rmtval_get(args);
1936
1962 ASSERT(!(args->flags & ATTR_KERNOVAL)); 1937 ASSERT(!(args->flags & ATTR_KERNOVAL));
1963 1938
1964 mp = args->dp->i_mount; 1939 mp = args->dp->i_mount;
@@ -1980,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1980 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); 1955 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
1981 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 1956 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
1982 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 1957 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1983 dblkno, blkcnt, 0, &bp); 1958 dblkno, blkcnt, 0, &bp, NULL);
1984 if (error) 1959 if (error)
1985 return(error); 1960 return(error);
1986 1961
@@ -2014,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2014 xfs_dablk_t lblkno; 1989 xfs_dablk_t lblkno;
2015 int blkcnt, valuelen, nmap, error, tmp, committed; 1990 int blkcnt, valuelen, nmap, error, tmp, committed;
2016 1991
1992 trace_xfs_attr_rmtval_set(args);
1993
2017 dp = args->dp; 1994 dp = args->dp;
2018 mp = dp->i_mount; 1995 mp = dp->i_mount;
2019 src = args->value; 1996 src = args->value;
@@ -2143,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2143 xfs_dablk_t lblkno; 2120 xfs_dablk_t lblkno;
2144 int valuelen, blkcnt, nmap, error, done, committed; 2121 int valuelen, blkcnt, nmap, error, done, committed;
2145 2122
2123 trace_xfs_attr_rmtval_remove(args);
2124
2146 mp = args->dp->i_mount; 2125 mp = args->dp->i_mount;
2147 2126
2148 /* 2127 /*
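
Every converted call site in this file now takes the same two-line shape; the magic-number ASSERTs disappear because the verifier attached by xfs_attr_leaf_read() performs that check at I/O time:

        error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
                                   -1, &bp);
        if (error)
                return error;
        /* bp is non-NULL and already validated as an attr leaf block */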
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d330111ca738..ee24993c7d12 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
57 struct xfs_buf **bpp); 57 struct xfs_buf **bpp);
58STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, 58STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
59 xfs_da_args_t *args, int freemap_index); 59 xfs_da_args_t *args, int freemap_index);
60STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); 60STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
61 struct xfs_buf *leaf_buffer);
61STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, 62STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
62 xfs_da_state_blk_t *blk1, 63 xfs_da_state_blk_t *blk1,
63 xfs_da_state_blk_t *blk2); 64 xfs_da_state_blk_t *blk2);
@@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
87 xfs_mount_t *mp); 88 xfs_mount_t *mp);
88STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); 89STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
89 90
91static void
92xfs_attr_leaf_verify(
93 struct xfs_buf *bp)
94{
95 struct xfs_mount *mp = bp->b_target->bt_mount;
96 struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
97 int block_ok = 0;
98
99 block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
100 if (!block_ok) {
101 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
102 xfs_buf_ioerror(bp, EFSCORRUPTED);
103 }
104}
105
106static void
107xfs_attr_leaf_read_verify(
108 struct xfs_buf *bp)
109{
110 xfs_attr_leaf_verify(bp);
111}
112
113static void
114xfs_attr_leaf_write_verify(
115 struct xfs_buf *bp)
116{
117 xfs_attr_leaf_verify(bp);
118}
119
120const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
121 .verify_read = xfs_attr_leaf_read_verify,
122 .verify_write = xfs_attr_leaf_write_verify,
123};
124
125int
126xfs_attr_leaf_read(
127 struct xfs_trans *tp,
128 struct xfs_inode *dp,
129 xfs_dablk_t bno,
130 xfs_daddr_t mappedbno,
131 struct xfs_buf **bpp)
132{
133 return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
134 XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
135}
136
90/*======================================================================== 137/*========================================================================
91 * Namespace helper routines 138 * Namespace helper routines
92 *========================================================================*/ 139 *========================================================================*/
@@ -869,17 +916,16 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
869 error = xfs_da_grow_inode(args, &blkno); 916 error = xfs_da_grow_inode(args, &blkno);
870 if (error) 917 if (error)
871 goto out; 918 goto out;
872 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, 919 error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
873 XFS_ATTR_FORK);
874 if (error) 920 if (error)
875 goto out; 921 goto out;
876 ASSERT(bp1 != NULL); 922
877 bp2 = NULL; 923 bp2 = NULL;
878 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, 924 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
879 XFS_ATTR_FORK); 925 XFS_ATTR_FORK);
880 if (error) 926 if (error)
881 goto out; 927 goto out;
882 ASSERT(bp2 != NULL); 928 bp2->b_ops = bp1->b_ops;
883 memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); 929 memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
884 bp1 = NULL; 930 bp1 = NULL;
885 xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); 931 xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
@@ -933,7 +979,7 @@ xfs_attr_leaf_create(
933 XFS_ATTR_FORK); 979 XFS_ATTR_FORK);
934 if (error) 980 if (error)
935 return(error); 981 return(error);
936 ASSERT(bp != NULL); 982 bp->b_ops = &xfs_attr_leaf_buf_ops;
937 leaf = bp->b_addr; 983 leaf = bp->b_addr;
938 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); 984 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
939 hdr = &leaf->hdr; 985 hdr = &leaf->hdr;
@@ -1071,7 +1117,7 @@ xfs_attr_leaf_add(
1071 * Compact the entries to coalesce free space. 1117 * Compact the entries to coalesce free space.
1072 * This may change the hdr->count via dropping INCOMPLETE entries. 1118 * This may change the hdr->count via dropping INCOMPLETE entries.
1073 */ 1119 */
1074 xfs_attr_leaf_compact(args->trans, bp); 1120 xfs_attr_leaf_compact(args, bp);
1075 1121
1076 /* 1122 /*
1077 * After compaction, the block is guaranteed to have only one 1123 * After compaction, the block is guaranteed to have only one
@@ -1102,6 +1148,8 @@ xfs_attr_leaf_add_work(
1102 xfs_mount_t *mp; 1148 xfs_mount_t *mp;
1103 int tmp, i; 1149 int tmp, i;
1104 1150
1151 trace_xfs_attr_leaf_add_work(args);
1152
1105 leaf = bp->b_addr; 1153 leaf = bp->b_addr;
1106 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1154 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1107 hdr = &leaf->hdr; 1155 hdr = &leaf->hdr;
@@ -1214,15 +1262,17 @@ xfs_attr_leaf_add_work(
1214 */ 1262 */
1215STATIC void 1263STATIC void
1216xfs_attr_leaf_compact( 1264xfs_attr_leaf_compact(
1217 struct xfs_trans *trans, 1265 struct xfs_da_args *args,
1218 struct xfs_buf *bp) 1266 struct xfs_buf *bp)
1219{ 1267{
1220 xfs_attr_leafblock_t *leaf_s, *leaf_d; 1268 xfs_attr_leafblock_t *leaf_s, *leaf_d;
1221 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; 1269 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
1222 xfs_mount_t *mp; 1270 struct xfs_trans *trans = args->trans;
1223 char *tmpbuffer; 1271 struct xfs_mount *mp = trans->t_mountp;
1272 char *tmpbuffer;
1273
1274 trace_xfs_attr_leaf_compact(args);
1224 1275
1225 mp = trans->t_mountp;
1226 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); 1276 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
1227 ASSERT(tmpbuffer != NULL); 1277 ASSERT(tmpbuffer != NULL);
1228 memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); 1278 memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
@@ -1291,6 +1341,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1291 leaf2 = blk2->bp->b_addr; 1341 leaf2 = blk2->bp->b_addr;
1292 ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1342 ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1293 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1343 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1344 ASSERT(leaf2->hdr.count == 0);
1294 args = state->args; 1345 args = state->args;
1295 1346
1296 trace_xfs_attr_leaf_rebalance(args); 1347 trace_xfs_attr_leaf_rebalance(args);
@@ -1344,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1344 max = be16_to_cpu(hdr2->firstused) 1395 max = be16_to_cpu(hdr2->firstused)
1345 - sizeof(xfs_attr_leaf_hdr_t); 1396 - sizeof(xfs_attr_leaf_hdr_t);
1346 max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); 1397 max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
1347 if (space > max) { 1398 if (space > max)
1348 xfs_attr_leaf_compact(args->trans, blk2->bp); 1399 xfs_attr_leaf_compact(args, blk2->bp);
1349 }
1350 1400
1351 /* 1401 /*
1352 * Move high entries from leaf1 to low end of leaf2. 1402 * Move high entries from leaf1 to low end of leaf2.
@@ -1361,6 +1411,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1361 * I assert that since all callers pass in an empty 1411 * I assert that since all callers pass in an empty
1362 * second buffer, this code should never execute. 1412 * second buffer, this code should never execute.
1363 */ 1413 */
1414 ASSERT(0);
1364 1415
1365 /* 1416 /*
1366 * Figure the total bytes to be added to the destination leaf. 1417 * Figure the total bytes to be added to the destination leaf.
@@ -1376,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1376 max = be16_to_cpu(hdr1->firstused) 1427 max = be16_to_cpu(hdr1->firstused)
1377 - sizeof(xfs_attr_leaf_hdr_t); 1428 - sizeof(xfs_attr_leaf_hdr_t);
1378 max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); 1429 max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
1379 if (space > max) { 1430 if (space > max)
1380 xfs_attr_leaf_compact(args->trans, blk1->bp); 1431 xfs_attr_leaf_compact(args, blk1->bp);
1381 }
1382 1432
1383 /* 1433 /*
1384 * Move low entries from leaf2 to high end of leaf1. 1434 * Move low entries from leaf2 to high end of leaf1.
@@ -1422,10 +1472,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1422 args->index2 = 0; 1472 args->index2 = 0;
1423 args->blkno2 = blk2->blkno; 1473 args->blkno2 = blk2->blkno;
1424 } else { 1474 } else {
1475 /*
1476 * On a double leaf split, the original attr location
1477 * is already stored in blkno2/index2, so don't
 1478 * overwrite it, otherwise we corrupt the tree.
1479 */
1425 blk2->index = blk1->index 1480 blk2->index = blk1->index
1426 - be16_to_cpu(leaf1->hdr.count); 1481 - be16_to_cpu(leaf1->hdr.count);
1427 args->index = args->index2 = blk2->index; 1482 args->index = blk2->index;
1428 args->blkno = args->blkno2 = blk2->blkno; 1483 args->blkno = blk2->blkno;
1484 if (!state->extravalid) {
1485 /*
1486 * set the new attr location to match the old
1487 * one and let the higher level split code
1488 * decide where in the leaf to place it.
1489 */
1490 args->index2 = blk2->index;
1491 args->blkno2 = blk2->blkno;
1492 }
1429 } 1493 }
1430 } else { 1494 } else {
1431 ASSERT(state->inleaf == 1); 1495 ASSERT(state->inleaf == 1);
@@ -1561,6 +1625,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
1561 xfs_dablk_t blkno; 1625 xfs_dablk_t blkno;
1562 struct xfs_buf *bp; 1626 struct xfs_buf *bp;
1563 1627
1628 trace_xfs_attr_leaf_toosmall(state->args);
1629
1564 /* 1630 /*
1565 * Check for the degenerate case of the block being over 50% full. 1631 * Check for the degenerate case of the block being over 50% full.
1566 * If so, it's not worth even looking to see if we might be able 1632 * If so, it's not worth even looking to see if we might be able
@@ -1620,18 +1686,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
1620 blkno = be32_to_cpu(info->back); 1686 blkno = be32_to_cpu(info->back);
1621 if (blkno == 0) 1687 if (blkno == 0)
1622 continue; 1688 continue;
1623 error = xfs_da_read_buf(state->args->trans, state->args->dp, 1689 error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
1624 blkno, -1, &bp, XFS_ATTR_FORK); 1690 blkno, -1, &bp);
1625 if (error) 1691 if (error)
1626 return(error); 1692 return(error);
1627 ASSERT(bp != NULL);
1628 1693
1629 leaf = (xfs_attr_leafblock_t *)info; 1694 leaf = (xfs_attr_leafblock_t *)info;
1630 count = be16_to_cpu(leaf->hdr.count); 1695 count = be16_to_cpu(leaf->hdr.count);
1631 bytes = state->blocksize - (state->blocksize>>2); 1696 bytes = state->blocksize - (state->blocksize>>2);
1632 bytes -= be16_to_cpu(leaf->hdr.usedbytes); 1697 bytes -= be16_to_cpu(leaf->hdr.usedbytes);
1633 leaf = bp->b_addr; 1698 leaf = bp->b_addr;
1634 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1635 count += be16_to_cpu(leaf->hdr.count); 1699 count += be16_to_cpu(leaf->hdr.count);
1636 bytes -= be16_to_cpu(leaf->hdr.usedbytes); 1700 bytes -= be16_to_cpu(leaf->hdr.usedbytes);
1637 bytes -= count * sizeof(xfs_attr_leaf_entry_t); 1701 bytes -= count * sizeof(xfs_attr_leaf_entry_t);
@@ -1686,6 +1750,8 @@ xfs_attr_leaf_remove(
1686 int tablesize, tmp, i; 1750 int tablesize, tmp, i;
1687 xfs_mount_t *mp; 1751 xfs_mount_t *mp;
1688 1752
1753 trace_xfs_attr_leaf_remove(args);
1754
1689 leaf = bp->b_addr; 1755 leaf = bp->b_addr;
1690 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1756 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1691 hdr = &leaf->hdr; 1757 hdr = &leaf->hdr;
@@ -2495,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2495 /* 2561 /*
2496 * Set up the operation. 2562 * Set up the operation.
2497 */ 2563 */
2498 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 2564 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2499 XFS_ATTR_FORK); 2565 if (error)
2500 if (error) {
2501 return(error); 2566 return(error);
2502 }
2503 ASSERT(bp != NULL);
2504 2567
2505 leaf = bp->b_addr; 2568 leaf = bp->b_addr;
2506 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2507 ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); 2569 ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
2508 ASSERT(args->index >= 0); 2570 ASSERT(args->index >= 0);
2509 entry = &leaf->entries[ args->index ]; 2571 entry = &leaf->entries[ args->index ];
@@ -2560,15 +2622,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
2560 /* 2622 /*
2561 * Set up the operation. 2623 * Set up the operation.
2562 */ 2624 */
2563 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 2625 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2564 XFS_ATTR_FORK); 2626 if (error)
2565 if (error) {
2566 return(error); 2627 return(error);
2567 }
2568 ASSERT(bp != NULL);
2569 2628
2570 leaf = bp->b_addr; 2629 leaf = bp->b_addr;
2571 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2572 ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); 2630 ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
2573 ASSERT(args->index >= 0); 2631 ASSERT(args->index >= 0);
2574 entry = &leaf->entries[ args->index ]; 2632 entry = &leaf->entries[ args->index ];
@@ -2617,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2617 /* 2675 /*
2618 * Read the block containing the "old" attr 2676 * Read the block containing the "old" attr
2619 */ 2677 */
2620 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, 2678 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
2621 XFS_ATTR_FORK); 2679 if (error)
2622 if (error) { 2680 return error;
2623 return(error);
2624 }
2625 ASSERT(bp1 != NULL);
2626 2681
2627 /* 2682 /*
2628 * Read the block containing the "new" attr, if it is different 2683 * Read the block containing the "new" attr, if it is different
2629 */ 2684 */
2630 if (args->blkno2 != args->blkno) { 2685 if (args->blkno2 != args->blkno) {
2631 error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, 2686 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
2632 -1, &bp2, XFS_ATTR_FORK); 2687 -1, &bp2);
2633 if (error) { 2688 if (error)
2634 return(error); 2689 return error;
2635 }
2636 ASSERT(bp2 != NULL);
2637 } else { 2690 } else {
2638 bp2 = bp1; 2691 bp2 = bp1;
2639 } 2692 }
2640 2693
2641 leaf1 = bp1->b_addr; 2694 leaf1 = bp1->b_addr;
2642 ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2643 ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); 2695 ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
2644 ASSERT(args->index >= 0); 2696 ASSERT(args->index >= 0);
2645 entry1 = &leaf1->entries[ args->index ]; 2697 entry1 = &leaf1->entries[ args->index ];
2646 2698
2647 leaf2 = bp2->b_addr; 2699 leaf2 = bp2->b_addr;
2648 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2649 ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); 2700 ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
2650 ASSERT(args->index2 >= 0); 2701 ASSERT(args->index2 >= 0);
2651 entry2 = &leaf2->entries[ args->index2 ]; 2702 entry2 = &leaf2->entries[ args->index2 ];
@@ -2730,7 +2781,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
2730 * the extents in reverse order the extent containing 2781 * the extents in reverse order the extent containing
2731 * block 0 must still be there. 2782 * block 0 must still be there.
2732 */ 2783 */
2733 error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); 2784 error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
2734 if (error) 2785 if (error)
2735 return(error); 2786 return(error);
2736 blkno = XFS_BUF_ADDR(bp); 2787 blkno = XFS_BUF_ADDR(bp);
@@ -2815,7 +2866,7 @@ xfs_attr_node_inactive(
2815 * traversal of the tree so we may deal with many blocks 2866 * traversal of the tree so we may deal with many blocks
2816 * before we come back to this one. 2867 * before we come back to this one.
2817 */ 2868 */
2818 error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, 2869 error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
2819 XFS_ATTR_FORK); 2870 XFS_ATTR_FORK);
2820 if (error) 2871 if (error)
2821 return(error); 2872 return(error);
@@ -2856,8 +2907,8 @@ xfs_attr_node_inactive(
2856 * child block number. 2907 * child block number.
2857 */ 2908 */
2858 if ((i+1) < count) { 2909 if ((i+1) < count) {
2859 error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, 2910 error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
2860 &bp, XFS_ATTR_FORK); 2911 &bp, XFS_ATTR_FORK);
2861 if (error) 2912 if (error)
2862 return(error); 2913 return(error);
2863 child_fsb = be32_to_cpu(node->btree[i+1].before); 2914 child_fsb = be32_to_cpu(node->btree[i+1].before);
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index dea17722945e..77de139a58f0 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -261,4 +261,10 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
261 struct xfs_buf *leaf2_bp); 261 struct xfs_buf *leaf2_bp);
262int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, 262int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
263 int *local); 263 int *local);
264int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
265 xfs_dablk_t bno, xfs_daddr_t mappedbno,
266 struct xfs_buf **bpp);
267
268extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
269
264#endif /* __XFS_ATTR_LEAF_H__ */ 270#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 848ffa77707b..0e92d12765d2 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2437,6 +2437,7 @@ xfs_bmap_btalloc(
2437 * Normal allocation, done through xfs_alloc_vextent. 2437 * Normal allocation, done through xfs_alloc_vextent.
2438 */ 2438 */
2439 tryagain = isaligned = 0; 2439 tryagain = isaligned = 0;
2440 memset(&args, 0, sizeof(args));
2440 args.tp = ap->tp; 2441 args.tp = ap->tp;
2441 args.mp = mp; 2442 args.mp = mp;
2442 args.fsbno = ap->blkno; 2443 args.fsbno = ap->blkno;
@@ -2661,8 +2662,9 @@ xfs_bmap_btree_to_extents(
2661 if ((error = xfs_btree_check_lptr(cur, cbno, 1))) 2662 if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
2662 return error; 2663 return error;
2663#endif 2664#endif
2664 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, 2665 error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
2665 XFS_BMAP_BTREE_REF))) 2666 &xfs_bmbt_buf_ops);
2667 if (error)
2666 return error; 2668 return error;
2667 cblock = XFS_BUF_TO_BLOCK(cbp); 2669 cblock = XFS_BUF_TO_BLOCK(cbp);
2668 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) 2670 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
@@ -3082,6 +3084,7 @@ xfs_bmap_extents_to_btree(
3082 * Convert to a btree with two levels, one record in root. 3084 * Convert to a btree with two levels, one record in root.
3083 */ 3085 */
3084 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); 3086 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
3087 memset(&args, 0, sizeof(args));
3085 args.tp = tp; 3088 args.tp = tp;
3086 args.mp = mp; 3089 args.mp = mp;
3087 args.firstblock = *firstblock; 3090 args.firstblock = *firstblock;
@@ -3121,6 +3124,7 @@ xfs_bmap_extents_to_btree(
3121 /* 3124 /*
3122 * Fill in the child block. 3125 * Fill in the child block.
3123 */ 3126 */
3127 abp->b_ops = &xfs_bmbt_buf_ops;
3124 ablock = XFS_BUF_TO_BLOCK(abp); 3128 ablock = XFS_BUF_TO_BLOCK(abp);
3125 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); 3129 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
3126 ablock->bb_level = 0; 3130 ablock->bb_level = 0;
@@ -3237,6 +3241,7 @@ xfs_bmap_local_to_extents(
3237 xfs_buf_t *bp; /* buffer for extent block */ 3241 xfs_buf_t *bp; /* buffer for extent block */
3238 xfs_bmbt_rec_host_t *ep;/* extent record pointer */ 3242 xfs_bmbt_rec_host_t *ep;/* extent record pointer */
3239 3243
3244 memset(&args, 0, sizeof(args));
3240 args.tp = tp; 3245 args.tp = tp;
3241 args.mp = ip->i_mount; 3246 args.mp = ip->i_mount;
3242 args.firstblock = *firstblock; 3247 args.firstblock = *firstblock;
@@ -3266,6 +3271,7 @@ xfs_bmap_local_to_extents(
3266 ASSERT(args.len == 1); 3271 ASSERT(args.len == 1);
3267 *firstblock = args.fsbno; 3272 *firstblock = args.fsbno;
3268 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); 3273 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
3274 bp->b_ops = &xfs_bmbt_buf_ops;
3269 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); 3275 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
3270 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); 3276 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
3271 xfs_bmap_forkoff_reset(args.mp, ip, whichfork); 3277 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
@@ -4075,8 +4081,9 @@ xfs_bmap_read_extents(
4075 * pointer (leftmost) at each level. 4081 * pointer (leftmost) at each level.
4076 */ 4082 */
4077 while (level-- > 0) { 4083 while (level-- > 0) {
4078 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4084 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4079 XFS_BMAP_BTREE_REF))) 4085 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
4086 if (error)
4080 return error; 4087 return error;
4081 block = XFS_BUF_TO_BLOCK(bp); 4088 block = XFS_BUF_TO_BLOCK(bp);
4082 XFS_WANT_CORRUPTED_GOTO( 4089 XFS_WANT_CORRUPTED_GOTO(
@@ -4121,7 +4128,8 @@ xfs_bmap_read_extents(
4121 */ 4128 */
4122 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 4129 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
4123 if (nextbno != NULLFSBLOCK) 4130 if (nextbno != NULLFSBLOCK)
4124 xfs_btree_reada_bufl(mp, nextbno, 1); 4131 xfs_btree_reada_bufl(mp, nextbno, 1,
4132 &xfs_bmbt_buf_ops);
4125 /* 4133 /*
4126 * Copy records into the extent records. 4134 * Copy records into the extent records.
4127 */ 4135 */
@@ -4153,8 +4161,9 @@ xfs_bmap_read_extents(
4153 */ 4161 */
4154 if (bno == NULLFSBLOCK) 4162 if (bno == NULLFSBLOCK)
4155 break; 4163 break;
4156 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4164 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4157 XFS_BMAP_BTREE_REF))) 4165 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
4166 if (error)
4158 return error; 4167 return error;
4159 block = XFS_BUF_TO_BLOCK(bp); 4168 block = XFS_BUF_TO_BLOCK(bp);
4160 } 4169 }
@@ -4616,12 +4625,11 @@ xfs_bmapi_delay(
4616 4625
4617 4626
4618STATIC int 4627STATIC int
4619xfs_bmapi_allocate( 4628__xfs_bmapi_allocate(
4620 struct xfs_bmalloca *bma, 4629 struct xfs_bmalloca *bma)
4621 int flags)
4622{ 4630{
4623 struct xfs_mount *mp = bma->ip->i_mount; 4631 struct xfs_mount *mp = bma->ip->i_mount;
4624 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 4632 int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
4625 XFS_ATTR_FORK : XFS_DATA_FORK; 4633 XFS_ATTR_FORK : XFS_DATA_FORK;
4626 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); 4634 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4627 int tmp_logflags = 0; 4635 int tmp_logflags = 0;
@@ -4654,24 +4662,27 @@ xfs_bmapi_allocate(
4654 * Indicate if this is the first user data in the file, or just any 4662 * Indicate if this is the first user data in the file, or just any
4655 * user data. 4663 * user data.
4656 */ 4664 */
4657 if (!(flags & XFS_BMAPI_METADATA)) { 4665 if (!(bma->flags & XFS_BMAPI_METADATA)) {
4658 bma->userdata = (bma->offset == 0) ? 4666 bma->userdata = (bma->offset == 0) ?
4659 XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; 4667 XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
4660 } 4668 }
4661 4669
4662 bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; 4670 bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
4663 4671
4664 /* 4672 /*
4665 * Only want to do the alignment at the eof if it is userdata and 4673 * Only want to do the alignment at the eof if it is userdata and
4666 * allocation length is larger than a stripe unit. 4674 * allocation length is larger than a stripe unit.
4667 */ 4675 */
4668 if (mp->m_dalign && bma->length >= mp->m_dalign && 4676 if (mp->m_dalign && bma->length >= mp->m_dalign &&
4669 !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { 4677 !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
4670 error = xfs_bmap_isaeof(bma, whichfork); 4678 error = xfs_bmap_isaeof(bma, whichfork);
4671 if (error) 4679 if (error)
4672 return error; 4680 return error;
4673 } 4681 }
4674 4682
4683 if (bma->flags & XFS_BMAPI_STACK_SWITCH)
4684 bma->stack_switch = 1;
4685
4675 error = xfs_bmap_alloc(bma); 4686 error = xfs_bmap_alloc(bma);
4676 if (error) 4687 if (error)
4677 return error; 4688 return error;
@@ -4706,7 +4717,7 @@ xfs_bmapi_allocate(
4706 * A wasdelay extent has been initialized, so shouldn't be flagged 4717 * A wasdelay extent has been initialized, so shouldn't be flagged
4707 * as unwritten. 4718 * as unwritten.
4708 */ 4719 */
4709 if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && 4720 if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
4710 xfs_sb_version_hasextflgbit(&mp->m_sb)) 4721 xfs_sb_version_hasextflgbit(&mp->m_sb))
4711 bma->got.br_state = XFS_EXT_UNWRITTEN; 4722 bma->got.br_state = XFS_EXT_UNWRITTEN;
4712 4723
@@ -4734,6 +4745,45 @@ xfs_bmapi_allocate(
4734 return 0; 4745 return 0;
4735} 4746}
4736 4747
4748static void
4749xfs_bmapi_allocate_worker(
4750 struct work_struct *work)
4751{
4752 struct xfs_bmalloca *args = container_of(work,
4753 struct xfs_bmalloca, work);
4754 unsigned long pflags;
4755
4756 /* we are in a transaction context here */
4757 current_set_flags_nested(&pflags, PF_FSTRANS);
4758
4759 args->result = __xfs_bmapi_allocate(args);
4760 complete(args->done);
4761
4762 current_restore_flags_nested(&pflags, PF_FSTRANS);
4763}
4764
4765/*
4766 * Some allocation requests often come in with little stack to work on. Push
4767 * them off to a worker thread so there is lots of stack to use. Otherwise just
4768 * call directly to avoid the context switch overhead here.
4769 */
4770int
4771xfs_bmapi_allocate(
4772 struct xfs_bmalloca *args)
4773{
4774 DECLARE_COMPLETION_ONSTACK(done);
4775
4776 if (!args->stack_switch)
4777 return __xfs_bmapi_allocate(args);
4778
4779
4780 args->done = &done;
4781 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
4782 queue_work(xfs_alloc_wq, &args->work);
4783 wait_for_completion(&done);
4784 return args->result;
4785}
4786
4737STATIC int 4787STATIC int
4738xfs_bmapi_convert_unwritten( 4788xfs_bmapi_convert_unwritten(
4739 struct xfs_bmalloca *bma, 4789 struct xfs_bmalloca *bma,
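/*
 * A minimal user-space sketch of the stack-switch pattern added above,
 * using POSIX threads in place of the kernel workqueue and completion.
 * The names (alloc_args, do_alloc) are illustrative, not kernel API.
 */
#include <pthread.h>
#include <stdio.h>

struct alloc_args {
	int	stack_switch;		/* caller is deep on the stack */
	int	input;
	int	result;
};

/* the "real" allocation work; gets a fresh stack when offloaded */
static int
do_alloc(struct alloc_args *args)
{
	return args->input * 2;		/* stand-in for __xfs_bmapi_allocate() */
}

static void *
alloc_worker(void *arg)
{
	struct alloc_args *args = arg;

	args->result = do_alloc(args);
	return NULL;			/* pthread_join() is the completion */
}

static int
alloc(struct alloc_args *args)
{
	pthread_t tid;

	if (!args->stack_switch)
		return do_alloc(args);	/* avoid context switch overhead */

	pthread_create(&tid, NULL, alloc_worker, args);
	pthread_join(&tid, NULL);	/* wait_for_completion() analogue */
	return args->result;
}

int
main(void)
{
	struct alloc_args args = { .stack_switch = 1, .input = 21 };

	printf("result = %d\n", alloc(&args));
	return 0;
}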
@@ -4919,6 +4969,7 @@ xfs_bmapi_write(
4919 bma.conv = !!(flags & XFS_BMAPI_CONVERT); 4969 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4920 bma.wasdel = wasdelay; 4970 bma.wasdel = wasdelay;
4921 bma.offset = bno; 4971 bma.offset = bno;
4972 bma.flags = flags;
4922 4973
4923 /* 4974 /*
4924 * There's a 32/64 bit type mismatch between the 4975 * There's a 32/64 bit type mismatch between the
@@ -4934,7 +4985,7 @@ xfs_bmapi_write(
4934 4985
4935 ASSERT(len > 0); 4986 ASSERT(len > 0);
4936 ASSERT(bma.length > 0); 4987 ASSERT(bma.length > 0);
4937 error = xfs_bmapi_allocate(&bma, flags); 4988 error = xfs_bmapi_allocate(&bma);
4938 if (error) 4989 if (error)
4939 goto error0; 4990 goto error0;
4940 if (bma.blkno == NULLFSBLOCK) 4991 if (bma.blkno == NULLFSBLOCK)
@@ -5554,7 +5605,7 @@ xfs_getbmap(
5554 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5605 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5555 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 5606 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5556 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { 5607 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5557 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); 5608 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
5558 if (error) 5609 if (error)
5559 goto out_unlock_iolock; 5610 goto out_unlock_iolock;
5560 } 5611 }
@@ -5823,15 +5874,16 @@ xfs_bmap_check_leaf_extents(
5823 */ 5874 */
5824 while (level-- > 0) { 5875 while (level-- > 0) {
5825 /* See if buf is in cur first */ 5876 /* See if buf is in cur first */
5877 bp_release = 0;
5826 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); 5878 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5827 if (bp) { 5879 if (!bp) {
5828 bp_release = 0;
5829 } else {
5830 bp_release = 1; 5880 bp_release = 1;
5881 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5882 XFS_BMAP_BTREE_REF,
5883 &xfs_bmbt_buf_ops);
5884 if (error)
5885 goto error_norelse;
5831 } 5886 }
5832 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5833 XFS_BMAP_BTREE_REF)))
5834 goto error_norelse;
5835 block = XFS_BUF_TO_BLOCK(bp); 5887 block = XFS_BUF_TO_BLOCK(bp);
5836 XFS_WANT_CORRUPTED_GOTO( 5888 XFS_WANT_CORRUPTED_GOTO(
5837 xfs_bmap_sanity_check(mp, bp, level), 5889 xfs_bmap_sanity_check(mp, bp, level),
@@ -5908,15 +5960,16 @@ xfs_bmap_check_leaf_extents(
5908 if (bno == NULLFSBLOCK) 5960 if (bno == NULLFSBLOCK)
5909 break; 5961 break;
5910 5962
5963 bp_release = 0;
5911 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); 5964 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5912 if (bp) { 5965 if (!bp) {
5913 bp_release = 0;
5914 } else {
5915 bp_release = 1; 5966 bp_release = 1;
5967 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5968 XFS_BMAP_BTREE_REF,
5969 &xfs_bmbt_buf_ops);
5970 if (error)
5971 goto error_norelse;
5916 } 5972 }
5917 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5918 XFS_BMAP_BTREE_REF)))
5919 goto error_norelse;
5920 block = XFS_BUF_TO_BLOCK(bp); 5973 block = XFS_BUF_TO_BLOCK(bp);
5921 } 5974 }
5922 if (bp_release) { 5975 if (bp_release) {
@@ -6007,7 +6060,9 @@ xfs_bmap_count_tree(
6007 struct xfs_btree_block *block, *nextblock; 6060 struct xfs_btree_block *block, *nextblock;
6008 int numrecs; 6061 int numrecs;
6009 6062
6010 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) 6063 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
6064 &xfs_bmbt_buf_ops);
6065 if (error)
6011 return error; 6066 return error;
6012 *count += 1; 6067 *count += 1;
6013 block = XFS_BUF_TO_BLOCK(bp); 6068 block = XFS_BUF_TO_BLOCK(bp);
@@ -6016,8 +6071,10 @@ xfs_bmap_count_tree(
6016 /* Not at node above leaves, count this level of nodes */ 6071 /* Not at node above leaves, count this level of nodes */
6017 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 6072 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6018 while (nextbno != NULLFSBLOCK) { 6073 while (nextbno != NULLFSBLOCK) {
6019 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6074 error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
6020 0, &nbp, XFS_BMAP_BTREE_REF))) 6075 XFS_BMAP_BTREE_REF,
6076 &xfs_bmbt_buf_ops);
6077 if (error)
6021 return error; 6078 return error;
6022 *count += 1; 6079 *count += 1;
6023 nextblock = XFS_BUF_TO_BLOCK(nbp); 6080 nextblock = XFS_BUF_TO_BLOCK(nbp);
@@ -6046,8 +6103,10 @@ xfs_bmap_count_tree(
6046 if (nextbno == NULLFSBLOCK) 6103 if (nextbno == NULLFSBLOCK)
6047 break; 6104 break;
6048 bno = nextbno; 6105 bno = nextbno;
6049 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 6106 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
6050 XFS_BMAP_BTREE_REF))) 6107 XFS_BMAP_BTREE_REF,
6108 &xfs_bmbt_buf_ops);
6109 if (error)
6051 return error; 6110 return error;
6052 *count += 1; 6111 *count += 1;
6053 block = XFS_BUF_TO_BLOCK(bp); 6112 block = XFS_BUF_TO_BLOCK(bp);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 803b56d7ce16..5f469c3516eb 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -77,6 +77,7 @@ typedef struct xfs_bmap_free
77 * from written to unwritten, otherwise convert from unwritten to written. 77 * from written to unwritten, otherwise convert from unwritten to written.
78 */ 78 */
79#define XFS_BMAPI_CONVERT 0x040 79#define XFS_BMAPI_CONVERT 0x040
80#define XFS_BMAPI_STACK_SWITCH 0x080
80 81
81#define XFS_BMAPI_FLAGS \ 82#define XFS_BMAPI_FLAGS \
82 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 83 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -85,7 +86,8 @@ typedef struct xfs_bmap_free
85 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 86 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
86 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 87 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
87 { XFS_BMAPI_CONTIG, "CONTIG" }, \ 88 { XFS_BMAPI_CONTIG, "CONTIG" }, \
88 { XFS_BMAPI_CONVERT, "CONVERT" } 89 { XFS_BMAPI_CONVERT, "CONVERT" }, \
90 { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" }
89 91
90 92
91static inline int xfs_bmapi_aflag(int w) 93static inline int xfs_bmapi_aflag(int w)
@@ -133,6 +135,11 @@ typedef struct xfs_bmalloca {
133 char userdata;/* set if is user data */ 135 char userdata;/* set if is user data */
134 char aeof; /* allocated space at eof */ 136 char aeof; /* allocated space at eof */
135 char conv; /* overwriting unwritten extents */ 137 char conv; /* overwriting unwritten extents */
138 char stack_switch;
139 int flags;
140 struct completion *done;
141 struct work_struct work;
142 int result;
136} xfs_bmalloca_t; 143} xfs_bmalloca_t;
137 144
138/* 145/*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 862084a47a7e..061b45cbe614 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -36,6 +36,7 @@
36#include "xfs_bmap.h" 36#include "xfs_bmap.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_quota.h" 38#include "xfs_quota.h"
39#include "xfs_trace.h"
39 40
40/* 41/*
41 * Determine the extent state. 42 * Determine the extent state.
@@ -707,6 +708,67 @@ xfs_bmbt_key_diff(
707 cur->bc_rec.b.br_startoff; 708 cur->bc_rec.b.br_startoff;
708} 709}
709 710
711static void
712xfs_bmbt_verify(
713 struct xfs_buf *bp)
714{
715 struct xfs_mount *mp = bp->b_target->bt_mount;
716 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
717 unsigned int level;
718 int lblock_ok; /* block passes checks */
719
720 /* magic number and level verification.
721 *
 722 * We don't know what fork we belong to, so just verify that the level
723 * is less than the maximum of the two. Later checks will be more
724 * precise.
725 */
726 level = be16_to_cpu(block->bb_level);
727 lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
728 level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
729
730 /* numrecs verification */
731 lblock_ok = lblock_ok &&
732 be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
733
734 /* sibling pointer verification */
735 lblock_ok = lblock_ok &&
736 block->bb_u.l.bb_leftsib &&
737 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
738 XFS_FSB_SANITY_CHECK(mp,
739 be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
740 block->bb_u.l.bb_rightsib &&
741 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
742 XFS_FSB_SANITY_CHECK(mp,
743 be64_to_cpu(block->bb_u.l.bb_rightsib)));
744
745 if (!lblock_ok) {
746 trace_xfs_btree_corrupt(bp, _RET_IP_);
747 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
748 xfs_buf_ioerror(bp, EFSCORRUPTED);
749 }
750}
751
752static void
753xfs_bmbt_read_verify(
754 struct xfs_buf *bp)
755{
756 xfs_bmbt_verify(bp);
757}
758
759static void
760xfs_bmbt_write_verify(
761 struct xfs_buf *bp)
762{
763 xfs_bmbt_verify(bp);
764}
765
766const struct xfs_buf_ops xfs_bmbt_buf_ops = {
767 .verify_read = xfs_bmbt_read_verify,
768 .verify_write = xfs_bmbt_write_verify,
769};
770
771
710#ifdef DEBUG 772#ifdef DEBUG
711STATIC int 773STATIC int
712xfs_bmbt_keys_inorder( 774xfs_bmbt_keys_inorder(
@@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
746 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, 808 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
747 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, 809 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
748 .key_diff = xfs_bmbt_key_diff, 810 .key_diff = xfs_bmbt_key_diff,
811 .buf_ops = &xfs_bmbt_buf_ops,
749#ifdef DEBUG 812#ifdef DEBUG
750 .keys_inorder = xfs_bmbt_keys_inorder, 813 .keys_inorder = xfs_bmbt_keys_inorder,
751 .recs_inorder = xfs_bmbt_recs_inorder, 814 .recs_inorder = xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e66c4ea0f85..88469ca08696 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
236extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, 236extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
237 struct xfs_trans *, struct xfs_inode *, int); 237 struct xfs_trans *, struct xfs_inode *, int);
238 238
239extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
239 240
240#endif /* __XFS_BMAP_BTREE_H__ */ 241#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e53e317b1582..db010408d701 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -266,9 +266,13 @@ xfs_btree_dup_cursor(
266 for (i = 0; i < new->bc_nlevels; i++) { 266 for (i = 0; i < new->bc_nlevels; i++) {
267 new->bc_ptrs[i] = cur->bc_ptrs[i]; 267 new->bc_ptrs[i] = cur->bc_ptrs[i];
268 new->bc_ra[i] = cur->bc_ra[i]; 268 new->bc_ra[i] = cur->bc_ra[i];
269 if ((bp = cur->bc_bufs[i])) { 269 bp = cur->bc_bufs[i];
270 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 270 if (bp) {
271 XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { 271 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
272 XFS_BUF_ADDR(bp), mp->m_bsize,
273 0, &bp,
274 cur->bc_ops->buf_ops);
275 if (error) {
272 xfs_btree_del_cursor(new, error); 276 xfs_btree_del_cursor(new, error);
273 *ncur = NULL; 277 *ncur = NULL;
274 return error; 278 return error;
@@ -609,25 +613,26 @@ xfs_btree_offsets(
609 * Get a buffer for the block, return it read in. 613 * Get a buffer for the block, return it read in.
610 * Long-form addressing. 614 * Long-form addressing.
611 */ 615 */
612int /* error */ 616int
613xfs_btree_read_bufl( 617xfs_btree_read_bufl(
614 xfs_mount_t *mp, /* file system mount point */ 618 struct xfs_mount *mp, /* file system mount point */
615 xfs_trans_t *tp, /* transaction pointer */ 619 struct xfs_trans *tp, /* transaction pointer */
616 xfs_fsblock_t fsbno, /* file system block number */ 620 xfs_fsblock_t fsbno, /* file system block number */
617 uint lock, /* lock flags for read_buf */ 621 uint lock, /* lock flags for read_buf */
618 xfs_buf_t **bpp, /* buffer for fsbno */ 622 struct xfs_buf **bpp, /* buffer for fsbno */
619 int refval) /* ref count value for buffer */ 623 int refval, /* ref count value for buffer */
620{ 624 const struct xfs_buf_ops *ops)
621 xfs_buf_t *bp; /* return value */ 625{
626 struct xfs_buf *bp; /* return value */
622 xfs_daddr_t d; /* real disk block address */ 627 xfs_daddr_t d; /* real disk block address */
623 int error; 628 int error;
624 629
625 ASSERT(fsbno != NULLFSBLOCK); 630 ASSERT(fsbno != NULLFSBLOCK);
626 d = XFS_FSB_TO_DADDR(mp, fsbno); 631 d = XFS_FSB_TO_DADDR(mp, fsbno);
627 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 632 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
628 mp->m_bsize, lock, &bp))) { 633 mp->m_bsize, lock, &bp, ops);
634 if (error)
629 return error; 635 return error;
630 }
631 ASSERT(!xfs_buf_geterror(bp)); 636 ASSERT(!xfs_buf_geterror(bp));
632 if (bp) 637 if (bp)
633 xfs_buf_set_ref(bp, refval); 638 xfs_buf_set_ref(bp, refval);
@@ -642,15 +647,16 @@ xfs_btree_read_bufl(
642/* ARGSUSED */ 647/* ARGSUSED */
643void 648void
644xfs_btree_reada_bufl( 649xfs_btree_reada_bufl(
645 xfs_mount_t *mp, /* file system mount point */ 650 struct xfs_mount *mp, /* file system mount point */
646 xfs_fsblock_t fsbno, /* file system block number */ 651 xfs_fsblock_t fsbno, /* file system block number */
647 xfs_extlen_t count) /* count of filesystem blocks */ 652 xfs_extlen_t count, /* count of filesystem blocks */
653 const struct xfs_buf_ops *ops)
648{ 654{
649 xfs_daddr_t d; 655 xfs_daddr_t d;
650 656
651 ASSERT(fsbno != NULLFSBLOCK); 657 ASSERT(fsbno != NULLFSBLOCK);
652 d = XFS_FSB_TO_DADDR(mp, fsbno); 658 d = XFS_FSB_TO_DADDR(mp, fsbno);
653 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); 659 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
654} 660}
655 661
656/* 662/*
@@ -660,17 +666,18 @@ xfs_btree_reada_bufl(
660/* ARGSUSED */ 666/* ARGSUSED */
661void 667void
662xfs_btree_reada_bufs( 668xfs_btree_reada_bufs(
663 xfs_mount_t *mp, /* file system mount point */ 669 struct xfs_mount *mp, /* file system mount point */
664 xfs_agnumber_t agno, /* allocation group number */ 670 xfs_agnumber_t agno, /* allocation group number */
665 xfs_agblock_t agbno, /* allocation group block number */ 671 xfs_agblock_t agbno, /* allocation group block number */
666 xfs_extlen_t count) /* count of filesystem blocks */ 672 xfs_extlen_t count, /* count of filesystem blocks */
673 const struct xfs_buf_ops *ops)
667{ 674{
668 xfs_daddr_t d; 675 xfs_daddr_t d;
669 676
670 ASSERT(agno != NULLAGNUMBER); 677 ASSERT(agno != NULLAGNUMBER);
671 ASSERT(agbno != NULLAGBLOCK); 678 ASSERT(agbno != NULLAGBLOCK);
672 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 679 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
673 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); 680 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
674} 681}
675 682
676STATIC int 683STATIC int
@@ -684,12 +691,14 @@ xfs_btree_readahead_lblock(
684 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 691 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
685 692
686 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { 693 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
687 xfs_btree_reada_bufl(cur->bc_mp, left, 1); 694 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
695 cur->bc_ops->buf_ops);
688 rval++; 696 rval++;
689 } 697 }
690 698
691 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { 699 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
692 xfs_btree_reada_bufl(cur->bc_mp, right, 1); 700 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
701 cur->bc_ops->buf_ops);
693 rval++; 702 rval++;
694 } 703 }
695 704
@@ -709,13 +718,13 @@ xfs_btree_readahead_sblock(
709 718
710 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { 719 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
711 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 720 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
712 left, 1); 721 left, 1, cur->bc_ops->buf_ops);
713 rval++; 722 rval++;
714 } 723 }
715 724
716 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { 725 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
717 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 726 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
718 right, 1); 727 right, 1, cur->bc_ops->buf_ops);
719 rval++; 728 rval++;
720 } 729 }
721 730
@@ -853,18 +862,22 @@ xfs_btree_set_sibling(
853 } 862 }
854} 863}
855 864
856STATIC void 865void
857xfs_btree_init_block( 866xfs_btree_init_block(
858 struct xfs_btree_cur *cur, 867 struct xfs_mount *mp,
859 int level, 868 struct xfs_buf *bp,
860 int numrecs, 869 __u32 magic,
861 struct xfs_btree_block *new) /* new block */ 870 __u16 level,
871 __u16 numrecs,
872 unsigned int flags)
862{ 873{
863 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); 874 struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp);
875
876 new->bb_magic = cpu_to_be32(magic);
864 new->bb_level = cpu_to_be16(level); 877 new->bb_level = cpu_to_be16(level);
865 new->bb_numrecs = cpu_to_be16(numrecs); 878 new->bb_numrecs = cpu_to_be16(numrecs);
866 879
867 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 880 if (flags & XFS_BTREE_LONG_PTRS) {
868 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); 881 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
869 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); 882 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
870 } else { 883 } else {
@@ -873,6 +886,17 @@ xfs_btree_init_block(
873 } 886 }
874} 887}
875 888
889STATIC void
890xfs_btree_init_block_cur(
891 struct xfs_btree_cur *cur,
892 int level,
893 int numrecs,
894 struct xfs_buf *bp)
895{
896 xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
897 level, numrecs, cur->bc_flags);
898}
899
876/* 900/*
877 * Return true if ptr is the last record in the btree and 901 * Return true if ptr is the last record in the btree and
 878 * we need to track updates to this record. The decision 902 * we need to track updates to this record. The decision
@@ -972,6 +996,7 @@ xfs_btree_get_buf_block(
972 if (!*bpp) 996 if (!*bpp)
973 return ENOMEM; 997 return ENOMEM;
974 998
999 (*bpp)->b_ops = cur->bc_ops->buf_ops;
975 *block = XFS_BUF_TO_BLOCK(*bpp); 1000 *block = XFS_BUF_TO_BLOCK(*bpp);
976 return 0; 1001 return 0;
977} 1002}
@@ -998,19 +1023,15 @@ xfs_btree_read_buf_block(
998 1023
999 d = xfs_btree_ptr_to_daddr(cur, ptr); 1024 d = xfs_btree_ptr_to_daddr(cur, ptr);
1000 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1025 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1001 mp->m_bsize, flags, bpp); 1026 mp->m_bsize, flags, bpp,
1027 cur->bc_ops->buf_ops);
1002 if (error) 1028 if (error)
1003 return error; 1029 return error;
1004 1030
1005 ASSERT(!xfs_buf_geterror(*bpp)); 1031 ASSERT(!xfs_buf_geterror(*bpp));
1006
1007 xfs_btree_set_refs(cur, *bpp); 1032 xfs_btree_set_refs(cur, *bpp);
1008 *block = XFS_BUF_TO_BLOCK(*bpp); 1033 *block = XFS_BUF_TO_BLOCK(*bpp);
1009 1034 return 0;
1010 error = xfs_btree_check_block(cur, *block, level, *bpp);
1011 if (error)
1012 xfs_trans_brelse(cur->bc_tp, *bpp);
1013 return error;
1014} 1035}
1015 1036
1016/* 1037/*
@@ -2183,7 +2204,7 @@ xfs_btree_split(
2183 goto error0; 2204 goto error0;
2184 2205
2185 /* Fill in the btree header for the new right block. */ 2206 /* Fill in the btree header for the new right block. */
2186 xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); 2207 xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
2187 2208
2188 /* 2209 /*
2189 * Split the entries between the old and the new block evenly. 2210 * Split the entries between the old and the new block evenly.
@@ -2492,7 +2513,7 @@ xfs_btree_new_root(
2492 nptr = 2; 2513 nptr = 2;
2493 } 2514 }
2494 /* Fill in the new block's btree header and log it. */ 2515 /* Fill in the new block's btree header and log it. */
2495 xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); 2516 xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
2496 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 2517 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2497 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 2518 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2498 !xfs_btree_ptr_is_null(cur, &rptr)); 2519 !xfs_btree_ptr_is_null(cur, &rptr));
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 5b240de104c0..f932897194eb 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -188,6 +188,8 @@ struct xfs_btree_ops {
188 __int64_t (*key_diff)(struct xfs_btree_cur *cur, 188 __int64_t (*key_diff)(struct xfs_btree_cur *cur,
189 union xfs_btree_key *key); 189 union xfs_btree_key *key);
190 190
191 const struct xfs_buf_ops *buf_ops;
192
191#ifdef DEBUG 193#ifdef DEBUG
192 /* check that k1 is lower than k2 */ 194 /* check that k1 is lower than k2 */
193 int (*keys_inorder)(struct xfs_btree_cur *cur, 195 int (*keys_inorder)(struct xfs_btree_cur *cur,
@@ -355,7 +357,8 @@ xfs_btree_read_bufl(
355 xfs_fsblock_t fsbno, /* file system block number */ 357 xfs_fsblock_t fsbno, /* file system block number */
356 uint lock, /* lock flags for read_buf */ 358 uint lock, /* lock flags for read_buf */
357 struct xfs_buf **bpp, /* buffer for fsbno */ 359 struct xfs_buf **bpp, /* buffer for fsbno */
358 int refval);/* ref count value for buffer */ 360 int refval, /* ref count value for buffer */
361 const struct xfs_buf_ops *ops);
359 362
360/* 363/*
361 * Read-ahead the block, don't wait for it, don't return a buffer. 364 * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -365,7 +368,8 @@ void /* error */
365xfs_btree_reada_bufl( 368xfs_btree_reada_bufl(
366 struct xfs_mount *mp, /* file system mount point */ 369 struct xfs_mount *mp, /* file system mount point */
367 xfs_fsblock_t fsbno, /* file system block number */ 370 xfs_fsblock_t fsbno, /* file system block number */
368 xfs_extlen_t count); /* count of filesystem blocks */ 371 xfs_extlen_t count, /* count of filesystem blocks */
372 const struct xfs_buf_ops *ops);
369 373
370/* 374/*
371 * Read-ahead the block, don't wait for it, don't return a buffer. 375 * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -376,8 +380,20 @@ xfs_btree_reada_bufs(
376 struct xfs_mount *mp, /* file system mount point */ 380 struct xfs_mount *mp, /* file system mount point */
377 xfs_agnumber_t agno, /* allocation group number */ 381 xfs_agnumber_t agno, /* allocation group number */
378 xfs_agblock_t agbno, /* allocation group block number */ 382 xfs_agblock_t agbno, /* allocation group block number */
379 xfs_extlen_t count); /* count of filesystem blocks */ 383 xfs_extlen_t count, /* count of filesystem blocks */
384 const struct xfs_buf_ops *ops);
380 385
386/*
387 * Initialise a new btree block header
388 */
389void
390xfs_btree_init_block(
391 struct xfs_mount *mp,
392 struct xfs_buf *bp,
393 __u32 magic,
394 __u16 level,
395 __u16 numrecs,
396 unsigned int flags);
381 397
382/* 398/*
383 * Common btree core entry points. 399 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 933b7930b863..26673a0b20e7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -569,7 +569,9 @@ found:
569 */ 569 */
570 if (bp->b_flags & XBF_STALE) { 570 if (bp->b_flags & XBF_STALE) {
571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
572 ASSERT(bp->b_iodone == NULL);
572 bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 573 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
574 bp->b_ops = NULL;
573 } 575 }
574 576
575 trace_xfs_buf_find(bp, flags, _RET_IP_); 577 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -654,7 +656,8 @@ xfs_buf_read_map(
654 struct xfs_buftarg *target, 656 struct xfs_buftarg *target,
655 struct xfs_buf_map *map, 657 struct xfs_buf_map *map,
656 int nmaps, 658 int nmaps,
657 xfs_buf_flags_t flags) 659 xfs_buf_flags_t flags,
660 const struct xfs_buf_ops *ops)
658{ 661{
659 struct xfs_buf *bp; 662 struct xfs_buf *bp;
660 663
@@ -666,6 +669,7 @@ xfs_buf_read_map(
666 669
667 if (!XFS_BUF_ISDONE(bp)) { 670 if (!XFS_BUF_ISDONE(bp)) {
668 XFS_STATS_INC(xb_get_read); 671 XFS_STATS_INC(xb_get_read);
672 bp->b_ops = ops;
669 _xfs_buf_read(bp, flags); 673 _xfs_buf_read(bp, flags);
670 } else if (flags & XBF_ASYNC) { 674 } else if (flags & XBF_ASYNC) {
671 /* 675 /*
@@ -691,13 +695,14 @@ void
691xfs_buf_readahead_map( 695xfs_buf_readahead_map(
692 struct xfs_buftarg *target, 696 struct xfs_buftarg *target,
693 struct xfs_buf_map *map, 697 struct xfs_buf_map *map,
694 int nmaps) 698 int nmaps,
699 const struct xfs_buf_ops *ops)
695{ 700{
696 if (bdi_read_congested(target->bt_bdi)) 701 if (bdi_read_congested(target->bt_bdi))
697 return; 702 return;
698 703
699 xfs_buf_read_map(target, map, nmaps, 704 xfs_buf_read_map(target, map, nmaps,
700 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 705 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
701} 706}
702 707
703/* 708/*
@@ -709,10 +714,10 @@ xfs_buf_read_uncached(
709 struct xfs_buftarg *target, 714 struct xfs_buftarg *target,
710 xfs_daddr_t daddr, 715 xfs_daddr_t daddr,
711 size_t numblks, 716 size_t numblks,
712 int flags) 717 int flags,
718 const struct xfs_buf_ops *ops)
713{ 719{
714 xfs_buf_t *bp; 720 struct xfs_buf *bp;
715 int error;
716 721
717 bp = xfs_buf_get_uncached(target, numblks, flags); 722 bp = xfs_buf_get_uncached(target, numblks, flags);
718 if (!bp) 723 if (!bp)
@@ -723,13 +728,10 @@ xfs_buf_read_uncached(
723 bp->b_bn = daddr; 728 bp->b_bn = daddr;
724 bp->b_maps[0].bm_bn = daddr; 729 bp->b_maps[0].bm_bn = daddr;
725 bp->b_flags |= XBF_READ; 730 bp->b_flags |= XBF_READ;
731 bp->b_ops = ops;
726 732
727 xfsbdstrat(target->bt_mount, bp); 733 xfsbdstrat(target->bt_mount, bp);
728 error = xfs_buf_iowait(bp); 734 xfs_buf_iowait(bp);
729 if (error) {
730 xfs_buf_relse(bp);
731 return NULL;
732 }
733 return bp; 735 return bp;
734} 736}
735 737
@@ -999,27 +1001,37 @@ STATIC void
999xfs_buf_iodone_work( 1001xfs_buf_iodone_work(
1000 struct work_struct *work) 1002 struct work_struct *work)
1001{ 1003{
1002 xfs_buf_t *bp = 1004 struct xfs_buf *bp =
1003 container_of(work, xfs_buf_t, b_iodone_work); 1005 container_of(work, xfs_buf_t, b_iodone_work);
1006 bool read = !!(bp->b_flags & XBF_READ);
1007
1008 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1009 if (read && bp->b_ops)
1010 bp->b_ops->verify_read(bp);
1004 1011
1005 if (bp->b_iodone) 1012 if (bp->b_iodone)
1006 (*(bp->b_iodone))(bp); 1013 (*(bp->b_iodone))(bp);
1007 else if (bp->b_flags & XBF_ASYNC) 1014 else if (bp->b_flags & XBF_ASYNC)
1008 xfs_buf_relse(bp); 1015 xfs_buf_relse(bp);
1016 else {
1017 ASSERT(read && bp->b_ops);
1018 complete(&bp->b_iowait);
1019 }
1009} 1020}
1010 1021
1011void 1022void
1012xfs_buf_ioend( 1023xfs_buf_ioend(
1013 xfs_buf_t *bp, 1024 struct xfs_buf *bp,
1014 int schedule) 1025 int schedule)
1015{ 1026{
1027 bool read = !!(bp->b_flags & XBF_READ);
1028
1016 trace_xfs_buf_iodone(bp, _RET_IP_); 1029 trace_xfs_buf_iodone(bp, _RET_IP_);
1017 1030
1018 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1019 if (bp->b_error == 0) 1031 if (bp->b_error == 0)
1020 bp->b_flags |= XBF_DONE; 1032 bp->b_flags |= XBF_DONE;
1021 1033
1022 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 1034 if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
1023 if (schedule) { 1035 if (schedule) {
1024 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 1036 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1025 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1037 queue_work(xfslogd_workqueue, &bp->b_iodone_work);
@@ -1027,6 +1039,7 @@ xfs_buf_ioend(
1027 xfs_buf_iodone_work(&bp->b_iodone_work); 1039 xfs_buf_iodone_work(&bp->b_iodone_work);
1028 } 1040 }
1029 } else { 1041 } else {
1042 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1030 complete(&bp->b_iowait); 1043 complete(&bp->b_iowait);
1031 } 1044 }
1032} 1045}
@@ -1197,9 +1210,14 @@ xfs_buf_bio_end_io(
1197{ 1210{
1198 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1211 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1199 1212
1200 xfs_buf_ioerror(bp, -error); 1213 /*
1214 * don't overwrite existing errors - otherwise we can lose errors on
1215 * buffers that require multiple bios to complete.
1216 */
1217 if (!bp->b_error)
1218 xfs_buf_ioerror(bp, -error);
1201 1219
1202 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1220 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1203 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1221 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1204 1222
1205 _xfs_buf_ioend(bp, 1); 1223 _xfs_buf_ioend(bp, 1);
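/*
 * A tiny sketch of the "don't overwrite existing errors" rule in the hunk
 * above: when one logical I/O is split across several bios, only the first
 * failure is recorded, so later successful bios cannot clear it.
 */
#include <stdio.h>

struct io {
	int error;			/* 0 until the first bio fails */
};

static void
bio_end(struct io *io, int bio_error)
{
	if (!io->error)
		io->error = bio_error;	/* first error wins */
}

int
main(void)
{
	struct io io = { 0 };

	bio_end(&io, 0);		/* bio 1 ok */
	bio_end(&io, -5);		/* bio 2 fails (EIO) */
	bio_end(&io, 0);		/* bio 3 ok: must not clear the error */
	printf("final error: %d\n", io.error);	/* prints -5 */
	return 0;
}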
@@ -1279,6 +1297,11 @@ next_chunk:
1279 if (size) 1297 if (size)
1280 goto next_chunk; 1298 goto next_chunk;
1281 } else { 1299 } else {
1300 /*
1301 * This is guaranteed not to be the last io reference count
1302 * because the caller (xfs_buf_iorequest) holds a count itself.
1303 */
1304 atomic_dec(&bp->b_io_remaining);
1282 xfs_buf_ioerror(bp, EIO); 1305 xfs_buf_ioerror(bp, EIO);
1283 bio_put(bio); 1306 bio_put(bio);
1284 } 1307 }
@@ -1304,6 +1327,20 @@ _xfs_buf_ioapply(
1304 rw |= REQ_FUA; 1327 rw |= REQ_FUA;
1305 if (bp->b_flags & XBF_FLUSH) 1328 if (bp->b_flags & XBF_FLUSH)
1306 rw |= REQ_FLUSH; 1329 rw |= REQ_FLUSH;
1330
1331 /*
1332 * Run the write verifier callback function if it exists. If
1333 * this function fails it will mark the buffer with an error and
1334 * the IO should not be dispatched.
1335 */
1336 if (bp->b_ops) {
1337 bp->b_ops->verify_write(bp);
1338 if (bp->b_error) {
1339 xfs_force_shutdown(bp->b_target->bt_mount,
1340 SHUTDOWN_CORRUPT_INCORE);
1341 return;
1342 }
1343 }
1307 } else if (bp->b_flags & XBF_READ_AHEAD) { 1344 } else if (bp->b_flags & XBF_READ_AHEAD) {
1308 rw = READA; 1345 rw = READA;
1309 } else { 1346 } else {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 7c0b6a0a1557..23f5642480bb 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -100,6 +100,7 @@ typedef struct xfs_buftarg {
100struct xfs_buf; 100struct xfs_buf;
101typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 101typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
102 102
103
103#define XB_PAGES 2 104#define XB_PAGES 2
104 105
105struct xfs_buf_map { 106struct xfs_buf_map {
@@ -110,6 +111,11 @@ struct xfs_buf_map {
110#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ 111#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
111 struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; 112 struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
112 113
114struct xfs_buf_ops {
115 void (*verify_read)(struct xfs_buf *);
116 void (*verify_write)(struct xfs_buf *);
117};
118
113typedef struct xfs_buf { 119typedef struct xfs_buf {
114 /* 120 /*
115 * first cacheline holds all the fields needed for an uncontended cache 121 * first cacheline holds all the fields needed for an uncontended cache
@@ -153,13 +159,13 @@ typedef struct xfs_buf {
153 unsigned int b_page_count; /* size of page array */ 159 unsigned int b_page_count; /* size of page array */
154 unsigned int b_offset; /* page offset in first page */ 160 unsigned int b_offset; /* page offset in first page */
155 unsigned short b_error; /* error code on I/O */ 161 unsigned short b_error; /* error code on I/O */
162 const struct xfs_buf_ops *b_ops;
156 163
157#ifdef XFS_BUF_LOCK_TRACKING 164#ifdef XFS_BUF_LOCK_TRACKING
158 int b_last_holder; 165 int b_last_holder;
159#endif 166#endif
160} xfs_buf_t; 167} xfs_buf_t;
161 168
162
163/* Finding and Reading Buffers */ 169/* Finding and Reading Buffers */
164struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, 170struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
165 struct xfs_buf_map *map, int nmaps, 171 struct xfs_buf_map *map, int nmaps,
@@ -196,9 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
196 xfs_buf_flags_t flags); 202 xfs_buf_flags_t flags);
197struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, 203struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
198 struct xfs_buf_map *map, int nmaps, 204 struct xfs_buf_map *map, int nmaps,
199 xfs_buf_flags_t flags); 205 xfs_buf_flags_t flags,
206 const struct xfs_buf_ops *ops);
200void xfs_buf_readahead_map(struct xfs_buftarg *target, 207void xfs_buf_readahead_map(struct xfs_buftarg *target,
201 struct xfs_buf_map *map, int nmaps); 208 struct xfs_buf_map *map, int nmaps,
209 const struct xfs_buf_ops *ops);
202 210
203static inline struct xfs_buf * 211static inline struct xfs_buf *
204xfs_buf_get( 212xfs_buf_get(
@@ -216,20 +224,22 @@ xfs_buf_read(
216 struct xfs_buftarg *target, 224 struct xfs_buftarg *target,
217 xfs_daddr_t blkno, 225 xfs_daddr_t blkno,
218 size_t numblks, 226 size_t numblks,
219 xfs_buf_flags_t flags) 227 xfs_buf_flags_t flags,
228 const struct xfs_buf_ops *ops)
220{ 229{
221 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 230 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
222 return xfs_buf_read_map(target, &map, 1, flags); 231 return xfs_buf_read_map(target, &map, 1, flags, ops);
223} 232}
224 233
225static inline void 234static inline void
226xfs_buf_readahead( 235xfs_buf_readahead(
227 struct xfs_buftarg *target, 236 struct xfs_buftarg *target,
228 xfs_daddr_t blkno, 237 xfs_daddr_t blkno,
229 size_t numblks) 238 size_t numblks,
239 const struct xfs_buf_ops *ops)
230{ 240{
231 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 241 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
232 return xfs_buf_readahead_map(target, &map, 1); 242 return xfs_buf_readahead_map(target, &map, 1, ops);
233} 243}
234 244
235struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); 245struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
@@ -239,7 +249,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
239struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, 249struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
240 int flags); 250 int flags);
241struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, 251struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
242 xfs_daddr_t daddr, size_t numblks, int flags); 252 xfs_daddr_t daddr, size_t numblks, int flags,
253 const struct xfs_buf_ops *ops);
243void xfs_buf_hold(struct xfs_buf *bp); 254void xfs_buf_hold(struct xfs_buf *bp);
244 255
245/* Releasing Buffers */ 256/* Releasing Buffers */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a8d0ed911196..becf4a97efc6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -526,7 +526,25 @@ xfs_buf_item_unpin(
526 } 526 }
527 xfs_buf_relse(bp); 527 xfs_buf_relse(bp);
528 } else if (freed && remove) { 528 } else if (freed && remove) {
529 /*
530 * There are currently two references to the buffer - the active
531 * LRU reference and the buf log item. What we are about to do
532 * here - simulate a failed IO completion - requires 3
533 * references.
534 *
535 * The LRU reference is removed by the xfs_buf_stale() call. The
536 * buf item reference is removed by the xfs_buf_iodone()
537 * callback that is run by xfs_buf_do_callbacks() during ioend
538 * processing (via the bp->b_iodone callback), and then finally
539 * the ioend processing will drop the IO reference if the buffer
540 * is marked XBF_ASYNC.
541 *
542 * Hence we need to take an additional reference here so that IO
543 * completion processing doesn't free the buffer prematurely.
544 */
529 xfs_buf_lock(bp); 545 xfs_buf_lock(bp);
546 xfs_buf_hold(bp);
547 bp->b_flags |= XBF_ASYNC;
530 xfs_buf_ioerror(bp, EIO); 548 xfs_buf_ioerror(bp, EIO);
531 XFS_BUF_UNDONE(bp); 549 XFS_BUF_UNDONE(bp);
532 xfs_buf_stale(bp); 550 xfs_buf_stale(bp);
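/*
 * A sketch of the reference-count reasoning in the comment above:
 * simulating a failed I/O completion drops three references, so one
 * extra hold is needed first. Counts are plain ints, not kernel krefs.
 */
#include <assert.h>
#include <stdio.h>

struct buf {
	int refs;
};

static void
buf_rele(struct buf *b, const char *who)
{
	assert(b->refs > 0);		/* a premature free would trip this */
	printf("%s drops a ref -> %d\n", who, --b->refs);
}

int
main(void)
{
	struct buf b = { .refs = 2 };	/* LRU ref + buf log item ref */

	b.refs++;			/* xfs_buf_hold() analogue */
	buf_rele(&b, "stale/LRU");	/* xfs_buf_stale() */
	buf_rele(&b, "log item");	/* iodone callback */
	buf_rele(&b, "async ioend");	/* would underflow without the hold */
	return 0;
}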
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
new file mode 100644
index 000000000000..fad1676ad8cd
--- /dev/null
+++ b/fs/xfs/xfs_cksum.h
@@ -0,0 +1,63 @@
1#ifndef _XFS_CKSUM_H
2#define _XFS_CKSUM_H 1
3
4#define XFS_CRC_SEED (~(__uint32_t)0)
5
6/*
7 * Calculate the intermediate checksum for a buffer that has the CRC field
 8 * inside it. The offset of the 32 bit crc field is passed as the
9 * cksum_offset parameter.
10 */
11static inline __uint32_t
12xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
13{
14 __uint32_t zero = 0;
15 __uint32_t crc;
16
17 /* Calculate CRC up to the checksum. */
18 crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
19
20 /* Skip checksum field */
21 crc = crc32c(crc, &zero, sizeof(__u32));
22
23 /* Calculate the rest of the CRC. */
24 return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
25 length - (cksum_offset + sizeof(__be32)));
26}
27
28/*
29 * Convert the intermediate checksum to the final ondisk format.
30 *
31 * The CRC32c calculation uses LE format even on BE machines, but returns the
32 * result in host endian format. Hence we need to byte swap it back to LE format
33 * so that it is consistent on disk.
34 */
35static inline __le32
36xfs_end_cksum(__uint32_t crc)
37{
38 return ~cpu_to_le32(crc);
39}
40
41/*
42 * Helper to generate the checksum for a buffer.
43 */
44static inline void
45xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
46{
47 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
48
49 *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
50}
51
52/*
53 * Helper to verify the checksum for a buffer.
54 */
55static inline int
56xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
57{
58 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
59
60 return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
61}
62
63#endif /* _XFS_CKSUM_H */
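/*
 * A self-contained sketch of the embedded-checksum scheme defined above.
 * A trivial rolling sum stands in for crc32c() so the example runs in
 * plain user space; the structure (checksum up to the field, fold in a
 * zeroed field, checksum the rest, then store/verify) matches the helpers.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
sum32(uint32_t seed, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--)
		seed = seed * 31 + *p++;	/* stand-in for crc32c() */
	return seed;
}

static uint32_t
start_cksum(const char *buffer, size_t length, size_t off)
{
	uint32_t zero = 0;
	uint32_t crc = sum32(~0U, buffer, off);		/* up to the field */

	crc = sum32(crc, &zero, sizeof(zero));		/* field as zero */
	return sum32(crc, buffer + off + sizeof(uint32_t),
		     length - (off + sizeof(uint32_t)));/* the rest */
}

int
main(void)
{
	char block[64] = "header";
	size_t off = 16;				/* checksum offset */
	uint32_t crc = start_cksum(block, sizeof(block), off);

	memcpy(block + off, &crc, sizeof(crc));		/* "update" */
	crc = start_cksum(block, sizeof(block), off);	/* "verify" */
	printf("%s\n", memcmp(block + off, &crc, sizeof(crc)) ?
	       "corrupt" : "ok");
	return 0;
}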
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 7bfb7dd334fc..4d7696a02418 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -91,6 +91,84 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
91 xfs_da_state_blk_t *save_blk); 91 xfs_da_state_blk_t *save_blk);
92STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); 92STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
93 93
94static void
95xfs_da_node_verify(
96 struct xfs_buf *bp)
97{
98 struct xfs_mount *mp = bp->b_target->bt_mount;
99 struct xfs_da_node_hdr *hdr = bp->b_addr;
100 int block_ok = 0;
101
102 block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
103 block_ok = block_ok &&
104 be16_to_cpu(hdr->level) > 0 &&
 105 be16_to_cpu(hdr->count) > 0;
106 if (!block_ok) {
107 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
108 xfs_buf_ioerror(bp, EFSCORRUPTED);
109 }
110
111}
112
113static void
114xfs_da_node_write_verify(
115 struct xfs_buf *bp)
116{
117 xfs_da_node_verify(bp);
118}
119
120/*
 121 * leaf/node format detection on trees is sketchy, so a node read can be done
 122 * on leaf level blocks when detection incorrectly identifies the tree as a
 123 * node format tree. In this case, we need to swap the verifier to match the
 124 * correct format of the block being read.
125 */
126static void
127xfs_da_node_read_verify(
128 struct xfs_buf *bp)
129{
130 struct xfs_mount *mp = bp->b_target->bt_mount;
131 struct xfs_da_blkinfo *info = bp->b_addr;
132
133 switch (be16_to_cpu(info->magic)) {
134 case XFS_DA_NODE_MAGIC:
135 xfs_da_node_verify(bp);
136 break;
137 case XFS_ATTR_LEAF_MAGIC:
138 bp->b_ops = &xfs_attr_leaf_buf_ops;
139 bp->b_ops->verify_read(bp);
140 return;
141 case XFS_DIR2_LEAFN_MAGIC:
142 bp->b_ops = &xfs_dir2_leafn_buf_ops;
143 bp->b_ops->verify_read(bp);
144 return;
145 default:
146 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
147 mp, info);
148 xfs_buf_ioerror(bp, EFSCORRUPTED);
149 break;
150 }
151}
152
153const struct xfs_buf_ops xfs_da_node_buf_ops = {
154 .verify_read = xfs_da_node_read_verify,
155 .verify_write = xfs_da_node_write_verify,
156};
157
158
159int
160xfs_da_node_read(
161 struct xfs_trans *tp,
162 struct xfs_inode *dp,
163 xfs_dablk_t bno,
164 xfs_daddr_t mappedbno,
165 struct xfs_buf **bpp,
166 int which_fork)
167{
168 return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
169 which_fork, &xfs_da_node_buf_ops);
170}
171
94/*======================================================================== 172/*========================================================================
95 * Routines used for growing the Btree. 173 * Routines used for growing the Btree.
96 *========================================================================*/ 174 *========================================================================*/
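/*
 * A sketch of the read-time verifier swap implemented above: the generic
 * node verifier peeks at the on-disk magic and rebinds to the matching
 * block type's verifier. Names and structure here are illustrative; the
 * magic values mirror the kernel's but the rest is not kernel API.
 */
#include <stdint.h>
#include <stdio.h>

enum { NODE_MAGIC = 0xfebe, ATTR_LEAF_MAGIC = 0xfbee, LEAFN_MAGIC = 0xd2ff };

struct blk {
	uint16_t	magic;
	int		error;
	void		(*verify)(struct blk *);
};

static void verify_node(struct blk *b)	     { printf("node %#x ok\n", b->magic); }
static void verify_attr_leaf(struct blk *b)  { printf("attr leaf %#x ok\n", b->magic); }
static void verify_leafn(struct blk *b)	     { printf("dir leafn %#x ok\n", b->magic); }

static void
node_read_verify(struct blk *b)
{
	switch (b->magic) {
	case NODE_MAGIC:
		verify_node(b);
		break;
	case ATTR_LEAF_MAGIC:
		b->verify = verify_attr_leaf;	/* rebind for later I/O */
		b->verify(b);
		break;
	case LEAFN_MAGIC:
		b->verify = verify_leafn;
		b->verify(b);
		break;
	default:
		b->error = 1;			/* EFSCORRUPTED analogue */
	}
}

int
main(void)
{
	struct blk b = { .magic = ATTR_LEAF_MAGIC };

	node_read_verify(&b);
	return 0;
}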
@@ -125,6 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
125 xfs_trans_log_buf(tp, bp, 203 xfs_trans_log_buf(tp, bp,
126 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); 204 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
127 205
206 bp->b_ops = &xfs_da_node_buf_ops;
128 *bpp = bp; 207 *bpp = bp;
129 return(0); 208 return(0);
130} 209}
@@ -324,6 +403,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
324 } 403 }
325 memcpy(node, oldroot, size); 404 memcpy(node, oldroot, size);
326 xfs_trans_log_buf(tp, bp, 0, size - 1); 405 xfs_trans_log_buf(tp, bp, 0, size - 1);
406
407 bp->b_ops = blk1->bp->b_ops;
327 blk1->bp = bp; 408 blk1->bp = bp;
328 blk1->blkno = blkno; 409 blk1->blkno = blkno;
329 410
@@ -746,7 +827,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
746 */ 827 */
747 child = be32_to_cpu(oldroot->btree[0].before); 828 child = be32_to_cpu(oldroot->btree[0].before);
748 ASSERT(child != 0); 829 ASSERT(child != 0);
749 error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, 830 error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
750 args->whichfork); 831 args->whichfork);
751 if (error) 832 if (error)
752 return(error); 833 return(error);
@@ -754,7 +835,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
754 xfs_da_blkinfo_onlychild_validate(bp->b_addr, 835 xfs_da_blkinfo_onlychild_validate(bp->b_addr,
755 be16_to_cpu(oldroot->hdr.level)); 836 be16_to_cpu(oldroot->hdr.level));
756 837
838 /*
839 * This could be copying a leaf back into the root block when only a
840 * single leaf block is left in the tree. Hence we also have to update
841 * the b_ops pointer here to match the buffer type change that could
842 * occur.
843 */
757 memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); 844 memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
845 root_blk->bp->b_ops = bp->b_ops;
758 xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); 846 xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
759 error = xfs_da_shrink_inode(args, child, bp); 847 error = xfs_da_shrink_inode(args, child, bp);
760 return(error); 848 return(error);
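Both hunks above enforce the same invariant from different directions: whenever a block's contents are copied wholesale (root split, root join), b_ops must travel with the data, because the verifier encodes the block's format. A hedged sketch of that invariant with mocked types; buf_clone_contents is a hypothetical helper, not part of the patch:

#include <string.h>

struct buf_ops { int dummy; };

struct buf {
	const struct buf_ops *ops;
	char data[64];
};

/*
 * Copying a block's payload without carrying the ops over would leave
 * later writes verified against the wrong format, so the two must
 * always travel together.
 */
static void buf_clone_contents(struct buf *dst, const struct buf *src)
{
	memcpy(dst->data, src->data, sizeof(dst->data));
	dst->ops = src->ops;	/* the format tag follows the data */
}

int main(void)
{
	static const struct buf_ops leaf_ops;
	struct buf a = { .ops = &leaf_ops }, b = { 0 };

	buf_clone_contents(&b, &a);
	return b.ops != a.ops;	/* 0: invariant holds */
}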
@@ -779,6 +867,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
779 xfs_dablk_t blkno; 867 xfs_dablk_t blkno;
780 struct xfs_buf *bp; 868 struct xfs_buf *bp;
781 869
870 trace_xfs_da_node_toosmall(state->args);
871
782 /* 872 /*
783 * Check for the degenerate case of the block being over 50% full. 873 * Check for the degenerate case of the block being over 50% full.
784 * If so, it's not worth even looking to see if we might be able 874 * If so, it's not worth even looking to see if we might be able
@@ -835,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
835 blkno = be32_to_cpu(info->back); 925 blkno = be32_to_cpu(info->back);
836 if (blkno == 0) 926 if (blkno == 0)
837 continue; 927 continue;
838 error = xfs_da_read_buf(state->args->trans, state->args->dp, 928 error = xfs_da_node_read(state->args->trans, state->args->dp,
839 blkno, -1, &bp, state->args->whichfork); 929 blkno, -1, &bp, state->args->whichfork);
840 if (error) 930 if (error)
841 return(error); 931 return(error);
@@ -900,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
900 xfs_dahash_t lasthash=0; 990 xfs_dahash_t lasthash=0;
901 int level, count; 991 int level, count;
902 992
993 trace_xfs_da_fixhashpath(state->args);
994
903 level = path->active-1; 995 level = path->active-1;
904 blk = &path->blk[ level ]; 996 blk = &path->blk[ level ];
905 switch (blk->magic) { 997 switch (blk->magic) {
@@ -1079,7 +1171,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1079 * Read the next node down in the tree. 1171 * Read the next node down in the tree.
1080 */ 1172 */
1081 blk->blkno = blkno; 1173 blk->blkno = blkno;
1082 error = xfs_da_read_buf(args->trans, args->dp, blkno, 1174 error = xfs_da_node_read(args->trans, args->dp, blkno,
1083 -1, &blk->bp, args->whichfork); 1175 -1, &blk->bp, args->whichfork);
1084 if (error) { 1176 if (error) {
1085 blk->blkno = 0; 1177 blk->blkno = 0;
@@ -1241,7 +1333,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1241 new_info->forw = cpu_to_be32(old_blk->blkno); 1333 new_info->forw = cpu_to_be32(old_blk->blkno);
1242 new_info->back = old_info->back; 1334 new_info->back = old_info->back;
1243 if (old_info->back) { 1335 if (old_info->back) {
1244 error = xfs_da_read_buf(args->trans, args->dp, 1336 error = xfs_da_node_read(args->trans, args->dp,
1245 be32_to_cpu(old_info->back), 1337 be32_to_cpu(old_info->back),
1246 -1, &bp, args->whichfork); 1338 -1, &bp, args->whichfork);
1247 if (error) 1339 if (error)
@@ -1262,7 +1354,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1262 new_info->forw = old_info->forw; 1354 new_info->forw = old_info->forw;
1263 new_info->back = cpu_to_be32(old_blk->blkno); 1355 new_info->back = cpu_to_be32(old_blk->blkno);
1264 if (old_info->forw) { 1356 if (old_info->forw) {
1265 error = xfs_da_read_buf(args->trans, args->dp, 1357 error = xfs_da_node_read(args->trans, args->dp,
1266 be32_to_cpu(old_info->forw), 1358 be32_to_cpu(old_info->forw),
1267 -1, &bp, args->whichfork); 1359 -1, &bp, args->whichfork);
1268 if (error) 1360 if (error)
@@ -1362,7 +1454,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1362 trace_xfs_da_unlink_back(args); 1454 trace_xfs_da_unlink_back(args);
1363 save_info->back = drop_info->back; 1455 save_info->back = drop_info->back;
1364 if (drop_info->back) { 1456 if (drop_info->back) {
1365 error = xfs_da_read_buf(args->trans, args->dp, 1457 error = xfs_da_node_read(args->trans, args->dp,
1366 be32_to_cpu(drop_info->back), 1458 be32_to_cpu(drop_info->back),
1367 -1, &bp, args->whichfork); 1459 -1, &bp, args->whichfork);
1368 if (error) 1460 if (error)
@@ -1379,7 +1471,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1379 trace_xfs_da_unlink_forward(args); 1471 trace_xfs_da_unlink_forward(args);
1380 save_info->forw = drop_info->forw; 1472 save_info->forw = drop_info->forw;
1381 if (drop_info->forw) { 1473 if (drop_info->forw) {
1382 error = xfs_da_read_buf(args->trans, args->dp, 1474 error = xfs_da_node_read(args->trans, args->dp,
1383 be32_to_cpu(drop_info->forw), 1475 be32_to_cpu(drop_info->forw),
1384 -1, &bp, args->whichfork); 1476 -1, &bp, args->whichfork);
1385 if (error) 1477 if (error)
@@ -1417,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1417 xfs_dablk_t blkno=0; 1509 xfs_dablk_t blkno=0;
1418 int level, error; 1510 int level, error;
1419 1511
1512 trace_xfs_da_path_shift(state->args);
1513
1420 /* 1514 /*
1421 * Roll up the Btree looking for the first block where our 1515 * Roll up the Btree looking for the first block where our
1422 * current index is not at the edge of the block. Note that 1516 * current index is not at the edge of the block. Note that
@@ -1463,8 +1557,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1463 * Read the next child block. 1557 * Read the next child block.
1464 */ 1558 */
1465 blk->blkno = blkno; 1559 blk->blkno = blkno;
1466 error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, 1560 error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
1467 &blk->bp, args->whichfork); 1561 &blk->bp, args->whichfork);
1468 if (error) 1562 if (error)
1469 return(error); 1563 return(error);
1470 ASSERT(blk->bp != NULL); 1564 ASSERT(blk->bp != NULL);
@@ -1727,7 +1821,8 @@ xfs_da_swap_lastblock(
1727 * Read the last block in the btree space. 1821 * Read the last block in the btree space.
1728 */ 1822 */
1729 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; 1823 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
1730 if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) 1824 error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
1825 if (error)
1731 return error; 1826 return error;
1732 /* 1827 /*
1733 * Copy the last block into the dead buffer and log it. 1828 * Copy the last block into the dead buffer and log it.
@@ -1753,7 +1848,8 @@ xfs_da_swap_lastblock(
1753 * If the moved block has a left sibling, fix up the pointers. 1848 * If the moved block has a left sibling, fix up the pointers.
1754 */ 1849 */
1755 if ((sib_blkno = be32_to_cpu(dead_info->back))) { 1850 if ((sib_blkno = be32_to_cpu(dead_info->back))) {
1756 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) 1851 error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
1852 if (error)
1757 goto done; 1853 goto done;
1758 sib_info = sib_buf->b_addr; 1854 sib_info = sib_buf->b_addr;
1759 if (unlikely( 1855 if (unlikely(
@@ -1774,7 +1870,8 @@ xfs_da_swap_lastblock(
1774 * If the moved block has a right sibling, fix up the pointers. 1870 * If the moved block has a right sibling, fix up the pointers.
1775 */ 1871 */
1776 if ((sib_blkno = be32_to_cpu(dead_info->forw))) { 1872 if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
1777 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) 1873 error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
1874 if (error)
1778 goto done; 1875 goto done;
1779 sib_info = sib_buf->b_addr; 1876 sib_info = sib_buf->b_addr;
1780 if (unlikely( 1877 if (unlikely(
@@ -1797,7 +1894,8 @@ xfs_da_swap_lastblock(
1797 * Walk down the tree looking for the parent of the moved block. 1894 * Walk down the tree looking for the parent of the moved block.
1798 */ 1895 */
1799 for (;;) { 1896 for (;;) {
1800 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) 1897 error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
1898 if (error)
1801 goto done; 1899 goto done;
1802 par_node = par_buf->b_addr; 1900 par_node = par_buf->b_addr;
1803 if (unlikely(par_node->hdr.info.magic != 1901 if (unlikely(par_node->hdr.info.magic !=
@@ -1847,7 +1945,8 @@ xfs_da_swap_lastblock(
1847 error = XFS_ERROR(EFSCORRUPTED); 1945 error = XFS_ERROR(EFSCORRUPTED);
1848 goto done; 1946 goto done;
1849 } 1947 }
1850 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) 1948 error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
1949 if (error)
1851 goto done; 1950 goto done;
1852 par_node = par_buf->b_addr; 1951 par_node = par_buf->b_addr;
1853 if (unlikely( 1952 if (unlikely(
@@ -2133,7 +2232,8 @@ xfs_da_read_buf(
2133 xfs_dablk_t bno, 2232 xfs_dablk_t bno,
2134 xfs_daddr_t mappedbno, 2233 xfs_daddr_t mappedbno,
2135 struct xfs_buf **bpp, 2234 struct xfs_buf **bpp,
2136 int whichfork) 2235 int whichfork,
2236 const struct xfs_buf_ops *ops)
2137{ 2237{
2138 struct xfs_buf *bp; 2238 struct xfs_buf *bp;
2139 struct xfs_buf_map map; 2239 struct xfs_buf_map map;
@@ -2155,7 +2255,7 @@ xfs_da_read_buf(
2155 2255
2156 error = xfs_trans_read_buf_map(dp->i_mount, trans, 2256 error = xfs_trans_read_buf_map(dp->i_mount, trans,
2157 dp->i_mount->m_ddev_targp, 2257 dp->i_mount->m_ddev_targp,
2158 mapp, nmap, 0, &bp); 2258 mapp, nmap, 0, &bp, ops);
2159 if (error) 2259 if (error)
2160 goto out_free; 2260 goto out_free;
2161 2261
@@ -2211,9 +2311,10 @@ xfs_da_reada_buf(
2211 struct xfs_trans *trans, 2311 struct xfs_trans *trans,
2212 struct xfs_inode *dp, 2312 struct xfs_inode *dp,
2213 xfs_dablk_t bno, 2313 xfs_dablk_t bno,
2214 int whichfork) 2314 xfs_daddr_t mappedbno,
2315 int whichfork,
2316 const struct xfs_buf_ops *ops)
2215{ 2317{
2216 xfs_daddr_t mappedbno = -1;
2217 struct xfs_buf_map map; 2318 struct xfs_buf_map map;
2218 struct xfs_buf_map *mapp; 2319 struct xfs_buf_map *mapp;
2219 int nmap; 2320 int nmap;
@@ -2221,7 +2322,7 @@ xfs_da_reada_buf(
2221 2322
2222 mapp = &map; 2323 mapp = &map;
2223 nmap = 1; 2324 nmap = 1;
2224 error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, 2325 error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
2225 &mapp, &nmap); 2326 &mapp, &nmap);
2226 if (error) { 2327 if (error) {
2227 /* mapping a hole is not an error, but we don't continue */ 2328 /* mapping a hole is not an error, but we don't continue */
@@ -2231,7 +2332,7 @@ xfs_da_reada_buf(
2231 } 2332 }
2232 2333
2233 mappedbno = mapp[0].bm_bn; 2334 mappedbno = mapp[0].bm_bn;
2234 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); 2335 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
2235 2336
2236out_free: 2337out_free:
2237 if (mapp != &map) 2338 if (mapp != &map)
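With the ops argument threaded through xfs_da_read_buf() and xfs_da_reada_buf(), per-format helpers such as xfs_da_node_read() become thin wrappers that pin the right verifier so callers cannot forget it. A standalone sketch of that wrapper shape, all types and names mocked:

#include <stdio.h>

struct buf;
struct buf_ops { void (*verify_read)(struct buf *); };
struct buf { const struct buf_ops *ops; };

static void node_verify(struct buf *bp) { (void)bp; }
static const struct buf_ops node_ops = { .verify_read = node_verify };

/* Generic read: the caller must name the verifier up front. */
static int read_buf(long bno, struct buf **bpp, const struct buf_ops *ops)
{
	static struct buf b;

	b.ops = ops;	/* attached before I/O so completion can verify */
	*bpp = &b;
	(void)bno;
	return 0;
}

/* Per-format wrapper: one place to get the ops right. */
static int node_read(long bno, struct buf **bpp)
{
	return read_buf(bno, bpp, &node_ops);
}

int main(void)
{
	struct buf *bp;

	if (node_read(7, &bp) == 0)
		printf("ops attached: %d\n", bp->ops == &node_ops); /* 1 */
	return 0;
}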
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 132adafb041e..ee5170c46ae1 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_DA_BTREE_H__ 18#ifndef __XFS_DA_BTREE_H__
19#define __XFS_DA_BTREE_H__ 19#define __XFS_DA_BTREE_H__
20 20
21struct xfs_buf;
22struct xfs_bmap_free; 21struct xfs_bmap_free;
23struct xfs_inode; 22struct xfs_inode;
24struct xfs_mount; 23struct xfs_mount;
@@ -214,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
214 */ 213 */
215int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, 214int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
216 xfs_da_state_blk_t *new_blk); 215 xfs_da_state_blk_t *new_blk);
216int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
217 xfs_dablk_t bno, xfs_daddr_t mappedbno,
218 struct xfs_buf **bpp, int which_fork);
217 219
218/* 220/*
219 * Utility routines. 221 * Utility routines.
@@ -226,9 +228,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
226 struct xfs_buf **bp, int whichfork); 228 struct xfs_buf **bp, int whichfork);
227int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, 229int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
228 xfs_dablk_t bno, xfs_daddr_t mappedbno, 230 xfs_dablk_t bno, xfs_daddr_t mappedbno,
229 struct xfs_buf **bpp, int whichfork); 231 struct xfs_buf **bpp, int whichfork,
232 const struct xfs_buf_ops *ops);
230xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, 233xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
231 xfs_dablk_t bno, int whichfork); 234 xfs_dablk_t bno, xfs_daddr_t mapped_bno,
235 int whichfork, const struct xfs_buf_ops *ops);
232int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, 236int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
233 struct xfs_buf *dead_buf); 237 struct xfs_buf *dead_buf);
234 238
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b9b8646e62db..d0e9c74d3d96 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -246,12 +246,10 @@ xfs_swap_extents(
246 goto out_unlock; 246 goto out_unlock;
247 } 247 }
248 248
249 if (VN_CACHED(VFS_I(tip)) != 0) { 249 error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
250 error = xfs_flushinval_pages(tip, 0, -1, 250 if (error)
251 FI_REMAPF_LOCKED); 251 goto out_unlock;
252 if (error) 252 truncate_pagecache_range(VFS_I(tip), 0, -1);
253 goto out_unlock;
254 }
255 253
256 /* Verify O_DIRECT for ftmp */ 254 /* Verify O_DIRECT for ftmp */
257 if (VN_CACHED(VFS_I(tip)) != 0) { 255 if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -315,8 +313,7 @@ xfs_swap_extents(
315 * are safe. We don't really care if non-io related 313 * are safe. We don't really care if non-io related
316 * fields change. 314 * fields change.
317 */ 315 */
318 316 truncate_pagecache_range(VFS_I(ip), 0, -1);
319 xfs_tosspages(ip, 0, -1, FI_REMAPF);
320 317
321 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); 318 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
322 if ((error = xfs_trans_reserve(tp, 0, 319 if ((error = xfs_trans_reserve(tp, 0,
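The leading minus on filemap_write_and_wait() above is deliberate: at this point in its history XFS uses positive errnos internally while core VFS helpers return negative ones, so VFS return values are negated at the boundary. A two-function sketch of the convention (the -5/-EIO value is illustrative):

#include <stdio.h>

static int vfs_helper(void) { return -5; }		/* VFS style: -EIO */
static int xfs_caller(void) { return -vfs_helper(); }	/* XFS style:  EIO */

int main(void)
{
	printf("%d\n", xfs_caller());	/* 5 */
	return 0;
}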
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e93ca8f054f4..7536faaa61e7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -56,6 +56,214 @@ xfs_dir_startup(void)
56 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); 56 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
57} 57}
58 58
59static void
60xfs_dir2_block_verify(
61 struct xfs_buf *bp)
62{
63 struct xfs_mount *mp = bp->b_target->bt_mount;
64 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
65 int block_ok = 0;
66
67 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
68 block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
69
70 if (!block_ok) {
71 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
72 xfs_buf_ioerror(bp, EFSCORRUPTED);
73 }
74}
75
76static void
77xfs_dir2_block_read_verify(
78 struct xfs_buf *bp)
79{
80 xfs_dir2_block_verify(bp);
81}
82
83static void
84xfs_dir2_block_write_verify(
85 struct xfs_buf *bp)
86{
87 xfs_dir2_block_verify(bp);
88}
89
90const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
91 .verify_read = xfs_dir2_block_read_verify,
92 .verify_write = xfs_dir2_block_write_verify,
93};
94
95static int
96xfs_dir2_block_read(
97 struct xfs_trans *tp,
98 struct xfs_inode *dp,
99 struct xfs_buf **bpp)
100{
101 struct xfs_mount *mp = dp->i_mount;
102
103 return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
104 XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
105}
106
107static void
108xfs_dir2_block_need_space(
109 struct xfs_dir2_data_hdr *hdr,
110 struct xfs_dir2_block_tail *btp,
111 struct xfs_dir2_leaf_entry *blp,
112 __be16 **tagpp,
113 struct xfs_dir2_data_unused **dupp,
114 struct xfs_dir2_data_unused **enddupp,
115 int *compact,
116 int len)
117{
118 struct xfs_dir2_data_free *bf;
119 __be16 *tagp = NULL;
120 struct xfs_dir2_data_unused *dup = NULL;
121 struct xfs_dir2_data_unused *enddup = NULL;
122
123 *compact = 0;
124 bf = hdr->bestfree;
125
126 /*
127 * If there are stale entries we'll use one for the leaf.
128 */
129 if (btp->stale) {
130 if (be16_to_cpu(bf[0].length) >= len) {
131 /*
132 * The biggest entry is big enough to avoid compaction.
133 */
134 dup = (xfs_dir2_data_unused_t *)
135 ((char *)hdr + be16_to_cpu(bf[0].offset));
136 goto out;
137 }
138
139 /*
140 * Will need to compact to make this work.
141 * Tag just before the first leaf entry.
142 */
143 *compact = 1;
144 tagp = (__be16 *)blp - 1;
145
146 /* Data object just before the first leaf entry. */
147 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
148
149 /*
150 * If it's not free then the data will go where the
151 * leaf data starts now, if it works at all.
152 */
153 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
154 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
155 (uint)sizeof(*blp) < len)
156 dup = NULL;
157 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
158 dup = NULL;
159 else
160 dup = (xfs_dir2_data_unused_t *)blp;
161 goto out;
162 }
163
164 /*
164 * No stale entries, so just use free space.
166 * Tag just before the first leaf entry.
167 */
168 tagp = (__be16 *)blp - 1;
169
170 /* Data object just before the first leaf entry. */
171 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
172
173 /*
174 * If it's not free then can't do this add without cleaning up:
175 * the space before the first leaf entry needs to be free so it
176 * can be expanded to hold the pointer to the new entry.
177 */
178 if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
179 /*
180 * Check out the biggest freespace and see if it's the same one.
181 */
182 dup = (xfs_dir2_data_unused_t *)
183 ((char *)hdr + be16_to_cpu(bf[0].offset));
184 if (dup != enddup) {
185 /*
186 * Not the same free entry, just check its length.
187 */
188 if (be16_to_cpu(dup->length) < len)
189 dup = NULL;
190 goto out;
191 }
192
193 /*
194 * It is the biggest freespace, can it hold the leaf too?
195 */
196 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
197 /*
198 * No, so use the second-largest entry instead if it works.
199 */
200 if (be16_to_cpu(bf[1].length) >= len)
201 dup = (xfs_dir2_data_unused_t *)
202 ((char *)hdr + be16_to_cpu(bf[1].offset));
203 else
204 dup = NULL;
205 }
206 }
207out:
208 *tagpp = tagp;
209 *dupp = dup;
210 *enddupp = enddup;
211}
212
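The stale branch above decides whether compaction would yield enough room: squeezing the stale leaf entries reclaims (stale - 1) slots of sizeof(*blp) bytes each (one stale slot is always kept), and that reclaimed space is added to the candidate free region. A worked example of the check with made-up sizes:

#include <stdio.h>

int main(void)
{
	unsigned stale = 4;	/* stale leaf entries in the block */
	unsigned blp_size = 8;	/* sizeof(*blp): one leaf entry */
	unsigned dup_len = 16;	/* free region in front of the leaf */
	unsigned len = 32;	/* space the new data entry needs */

	/* Mirrors: dup->length + (stale - 1) * sizeof(*blp) < len */
	int fits = dup_len + (stale - 1) * blp_size >= len;

	printf("fits after compaction: %d\n", fits);	/* 16 + 24 >= 32 -> 1 */
	return 0;
}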
213/*
214 * Compact the leaf entries.
215 * Leave the highest-numbered stale entry stale.
216 * XXX should be the one closest to mid but mid is not yet computed.
217 */
218static void
219xfs_dir2_block_compact(
220 struct xfs_trans *tp,
221 struct xfs_buf *bp,
222 struct xfs_dir2_data_hdr *hdr,
223 struct xfs_dir2_block_tail *btp,
224 struct xfs_dir2_leaf_entry *blp,
225 int *needlog,
226 int *lfloghigh,
227 int *lfloglow)
228{
229 int fromidx; /* source leaf index */
230 int toidx; /* target leaf index */
231 int needscan = 0;
232 int highstale; /* high stale index */
233
234 fromidx = toidx = be32_to_cpu(btp->count) - 1;
235 highstale = *lfloghigh = -1;
236 for (; fromidx >= 0; fromidx--) {
237 if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
238 if (highstale == -1)
239 highstale = toidx;
240 else {
241 if (*lfloghigh == -1)
242 *lfloghigh = toidx;
243 continue;
244 }
245 }
246 if (fromidx < toidx)
247 blp[toidx] = blp[fromidx];
248 toidx--;
249 }
250 *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
251 *lfloghigh -= be32_to_cpu(btp->stale) - 1;
252 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
253 xfs_dir2_data_make_free(tp, bp,
254 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
255 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
256 needlog, &needscan);
257 blp += be32_to_cpu(btp->stale) - 1;
258 btp->stale = cpu_to_be32(1);
259 /*
260 * If we now need to rebuild the bestfree map, do so.
261 * This needs to happen before the next call to use_free.
262 */
263 if (needscan)
264 xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
265}
266
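The factored-out loop walks the leaf array from the top, keeps the first (and therefore highest-numbered) stale entry it meets, drops the rest, and slides live entries up. The same logic in a standalone demo, with 0 standing in for XFS_DIR2_NULL_DATAPTR:

#include <stdio.h>

int main(void)
{
	int blp[] = { 5, 0, 7, 0, 9, 0 };	/* 0 marks a stale entry */
	int n = 6, highstale = -1;
	int fromidx, toidx = n - 1;
	int i;

	for (fromidx = n - 1; fromidx >= 0; fromidx--) {
		if (blp[fromidx] == 0) {
			if (highstale == -1)
				highstale = toidx;	/* keep this one */
			else
				continue;		/* drop the rest */
		}
		if (fromidx < toidx)
			blp[toidx] = blp[fromidx];
		toidx--;
	}
	for (i = toidx + 1; i < n; i++)
		printf("%d ", blp[i]);	/* prints: 5 7 9 0 */
	printf("\n");
	return 0;
}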
59/* 267/*
60 * Add an entry to a block directory. 268 * Add an entry to a block directory.
61 */ 269 */
@@ -63,7 +271,6 @@ int /* error */
63xfs_dir2_block_addname( 271xfs_dir2_block_addname(
64 xfs_da_args_t *args) /* directory op arguments */ 272 xfs_da_args_t *args) /* directory op arguments */
65{ 273{
66 xfs_dir2_data_free_t *bf; /* bestfree table in block */
67 xfs_dir2_data_hdr_t *hdr; /* block header */ 274 xfs_dir2_data_hdr_t *hdr; /* block header */
68 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ 275 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
69 struct xfs_buf *bp; /* buffer for block */ 276 struct xfs_buf *bp; /* buffer for block */
@@ -94,134 +301,44 @@ xfs_dir2_block_addname(
94 dp = args->dp; 301 dp = args->dp;
95 tp = args->trans; 302 tp = args->trans;
96 mp = dp->i_mount; 303 mp = dp->i_mount;
97 /* 304
98 * Read the (one and only) directory block into dabuf bp. 305 /* Read the (one and only) directory block into bp. */
99 */ 306 error = xfs_dir2_block_read(tp, dp, &bp);
100 if ((error = 307 if (error)
101 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
102 return error; 308 return error;
103 } 309
104 ASSERT(bp != NULL);
105 hdr = bp->b_addr;
106 /*
107 * Check the magic number, corrupted if wrong.
108 */
109 if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) {
110 XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
111 XFS_ERRLEVEL_LOW, mp, hdr);
112 xfs_trans_brelse(tp, bp);
113 return XFS_ERROR(EFSCORRUPTED);
114 }
115 len = xfs_dir2_data_entsize(args->namelen); 310 len = xfs_dir2_data_entsize(args->namelen);
311
116 /* 312 /*
117 * Set up pointers to parts of the block. 313 * Set up pointers to parts of the block.
118 */ 314 */
119 bf = hdr->bestfree; 315 hdr = bp->b_addr;
120 btp = xfs_dir2_block_tail_p(mp, hdr); 316 btp = xfs_dir2_block_tail_p(mp, hdr);
121 blp = xfs_dir2_block_leaf_p(btp); 317 blp = xfs_dir2_block_leaf_p(btp);
318
122 /* 319 /*
123 * No stale entries? Need space for entry and new leaf. 320 * Find out if we can reuse stale entries or whether we need extra
124 */ 321 * space for the entry and a new leaf.
125 if (!btp->stale) {
126 /*
127 * Tag just before the first leaf entry.
128 */
129 tagp = (__be16 *)blp - 1;
130 /*
131 * Data object just before the first leaf entry.
132 */
133 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
134 /*
135 * If it's not free then can't do this add without cleaning up:
136 * the space before the first leaf entry needs to be free so it
137 * can be expanded to hold the pointer to the new entry.
138 */
139 if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG)
140 dup = enddup = NULL;
141 /*
142 * Check out the biggest freespace and see if it's the same one.
143 */
144 else {
145 dup = (xfs_dir2_data_unused_t *)
146 ((char *)hdr + be16_to_cpu(bf[0].offset));
147 if (dup == enddup) {
148 /*
149 * It is the biggest freespace, is it too small
150 * to hold the new leaf too?
151 */
152 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
153 /*
154 * Yes, we use the second-largest
155 * entry instead if it works.
156 */
157 if (be16_to_cpu(bf[1].length) >= len)
158 dup = (xfs_dir2_data_unused_t *)
159 ((char *)hdr +
160 be16_to_cpu(bf[1].offset));
161 else
162 dup = NULL;
163 }
164 } else {
165 /*
166 * Not the same free entry,
167 * just check its length.
168 */
169 if (be16_to_cpu(dup->length) < len) {
170 dup = NULL;
171 }
172 }
173 }
174 compact = 0;
175 }
176 /*
177 * If there are stale entries we'll use one for the leaf.
178 * Is the biggest entry enough to avoid compaction?
179 */
180 else if (be16_to_cpu(bf[0].length) >= len) {
181 dup = (xfs_dir2_data_unused_t *)
182 ((char *)hdr + be16_to_cpu(bf[0].offset));
183 compact = 0;
184 }
185 /*
186 * Will need to compact to make this work.
187 */ 322 */
188 else { 323 xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
189 /* 324 &enddup, &compact, len);
190 * Tag just before the first leaf entry. 325
191 */
192 tagp = (__be16 *)blp - 1;
193 /*
194 * Data object just before the first leaf entry.
195 */
196 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
197 /*
198 * If it's not free then the data will go where the
199 * leaf data starts now, if it works at all.
200 */
201 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
202 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
203 (uint)sizeof(*blp) < len)
204 dup = NULL;
205 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
206 dup = NULL;
207 else
208 dup = (xfs_dir2_data_unused_t *)blp;
209 compact = 1;
210 }
211 /* 326 /*
212 * If this isn't a real add, we're done with the buffer. 327 * We now have everything we need for the space check.
213 */ 328 */
214 if (args->op_flags & XFS_DA_OP_JUSTCHECK) 329 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
215 xfs_trans_brelse(tp, bp); 330 xfs_trans_brelse(tp, bp);
331 if (!dup)
332 return XFS_ERROR(ENOSPC);
333 return 0;
334 }
335
216 /* 336 /*
217 * If we don't have space for the new entry & leaf ... 337 * If we don't have space for the new entry & leaf ...
218 */ 338 */
219 if (!dup) { 339 if (!dup) {
220 /* 340 /* Don't have a space reservation: return no-space. */
221 * Not trying to actually do anything, or don't have 341 if (args->total == 0)
222 * a space reservation: return no-space.
223 */
224 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
225 return XFS_ERROR(ENOSPC); 342 return XFS_ERROR(ENOSPC);
226 /* 343 /*
227 * Convert to the next larger format. 344 * Convert to the next larger format.
@@ -232,65 +349,24 @@ xfs_dir2_block_addname(
232 return error; 349 return error;
233 return xfs_dir2_leaf_addname(args); 350 return xfs_dir2_leaf_addname(args);
234 } 351 }
235 /* 352
236 * Just checking, and it would work, so say so.
237 */
238 if (args->op_flags & XFS_DA_OP_JUSTCHECK)
239 return 0;
240 needlog = needscan = 0; 353 needlog = needscan = 0;
354
241 /* 355 /*
242 * If need to compact the leaf entries, do it now. 356 * If need to compact the leaf entries, do it now.
243 * Leave the highest-numbered stale entry stale.
244 * XXX should be the one closest to mid but mid is not yet computed.
245 */
246 if (compact) {
247 int fromidx; /* source leaf index */
248 int toidx; /* target leaf index */
249
250 for (fromidx = toidx = be32_to_cpu(btp->count) - 1,
251 highstale = lfloghigh = -1;
252 fromidx >= 0;
253 fromidx--) {
254 if (blp[fromidx].address ==
255 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
256 if (highstale == -1)
257 highstale = toidx;
258 else {
259 if (lfloghigh == -1)
260 lfloghigh = toidx;
261 continue;
262 }
263 }
264 if (fromidx < toidx)
265 blp[toidx] = blp[fromidx];
266 toidx--;
267 }
268 lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
269 lfloghigh -= be32_to_cpu(btp->stale) - 1;
270 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
271 xfs_dir2_data_make_free(tp, bp,
272 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
273 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
274 &needlog, &needscan);
275 blp += be32_to_cpu(btp->stale) - 1;
276 btp->stale = cpu_to_be32(1);
277 /*
278 * If we now need to rebuild the bestfree map, do so.
279 * This needs to happen before the next call to use_free.
280 */
281 if (needscan) {
282 xfs_dir2_data_freescan(mp, hdr, &needlog);
283 needscan = 0;
284 }
285 }
286 /*
287 * Set leaf logging boundaries to impossible state.
288 * For the no-stale case they're set explicitly.
289 */ 357 */
358 if (compact)
359 xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
360 &lfloghigh, &lfloglow);
290 else if (btp->stale) { 361 else if (btp->stale) {
362 /*
363 * Set leaf logging boundaries to impossible state.
364 * For the no-stale case they're set explicitly.
365 */
291 lfloglow = be32_to_cpu(btp->count); 366 lfloglow = be32_to_cpu(btp->count);
292 lfloghigh = -1; 367 lfloghigh = -1;
293 } 368 }
369
294 /* 370 /*
295 * Find the slot that's first lower than our hash value, -1 if none. 371 * Find the slot that's first lower than our hash value, -1 if none.
296 */ 372 */
@@ -450,18 +526,13 @@ xfs_dir2_block_getdents(
450 /* 526 /*
451 * If the block number in the offset is out of range, we're done. 527 * If the block number in the offset is out of range, we're done.
452 */ 528 */
453 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { 529 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
454 return 0; 530 return 0;
455 } 531
456 /* 532 error = xfs_dir2_block_read(NULL, dp, &bp);
457 * Can't read the block, give up, else get dabuf in bp.
458 */
459 error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
460 &bp, XFS_DATA_FORK);
461 if (error) 533 if (error)
462 return error; 534 return error;
463 535
464 ASSERT(bp != NULL);
465 /* 536 /*
466 * Extract the byte offset we start at from the seek pointer. 537 * Extract the byte offset we start at from the seek pointer.
467 * We'll skip entries before this. 538 * We'll skip entries before this.
@@ -637,14 +708,11 @@ xfs_dir2_block_lookup_int(
637 dp = args->dp; 708 dp = args->dp;
638 tp = args->trans; 709 tp = args->trans;
639 mp = dp->i_mount; 710 mp = dp->i_mount;
640 /* 711
641 * Read the buffer, return error if we can't get it. 712 error = xfs_dir2_block_read(tp, dp, &bp);
642 */ 713 if (error)
643 if ((error =
644 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
645 return error; 714 return error;
646 } 715
647 ASSERT(bp != NULL);
648 hdr = bp->b_addr; 716 hdr = bp->b_addr;
649 xfs_dir2_data_check(dp, bp); 717 xfs_dir2_data_check(dp, bp);
650 btp = xfs_dir2_block_tail_p(mp, hdr); 718 btp = xfs_dir2_block_tail_p(mp, hdr);
@@ -917,10 +985,10 @@ xfs_dir2_leaf_to_block(
917 /* 985 /*
918 * Read the data block if we don't already have it, give up if it fails. 986 * Read the data block if we don't already have it, give up if it fails.
919 */ 987 */
920 if (dbp == NULL && 988 if (!dbp) {
921 (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, 989 error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
922 XFS_DATA_FORK))) { 990 if (error)
923 return error; 991 return error;
924 } 992 }
925 hdr = dbp->b_addr; 993 hdr = dbp->b_addr;
926 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); 994 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
@@ -944,6 +1012,7 @@ xfs_dir2_leaf_to_block(
944 /* 1012 /*
945 * Start converting it to block form. 1013 * Start converting it to block form.
946 */ 1014 */
1015 dbp->b_ops = &xfs_dir2_block_buf_ops;
947 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); 1016 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
948 needlog = 1; 1017 needlog = 1;
949 needscan = 0; 1018 needscan = 0;
@@ -1073,6 +1142,7 @@ xfs_dir2_sf_to_block(
1073 kmem_free(sfp); 1142 kmem_free(sfp);
1074 return error; 1143 return error;
1075 } 1144 }
1145 bp->b_ops = &xfs_dir2_block_buf_ops;
1076 hdr = bp->b_addr; 1146 hdr = bp->b_addr;
1077 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); 1147 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
1078 /* 1148 /*
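Note the closing pattern of this file: every site that changes a buffer's format in place (leaf-to-block conversion, shortform-to-block) now flips b_ops together with the on-disk magic. A hedged sketch of that pairing; the types and conversion helper are mocked, though the magic value is the real XFS_DIR2_BLOCK_MAGIC ("XD2B"):

#include <stdio.h>
#include <stdint.h>

struct buf_ops { int dummy; };
static const struct buf_ops block_ops;

struct buf {
	const struct buf_ops *ops;
	uint32_t magic;		/* stands in for hdr->magic */
};

#define DIR2_BLOCK_MAGIC 0x58443242u	/* "XD2B" */

/* Format change = new magic on disk + new verifier in memory. */
static void convert_to_block_format(struct buf *bp)
{
	bp->ops = &block_ops;
	bp->magic = DIR2_BLOCK_MAGIC;
}

int main(void)
{
	struct buf bp = { 0 };

	convert_to_block_format(&bp);
	printf("magic = %#x, ops set = %d\n",
	       (unsigned)bp.magic, bp.ops == &block_ops);
	return 0;
}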
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 44ffd4d6bc91..ffcf1774152e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -34,14 +34,13 @@
34STATIC xfs_dir2_data_free_t * 34STATIC xfs_dir2_data_free_t *
35xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); 35xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
36 36
37#ifdef DEBUG
38/* 37/*
39 * Check the consistency of the data block. 38 * Check the consistency of the data block.
40 * The input can also be a block-format directory. 39 * The input can also be a block-format directory.
41 * Pop an assert if we find anything bad. 40 * Return 0 if the buffer is good, otherwise an error.
42 */ 41 */
43void 42int
44xfs_dir2_data_check( 43__xfs_dir2_data_check(
45 struct xfs_inode *dp, /* incore inode pointer */ 44 struct xfs_inode *dp, /* incore inode pointer */
46 struct xfs_buf *bp) /* data block's buffer */ 45 struct xfs_buf *bp) /* data block's buffer */
47{ 46{
@@ -64,18 +63,23 @@ xfs_dir2_data_check(
64 int stale; /* count of stale leaves */ 63 int stale; /* count of stale leaves */
65 struct xfs_name name; 64 struct xfs_name name;
66 65
67 mp = dp->i_mount; 66 mp = bp->b_target->bt_mount;
68 hdr = bp->b_addr; 67 hdr = bp->b_addr;
69 bf = hdr->bestfree; 68 bf = hdr->bestfree;
70 p = (char *)(hdr + 1); 69 p = (char *)(hdr + 1);
71 70
72 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 71 switch (hdr->magic) {
72 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
73 btp = xfs_dir2_block_tail_p(mp, hdr); 73 btp = xfs_dir2_block_tail_p(mp, hdr);
74 lep = xfs_dir2_block_leaf_p(btp); 74 lep = xfs_dir2_block_leaf_p(btp);
75 endp = (char *)lep; 75 endp = (char *)lep;
76 } else { 76 break;
77 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); 77 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
78 endp = (char *)hdr + mp->m_dirblksize; 78 endp = (char *)hdr + mp->m_dirblksize;
79 break;
80 default:
81 XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
82 return EFSCORRUPTED;
79 } 83 }
80 84
81 count = lastfree = freeseen = 0; 85 count = lastfree = freeseen = 0;
@@ -83,19 +87,22 @@ xfs_dir2_data_check(
83 * Account for zero bestfree entries. 87 * Account for zero bestfree entries.
84 */ 88 */
85 if (!bf[0].length) { 89 if (!bf[0].length) {
86 ASSERT(!bf[0].offset); 90 XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
87 freeseen |= 1 << 0; 91 freeseen |= 1 << 0;
88 } 92 }
89 if (!bf[1].length) { 93 if (!bf[1].length) {
90 ASSERT(!bf[1].offset); 94 XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
91 freeseen |= 1 << 1; 95 freeseen |= 1 << 1;
92 } 96 }
93 if (!bf[2].length) { 97 if (!bf[2].length) {
94 ASSERT(!bf[2].offset); 98 XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
95 freeseen |= 1 << 2; 99 freeseen |= 1 << 2;
96 } 100 }
97 ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); 101
98 ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); 102 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
103 be16_to_cpu(bf[1].length));
104 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
105 be16_to_cpu(bf[2].length));
99 /* 106 /*
100 * Loop over the data/unused entries. 107 * Loop over the data/unused entries.
101 */ 108 */
@@ -107,17 +114,20 @@ xfs_dir2_data_check(
107 * doesn't need to be there. 114 * doesn't need to be there.
108 */ 115 */
109 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 116 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
110 ASSERT(lastfree == 0); 117 XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
111 ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == 118 XFS_WANT_CORRUPTED_RETURN(
112 (char *)dup - (char *)hdr); 119 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
120 (char *)dup - (char *)hdr);
113 dfp = xfs_dir2_data_freefind(hdr, dup); 121 dfp = xfs_dir2_data_freefind(hdr, dup);
114 if (dfp) { 122 if (dfp) {
115 i = (int)(dfp - bf); 123 i = (int)(dfp - bf);
116 ASSERT((freeseen & (1 << i)) == 0); 124 XFS_WANT_CORRUPTED_RETURN(
125 (freeseen & (1 << i)) == 0);
117 freeseen |= 1 << i; 126 freeseen |= 1 << i;
118 } else { 127 } else {
119 ASSERT(be16_to_cpu(dup->length) <= 128 XFS_WANT_CORRUPTED_RETURN(
120 be16_to_cpu(bf[2].length)); 129 be16_to_cpu(dup->length) <=
130 be16_to_cpu(bf[2].length));
121 } 131 }
122 p += be16_to_cpu(dup->length); 132 p += be16_to_cpu(dup->length);
123 lastfree = 1; 133 lastfree = 1;
@@ -130,10 +140,12 @@ xfs_dir2_data_check(
130 * The linear search is crude but this is DEBUG code. 140 * The linear search is crude but this is DEBUG code.
131 */ 141 */
132 dep = (xfs_dir2_data_entry_t *)p; 142 dep = (xfs_dir2_data_entry_t *)p;
133 ASSERT(dep->namelen != 0); 143 XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
134 ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); 144 XFS_WANT_CORRUPTED_RETURN(
135 ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == 145 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
136 (char *)dep - (char *)hdr); 146 XFS_WANT_CORRUPTED_RETURN(
147 be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
148 (char *)dep - (char *)hdr);
137 count++; 149 count++;
138 lastfree = 0; 150 lastfree = 0;
139 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 151 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
@@ -148,27 +160,122 @@ xfs_dir2_data_check(
148 be32_to_cpu(lep[i].hashval) == hash) 160 be32_to_cpu(lep[i].hashval) == hash)
149 break; 161 break;
150 } 162 }
151 ASSERT(i < be32_to_cpu(btp->count)); 163 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
152 } 164 }
153 p += xfs_dir2_data_entsize(dep->namelen); 165 p += xfs_dir2_data_entsize(dep->namelen);
154 } 166 }
155 /* 167 /*
156 * Need to have seen all the entries and all the bestfree slots. 168 * Need to have seen all the entries and all the bestfree slots.
157 */ 169 */
158 ASSERT(freeseen == 7); 170 XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
159 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 171 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
160 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { 172 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
161 if (lep[i].address == 173 if (lep[i].address ==
162 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) 174 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
163 stale++; 175 stale++;
164 if (i > 0) 176 if (i > 0)
165 ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); 177 XFS_WANT_CORRUPTED_RETURN(
178 be32_to_cpu(lep[i].hashval) >=
179 be32_to_cpu(lep[i - 1].hashval));
166 } 180 }
167 ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); 181 XFS_WANT_CORRUPTED_RETURN(count ==
168 ASSERT(stale == be32_to_cpu(btp->stale)); 182 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
183 XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
169 } 184 }
185 return 0;
186}
187
188static void
189xfs_dir2_data_verify(
190 struct xfs_buf *bp)
191{
192 struct xfs_mount *mp = bp->b_target->bt_mount;
193 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
194 int block_ok = 0;
195
196 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
197 block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
198
199 if (!block_ok) {
200 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
201 xfs_buf_ioerror(bp, EFSCORRUPTED);
202 }
203}
204
205/*
206 * Readahead of the first block of the directory when it is opened is completely
207 * oblivious to the format of the directory. Hence we can either get a block
208 * format buffer or a data format buffer on readahead.
209 */
210static void
211xfs_dir2_data_reada_verify(
212 struct xfs_buf *bp)
213{
214 struct xfs_mount *mp = bp->b_target->bt_mount;
215 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
216
217 switch (hdr->magic) {
218 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
219 bp->b_ops = &xfs_dir2_block_buf_ops;
220 bp->b_ops->verify_read(bp);
221 return;
222 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
223 xfs_dir2_data_verify(bp);
224 return;
225 default:
226 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
227 xfs_buf_ioerror(bp, EFSCORRUPTED);
228 break;
229 }
230}
231
232static void
233xfs_dir2_data_read_verify(
234 struct xfs_buf *bp)
235{
236 xfs_dir2_data_verify(bp);
237}
238
239static void
240xfs_dir2_data_write_verify(
241 struct xfs_buf *bp)
242{
243 xfs_dir2_data_verify(bp);
244}
245
246const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
247 .verify_read = xfs_dir2_data_read_verify,
248 .verify_write = xfs_dir2_data_write_verify,
249};
250
251static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
252 .verify_read = xfs_dir2_data_reada_verify,
253 .verify_write = xfs_dir2_data_write_verify,
254};
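Because the first directory block is read ahead before its format is known, the reada ops pair a dispatching read verifier with the ordinary data write verifier: the read side inspects the magic that actually arrived, re-points b_ops at the matching format, and runs that verifier. The same shape in a standalone sketch with mocked types and magic values (the kernel version reports corruption for unknown magics; here that is folded into data_verify):

#include <stdio.h>

struct buf;
struct buf_ops { void (*verify_read)(struct buf *); };
struct buf {
	const struct buf_ops *ops;
	unsigned magic;
	int error;
};

enum { BLOCK_MAGIC = 1, DATA_MAGIC = 2, EFSCORRUPTED = 117 };

static void block_verify(struct buf *bp)
{
	if (bp->magic != BLOCK_MAGIC)
		bp->error = EFSCORRUPTED;
}

static void data_verify(struct buf *bp)
{
	if (bp->magic != DATA_MAGIC)
		bp->error = EFSCORRUPTED;
}

static const struct buf_ops block_ops = { .verify_read = block_verify };

/* Readahead read verifier: decide the format from the disk contents. */
static void reada_verify(struct buf *bp)
{
	switch (bp->magic) {
	case BLOCK_MAGIC:
		bp->ops = &block_ops;	/* later writes verified as "block" */
		bp->ops->verify_read(bp);
		return;
	default:
		data_verify(bp);	/* data format, or flagged corrupt */
	}
}

int main(void)
{
	struct buf bp = { .magic = BLOCK_MAGIC };

	reada_verify(&bp);
	printf("swapped = %d, error = %d\n", bp.ops == &block_ops, bp.error);
	return 0;
}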
255
256
257int
258xfs_dir2_data_read(
259 struct xfs_trans *tp,
260 struct xfs_inode *dp,
261 xfs_dablk_t bno,
262 xfs_daddr_t mapped_bno,
263 struct xfs_buf **bpp)
264{
265 return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
266 XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
267}
268
269int
270xfs_dir2_data_readahead(
271 struct xfs_trans *tp,
272 struct xfs_inode *dp,
273 xfs_dablk_t bno,
274 xfs_daddr_t mapped_bno)
275{
276 return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
277 XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
170} 278}
171#endif
172 279
173/* 280/*
174 * Given a data block and an unused entry from that block, 281 * Given a data block and an unused entry from that block,
@@ -409,10 +516,9 @@ xfs_dir2_data_init(
409 */ 516 */
410 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, 517 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
411 XFS_DATA_FORK); 518 XFS_DATA_FORK);
412 if (error) { 519 if (error)
413 return error; 520 return error;
414 } 521 bp->b_ops = &xfs_dir2_data_buf_ops;
415 ASSERT(bp != NULL);
416 522
417 /* 523 /*
418 * Initialize the header. 524 * Initialize the header.
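The larger shift in this file is that __xfs_dir2_data_check() stops being DEBUG-only: each ASSERT becomes an XFS_WANT_CORRUPTED_RETURN, which checks at runtime and bails out with EFSCORRUPTED rather than tripping a debug kernel. A simplified sketch of what such a macro boils down to; the real one also reports the failure before returning:

#include <stdio.h>

#define EFSCORRUPTED 117

/* Simplified; the kernel macro also logs via XFS_ERROR_REPORT. */
#define WANT_CORRUPTED_RETURN(expr)		\
	do {					\
		if (!(expr))			\
			return EFSCORRUPTED;	\
	} while (0)

static int check_block(int magic, int count)
{
	WANT_CORRUPTED_RETURN(magic == 0x58443244);	/* "XD2D" */
	WANT_CORRUPTED_RETURN(count > 0);
	return 0;	/* block is good */
}

int main(void)
{
	printf("%d\n", check_block(0x58443244, 3));	/* 0 */
	printf("%d\n", check_block(0, 3));		/* 117 */
	return 0;
}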
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0b296253bd01..60cd2fa4e047 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -48,6 +48,83 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
48 int first, int last); 48 int first, int last);
49static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); 49static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
50 50
51static void
52xfs_dir2_leaf_verify(
53 struct xfs_buf *bp,
54 __be16 magic)
55{
56 struct xfs_mount *mp = bp->b_target->bt_mount;
57 struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
58 int block_ok = 0;
59
60 block_ok = hdr->info.magic == magic;
61 if (!block_ok) {
62 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
63 xfs_buf_ioerror(bp, EFSCORRUPTED);
64 }
65}
66
67static void
68xfs_dir2_leaf1_read_verify(
69 struct xfs_buf *bp)
70{
71 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
72}
73
74static void
75xfs_dir2_leaf1_write_verify(
76 struct xfs_buf *bp)
77{
78 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
79}
80
81void
82xfs_dir2_leafn_read_verify(
83 struct xfs_buf *bp)
84{
85 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
86}
87
88void
89xfs_dir2_leafn_write_verify(
90 struct xfs_buf *bp)
91{
92 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
93}
94
95static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
96 .verify_read = xfs_dir2_leaf1_read_verify,
97 .verify_write = xfs_dir2_leaf1_write_verify,
98};
99
100const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
101 .verify_read = xfs_dir2_leafn_read_verify,
102 .verify_write = xfs_dir2_leafn_write_verify,
103};
104
105static int
106xfs_dir2_leaf_read(
107 struct xfs_trans *tp,
108 struct xfs_inode *dp,
109 xfs_dablk_t fbno,
110 xfs_daddr_t mappedbno,
111 struct xfs_buf **bpp)
112{
113 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
114 XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
115}
116
117int
118xfs_dir2_leafn_read(
119 struct xfs_trans *tp,
120 struct xfs_inode *dp,
121 xfs_dablk_t fbno,
122 xfs_daddr_t mappedbno,
123 struct xfs_buf **bpp)
124{
125 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
126 XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
127}
51 128
52/* 129/*
53 * Convert a block form directory to a leaf form directory. 130 * Convert a block form directory to a leaf form directory.
@@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf(
125 /* 202 /*
126 * Fix up the block header, make it a data block. 203 * Fix up the block header, make it a data block.
127 */ 204 */
205 dbp->b_ops = &xfs_dir2_data_buf_ops;
128 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); 206 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
129 if (needscan) 207 if (needscan)
130 xfs_dir2_data_freescan(mp, hdr, &needlog); 208 xfs_dir2_data_freescan(mp, hdr, &needlog);
@@ -311,15 +389,11 @@ xfs_dir2_leaf_addname(
311 dp = args->dp; 389 dp = args->dp;
312 tp = args->trans; 390 tp = args->trans;
313 mp = dp->i_mount; 391 mp = dp->i_mount;
314 /* 392
315 * Read the leaf block. 393 error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
316 */ 394 if (error)
317 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
318 XFS_DATA_FORK);
319 if (error) {
320 return error; 395 return error;
321 } 396
322 ASSERT(lbp != NULL);
323 /* 397 /*
324 * Look up the entry by hash value and name. 398 * Look up the entry by hash value and name.
325 * We know it's not there, our caller has already done a lookup. 399 * We know it's not there, our caller has already done a lookup.
@@ -494,22 +568,21 @@ xfs_dir2_leaf_addname(
494 hdr = dbp->b_addr; 568 hdr = dbp->b_addr;
495 bestsp[use_block] = hdr->bestfree[0].length; 569 bestsp[use_block] = hdr->bestfree[0].length;
496 grown = 1; 570 grown = 1;
497 } 571 } else {
498 /* 572 /*
499 * Already had space in some data block. 573 * Already had space in some data block.
500 * Just read that one in. 574 * Just read that one in.
501 */ 575 */
502 else { 576 error = xfs_dir2_data_read(tp, dp,
503 if ((error = 577 xfs_dir2_db_to_da(mp, use_block),
504 xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), 578 -1, &dbp);
505 -1, &dbp, XFS_DATA_FORK))) { 579 if (error) {
506 xfs_trans_brelse(tp, lbp); 580 xfs_trans_brelse(tp, lbp);
507 return error; 581 return error;
508 } 582 }
509 hdr = dbp->b_addr; 583 hdr = dbp->b_addr;
510 grown = 0; 584 grown = 0;
511 } 585 }
512 xfs_dir2_data_check(dp, dbp);
513 /* 586 /*
514 * Point to the biggest freespace in our data block. 587 * Point to the biggest freespace in our data block.
515 */ 588 */
@@ -892,10 +965,9 @@ xfs_dir2_leaf_readbuf(
892 * Read the directory block starting at the first mapping. 965 * Read the directory block starting at the first mapping.
893 */ 966 */
894 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); 967 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
895 error = xfs_da_read_buf(NULL, dp, map->br_startoff, 968 error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
896 map->br_blockcount >= mp->m_dirblkfsbs ? 969 map->br_blockcount >= mp->m_dirblkfsbs ?
897 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, 970 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
898 &bp, XFS_DATA_FORK);
899 971
900 /* 972 /*
901 * Should just skip over the data block instead of giving up. 973 * Should just skip over the data block instead of giving up.
@@ -922,11 +994,11 @@ xfs_dir2_leaf_readbuf(
922 */ 994 */
923 if (i > mip->ra_current && 995 if (i > mip->ra_current &&
924 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { 996 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
925 xfs_buf_readahead(mp->m_ddev_targp, 997 xfs_dir2_data_readahead(NULL, dp,
998 map[mip->ra_index].br_startoff + mip->ra_offset,
926 XFS_FSB_TO_DADDR(mp, 999 XFS_FSB_TO_DADDR(mp,
927 map[mip->ra_index].br_startblock + 1000 map[mip->ra_index].br_startblock +
928 mip->ra_offset), 1001 mip->ra_offset));
929 (int)BTOBB(mp->m_dirblksize));
930 mip->ra_current = i; 1002 mip->ra_current = i;
931 } 1003 }
932 1004
@@ -935,10 +1007,9 @@ xfs_dir2_leaf_readbuf(
935 * use our mapping, but this is a very rare case. 1007 * use our mapping, but this is a very rare case.
936 */ 1008 */
937 else if (i > mip->ra_current) { 1009 else if (i > mip->ra_current) {
938 xfs_da_reada_buf(NULL, dp, 1010 xfs_dir2_data_readahead(NULL, dp,
939 map[mip->ra_index].br_startoff + 1011 map[mip->ra_index].br_startoff +
940 mip->ra_offset, 1012 mip->ra_offset, -1);
941 XFS_DATA_FORK);
942 mip->ra_current = i; 1013 mip->ra_current = i;
943 } 1014 }
944 1015
@@ -1177,15 +1248,14 @@ xfs_dir2_leaf_init(
1177 * Get the buffer for the block. 1248 * Get the buffer for the block.
1178 */ 1249 */
1179 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, 1250 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
1180 XFS_DATA_FORK); 1251 XFS_DATA_FORK);
1181 if (error) { 1252 if (error)
1182 return error; 1253 return error;
1183 } 1254
1184 ASSERT(bp != NULL);
1185 leaf = bp->b_addr;
1186 /* 1255 /*
1187 * Initialize the header. 1256 * Initialize the header.
1188 */ 1257 */
1258 leaf = bp->b_addr;
1189 leaf->hdr.info.magic = cpu_to_be16(magic); 1259 leaf->hdr.info.magic = cpu_to_be16(magic);
1190 leaf->hdr.info.forw = 0; 1260 leaf->hdr.info.forw = 0;
1191 leaf->hdr.info.back = 0; 1261 leaf->hdr.info.back = 0;
@@ -1198,10 +1268,12 @@ xfs_dir2_leaf_init(
1198 * the block. 1268 * the block.
1199 */ 1269 */
1200 if (magic == XFS_DIR2_LEAF1_MAGIC) { 1270 if (magic == XFS_DIR2_LEAF1_MAGIC) {
1271 bp->b_ops = &xfs_dir2_leaf1_buf_ops;
1201 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 1272 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
1202 ltp->bestcount = 0; 1273 ltp->bestcount = 0;
1203 xfs_dir2_leaf_log_tail(tp, bp); 1274 xfs_dir2_leaf_log_tail(tp, bp);
1204 } 1275 } else
1276 bp->b_ops = &xfs_dir2_leafn_buf_ops;
1205 *bpp = bp; 1277 *bpp = bp;
1206 return 0; 1278 return 0;
1207} 1279}
@@ -1372,13 +1444,11 @@ xfs_dir2_leaf_lookup_int(
1372 dp = args->dp; 1444 dp = args->dp;
1373 tp = args->trans; 1445 tp = args->trans;
1374 mp = dp->i_mount; 1446 mp = dp->i_mount;
1375 /* 1447
1376 * Read the leaf block into the buffer. 1448 error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
1377 */
1378 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
1379 XFS_DATA_FORK);
1380 if (error) 1449 if (error)
1381 return error; 1450 return error;
1451
1382 *lbpp = lbp; 1452 *lbpp = lbp;
1383 leaf = lbp->b_addr; 1453 leaf = lbp->b_addr;
1384 xfs_dir2_leaf_check(dp, lbp); 1454 xfs_dir2_leaf_check(dp, lbp);
@@ -1409,14 +1479,13 @@ xfs_dir2_leaf_lookup_int(
1409 if (newdb != curdb) { 1479 if (newdb != curdb) {
1410 if (dbp) 1480 if (dbp)
1411 xfs_trans_brelse(tp, dbp); 1481 xfs_trans_brelse(tp, dbp);
1412 error = xfs_da_read_buf(tp, dp, 1482 error = xfs_dir2_data_read(tp, dp,
1413 xfs_dir2_db_to_da(mp, newdb), 1483 xfs_dir2_db_to_da(mp, newdb),
1414 -1, &dbp, XFS_DATA_FORK); 1484 -1, &dbp);
1415 if (error) { 1485 if (error) {
1416 xfs_trans_brelse(tp, lbp); 1486 xfs_trans_brelse(tp, lbp);
1417 return error; 1487 return error;
1418 } 1488 }
1419 xfs_dir2_data_check(dp, dbp);
1420 curdb = newdb; 1489 curdb = newdb;
1421 } 1490 }
1422 /* 1491 /*
@@ -1451,9 +1520,9 @@ xfs_dir2_leaf_lookup_int(
1451 ASSERT(cidb != -1); 1520 ASSERT(cidb != -1);
1452 if (cidb != curdb) { 1521 if (cidb != curdb) {
1453 xfs_trans_brelse(tp, dbp); 1522 xfs_trans_brelse(tp, dbp);
1454 error = xfs_da_read_buf(tp, dp, 1523 error = xfs_dir2_data_read(tp, dp,
1455 xfs_dir2_db_to_da(mp, cidb), 1524 xfs_dir2_db_to_da(mp, cidb),
1456 -1, &dbp, XFS_DATA_FORK); 1525 -1, &dbp);
1457 if (error) { 1526 if (error) {
1458 xfs_trans_brelse(tp, lbp); 1527 xfs_trans_brelse(tp, lbp);
1459 return error; 1528 return error;
@@ -1738,10 +1807,9 @@ xfs_dir2_leaf_trim_data(
1738 /* 1807 /*
1739 * Read the offending data block. We need its buffer. 1808 * Read the offending data block. We need its buffer.
1740 */ 1809 */
1741 if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, 1810 error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
1742 XFS_DATA_FORK))) { 1811 if (error)
1743 return error; 1812 return error;
1744 }
1745 1813
1746 leaf = lbp->b_addr; 1814 leaf = lbp->b_addr;
1747 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 1815 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -1864,10 +1932,9 @@ xfs_dir2_node_to_leaf(
1864 /* 1932 /*
1865 * Read the freespace block. 1933 * Read the freespace block.
1866 */ 1934 */
1867 if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, 1935 error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
1868 XFS_DATA_FORK))) { 1936 if (error)
1869 return error; 1937 return error;
1870 }
1871 free = fbp->b_addr; 1938 free = fbp->b_addr;
1872 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1939 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1873 ASSERT(!free->hdr.firstdb); 1940 ASSERT(!free->hdr.firstdb);
@@ -1890,7 +1957,10 @@ xfs_dir2_node_to_leaf(
1890 xfs_dir2_leaf_compact(args, lbp); 1957 xfs_dir2_leaf_compact(args, lbp);
1891 else 1958 else
1892 xfs_dir2_leaf_log_header(tp, lbp); 1959 xfs_dir2_leaf_log_header(tp, lbp);
1960
1961 lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
1893 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); 1962 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
1963
1894 /* 1964 /*
1895 * Set up the leaf tail from the freespace block. 1965 * Set up the leaf tail from the freespace block.
1896 */ 1966 */
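xfs_dir2_leaf_init() and xfs_dir2_node_to_leaf() now select between the leaf1 and leafn ops based on the magic being written, preserving the invariant that b_ops always matches hdr.info.magic. One compact way to view that selection (leaf_ops_for is a hypothetical helper, not in the patch; the magic values are the real ones):

#include <stdio.h>

struct buf_ops { int dummy; };
static const struct buf_ops leaf1_ops, leafn_ops;

#define DIR2_LEAF1_MAGIC 0xd2f1		/* XFS_DIR2_LEAF1_MAGIC */
#define DIR2_LEAFN_MAGIC 0xd2ff		/* XFS_DIR2_LEAFN_MAGIC */

static const struct buf_ops *leaf_ops_for(unsigned magic)
{
	return magic == DIR2_LEAF1_MAGIC ? &leaf1_ops : &leafn_ops;
}

int main(void)
{
	printf("%d\n", leaf_ops_for(DIR2_LEAF1_MAGIC) == &leaf1_ops); /* 1 */
	printf("%d\n", leaf_ops_for(DIR2_LEAFN_MAGIC) == &leafn_ops); /* 1 */
	return 0;
}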
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 6c7052406605..5980f9b7fa9b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -55,6 +55,74 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
55static int xfs_dir2_node_addname_int(xfs_da_args_t *args, 55static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
56 xfs_da_state_blk_t *fblk); 56 xfs_da_state_blk_t *fblk);
57 57
58static void
59xfs_dir2_free_verify(
60 struct xfs_buf *bp)
61{
62 struct xfs_mount *mp = bp->b_target->bt_mount;
63 struct xfs_dir2_free_hdr *hdr = bp->b_addr;
64 int block_ok = 0;
65
66 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
67 if (!block_ok) {
68 XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
69 XFS_ERRLEVEL_LOW, mp, hdr);
70 xfs_buf_ioerror(bp, EFSCORRUPTED);
71 }
72}
73
74static void
75xfs_dir2_free_read_verify(
76 struct xfs_buf *bp)
77{
78 xfs_dir2_free_verify(bp);
79}
80
81static void
82xfs_dir2_free_write_verify(
83 struct xfs_buf *bp)
84{
85 xfs_dir2_free_verify(bp);
86}
87
88static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
89 .verify_read = xfs_dir2_free_read_verify,
90 .verify_write = xfs_dir2_free_write_verify,
91};
92
93
94static int
95__xfs_dir2_free_read(
96 struct xfs_trans *tp,
97 struct xfs_inode *dp,
98 xfs_dablk_t fbno,
99 xfs_daddr_t mappedbno,
100 struct xfs_buf **bpp)
101{
102 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
103 XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
104}
105
106int
107xfs_dir2_free_read(
108 struct xfs_trans *tp,
109 struct xfs_inode *dp,
110 xfs_dablk_t fbno,
111 struct xfs_buf **bpp)
112{
113 return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
114}
115
116static int
117xfs_dir2_free_try_read(
118 struct xfs_trans *tp,
119 struct xfs_inode *dp,
120 xfs_dablk_t fbno,
121 struct xfs_buf **bpp)
122{
123 return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
124}
125
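The read/try_read split above rides on the mappedbno argument that xfs_da_read_buf() already interprets specially: -1 means map the block and treat a missing mapping as an error, while -2 tolerates a hole and hands back a NULL buffer. A sketch of that convention with a mocked mapper:

#include <stdio.h>
#include <stddef.h>

#define ENOENT 2

struct buf { int blkno; };

/* Mocked: pretend only block 10 is actually mapped. */
static int map_block(long bno) { return bno == 10; }

static int read_buf(long bno, long mappedbno, struct buf **bpp)
{
	static struct buf b;

	*bpp = NULL;
	if (!map_block(bno)) {
		if (mappedbno == -2)
			return 0;	/* try-read: a hole is not an error */
		return ENOENT;		/* -1: caller demanded the block */
	}
	b.blkno = (int)bno;
	*bpp = &b;
	return 0;
}

int main(void)
{
	struct buf *bp;

	printf("%d\n", read_buf(11, -1, &bp));			/* 2   */
	printf("%d %d\n", read_buf(11, -2, &bp), bp == NULL);	/* 0 1 */
	return 0;
}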
58/* 126/*
59 * Log entries from a freespace block. 127 * Log entries from a freespace block.
60 */ 128 */
@@ -131,11 +199,12 @@ xfs_dir2_leaf_to_node(
 	/*
 	 * Get the buffer for the new freespace block.
 	 */
-	if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
-			XFS_DATA_FORK))) {
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
+			       XFS_DATA_FORK);
+	if (error)
 		return error;
-	}
-	ASSERT(fbp != NULL);
+	fbp->b_ops = &xfs_dir2_free_buf_ops;
+
 	free = fbp->b_addr;
 	leaf = lbp->b_addr;
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -157,7 +226,10 @@ xfs_dir2_leaf_to_node(
 		*to = cpu_to_be16(off);
 	}
 	free->hdr.nused = cpu_to_be32(n);
+
+	lbp->b_ops = &xfs_dir2_leafn_buf_ops;
 	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+
 	/*
 	 * Log everything.
 	 */
@@ -394,12 +466,10 @@ xfs_dir2_leafn_lookup_for_addname(
 	 */
 	if (curbp)
 		xfs_trans_brelse(tp, curbp);
-	/*
-	 * Read the free block.
-	 */
-	error = xfs_da_read_buf(tp, dp,
+
+	error = xfs_dir2_free_read(tp, dp,
 				xfs_dir2_db_to_da(mp, newfdb),
-				-1, &curbp, XFS_DATA_FORK);
+				&curbp);
 	if (error)
 		return error;
 	free = curbp->b_addr;
@@ -534,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry(
 			ASSERT(state->extravalid);
 			curbp = state->extrablk.bp;
 		} else {
-			error = xfs_da_read_buf(tp, dp,
+			error = xfs_dir2_data_read(tp, dp,
 						xfs_dir2_db_to_da(mp, newdb),
-						-1, &curbp, XFS_DATA_FORK);
+						-1, &curbp);
 			if (error)
 				return error;
 		}
@@ -568,6 +638,7 @@ xfs_dir2_leafn_lookup_for_entry(
 			state->extrablk.index = (int)((char *)dep -
 							(char *)curbp->b_addr);
 			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+			curbp->b_ops = &xfs_dir2_data_buf_ops;
 			if (cmp == XFS_CMP_EXACT)
 				return XFS_ERROR(EEXIST);
 		}
@@ -582,6 +653,7 @@ xfs_dir2_leafn_lookup_for_entry(
 		state->extrablk.index = -1;
 		state->extrablk.blkno = curdb;
 		state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+		curbp->b_ops = &xfs_dir2_data_buf_ops;
 	} else {
 		/* If the curbp is not the CI match block, drop it */
 		if (state->extrablk.bp != curbp)
@@ -825,6 +897,77 @@ xfs_dir2_leafn_rebalance(
 	}
 }
 
+static int
+xfs_dir2_data_block_free(
+	xfs_da_args_t		*args,
+	struct xfs_dir2_data_hdr *hdr,
+	struct xfs_dir2_free	*free,
+	xfs_dir2_db_t		fdb,
+	int			findex,
+	struct xfs_buf		*fbp,
+	int			longest)
+{
+	struct xfs_trans	*tp = args->trans;
+	int			logfree = 0;
+
+	if (!hdr) {
+		/* One less used entry in the free table. */
+		be32_add_cpu(&free->hdr.nused, -1);
+		xfs_dir2_free_log_header(tp, fbp);
+
+		/*
+		 * If this was the last entry in the table, we can trim the
+		 * table size back.  There might be other entries at the end
+		 * referring to non-existent data blocks, get those too.
+		 */
+		if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
+			int	i;		/* free entry index */
+
+			for (i = findex - 1; i >= 0; i--) {
+				if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
+					break;
+			}
+			free->hdr.nvalid = cpu_to_be32(i + 1);
+			logfree = 0;
+		} else {
+			/* Not the last entry, just punch it out. */
+			free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+			logfree = 1;
+		}
+		/*
+		 * If there are no useful entries left in the block,
+		 * get rid of the block if we can.
+		 */
+		if (!free->hdr.nused) {
+			int error;
+
+			error = xfs_dir2_shrink_inode(args, fdb, fbp);
+			if (error == 0) {
+				fbp = NULL;
+				logfree = 0;
+			} else if (error != ENOSPC || args->total != 0)
+				return error;
+			/*
+			 * It's possible to get ENOSPC if there is no
+			 * space reservation.  In this case someone
+			 * else will eventually get rid of this block.
+			 */
+		}
+	} else {
+		/*
+		 * Data block is not empty, just set the free entry to the new
+		 * value.
+		 */
+		free->bests[findex] = cpu_to_be16(longest);
+		logfree = 1;
+	}
+
+	/* Log the free entry that changed, unless we got rid of it. */
+	if (logfree)
+		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+	return 0;
+}
+
 /*
  * Remove an entry from a node directory.
  * This removes the leaf entry and the data entry,
@@ -908,17 +1051,16 @@ xfs_dir2_leafn_remove(
 	xfs_dir2_db_t		fdb;		/* freeblock block number */
 	int			findex;		/* index in freeblock entries */
 	xfs_dir2_free_t		*free;		/* freeblock structure */
-	int			logfree;	/* need to log free entry */
 
 	/*
 	 * Convert the data block number to a free block,
 	 * read in the free block.
 	 */
 	fdb = xfs_dir2_db_to_fdb(mp, db);
-	if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb),
-			-1, &fbp, XFS_DATA_FORK))) {
+	error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+				   &fbp);
+	if (error)
 		return error;
-	}
 	free = fbp->b_addr;
 	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 	ASSERT(be32_to_cpu(free->hdr.firstdb) ==
@@ -954,68 +1096,12 @@ xfs_dir2_leafn_remove(
 	 * If we got rid of the data block, we can eliminate that entry
 	 * in the free block.
 	 */
-	if (hdr == NULL) {
-		/*
-		 * One less used entry in the free table.
-		 */
-		be32_add_cpu(&free->hdr.nused, -1);
-		xfs_dir2_free_log_header(tp, fbp);
-		/*
-		 * If this was the last entry in the table, we can
-		 * trim the table size back.  There might be other
-		 * entries at the end referring to non-existent
-		 * data blocks, get those too.
-		 */
-		if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
-			int	i;		/* free entry index */
-
-			for (i = findex - 1;
-			     i >= 0 &&
-			     free->bests[i] == cpu_to_be16(NULLDATAOFF);
-			     i--)
-				continue;
-			free->hdr.nvalid = cpu_to_be32(i + 1);
-			logfree = 0;
-		}
-		/*
-		 * Not the last entry, just punch it out.
-		 */
-		else {
-			free->bests[findex] = cpu_to_be16(NULLDATAOFF);
-			logfree = 1;
-		}
-		/*
-		 * If there are no useful entries left in the block,
-		 * get rid of the block if we can.
-		 */
-		if (!free->hdr.nused) {
-			error = xfs_dir2_shrink_inode(args, fdb, fbp);
-			if (error == 0) {
-				fbp = NULL;
-				logfree = 0;
-			} else if (error != ENOSPC || args->total != 0)
-				return error;
-			/*
-			 * It's possible to get ENOSPC if there is no
-			 * space reservation.  In this case someone
-			 * else will eventually get rid of this block.
-			 */
-		}
-	}
-	/*
-	 * Data block is not empty, just set the free entry to
-	 * the new value.
-	 */
-	else {
-		free->bests[findex] = cpu_to_be16(longest);
-		logfree = 1;
-	}
-	/*
-	 * Log the free entry that changed, unless we got rid of it.
-	 */
-	if (logfree)
-		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+	error = xfs_dir2_data_block_free(args, hdr, free,
+					 fdb, findex, fbp, longest);
+	if (error)
+		return error;
 	}
+
 	xfs_dir2_leafn_check(dp, bp);
 	/*
 	 * Return indication of whether this leaf block is empty enough
@@ -1169,12 +1255,11 @@ xfs_dir2_leafn_toosmall(
 		/*
 		 * Read the sibling leaf block.
 		 */
-		if ((error =
-		    xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
-			    -1, &bp, XFS_DATA_FORK))) {
+		error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
+					    blkno, -1, &bp);
+		if (error)
 			return error;
-		}
-		ASSERT(bp != NULL);
+
 		/*
 		 * Count bytes in the two blocks combined.
 		 */
@@ -1454,14 +1539,13 @@ xfs_dir2_node_addname_int(
 			 * This should be really rare, so there's no reason
 			 * to avoid it.
 			 */
-			if ((error = xfs_da_read_buf(tp, dp,
-					xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
-					XFS_DATA_FORK))) {
+			error = xfs_dir2_free_try_read(tp, dp,
+						xfs_dir2_db_to_da(mp, fbno),
+						&fbp);
+			if (error)
 				return error;
-			}
-			if (unlikely(fbp == NULL)) {
+			if (!fbp)
 				continue;
-			}
 			free = fbp->b_addr;
 			ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 			findex = 0;
@@ -1520,9 +1604,10 @@ xfs_dir2_node_addname_int(
 		 * that was just allocated.
 		 */
 		fbno = xfs_dir2_db_to_fdb(mp, dbno);
-		if (unlikely(error = xfs_da_read_buf(tp, dp,
-				xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
-				XFS_DATA_FORK)))
+		error = xfs_dir2_free_try_read(tp, dp,
+					       xfs_dir2_db_to_da(mp, fbno),
+					       &fbp);
+		if (error)
 			return error;
 
 		/*
@@ -1561,12 +1646,12 @@ xfs_dir2_node_addname_int(
 			/*
 			 * Get a buffer for the new block.
 			 */
-			if ((error = xfs_da_get_buf(tp, dp,
-					xfs_dir2_db_to_da(mp, fbno),
-					-1, &fbp, XFS_DATA_FORK))) {
+			error = xfs_da_get_buf(tp, dp,
+					       xfs_dir2_db_to_da(mp, fbno),
+					       -1, &fbp, XFS_DATA_FORK);
+			if (error)
 				return error;
-			}
-			ASSERT(fbp != NULL);
+			fbp->b_ops = &xfs_dir2_free_buf_ops;
 
 			/*
 			 * Initialize the new block to be empty, and remember
@@ -1630,8 +1715,8 @@ xfs_dir2_node_addname_int(
 		/*
 		 * Read the data block in.
 		 */
-		error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno),
-					-1, &dbp, XFS_DATA_FORK);
+		error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+					   -1, &dbp);
 		if (error)
 			return error;
 		hdr = dbp->b_addr;
@@ -1917,18 +2002,15 @@ xfs_dir2_node_trim_free(
 	/*
 	 * Read the freespace block.
 	 */
-	if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
-				XFS_DATA_FORK))) {
+	error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+	if (error)
 		return error;
-	}
-
 	/*
 	 * There can be holes in freespace.  If fo is a hole, there's
 	 * nothing to do.
 	 */
-	if (bp == NULL) {
+	if (!bp)
 		return 0;
-	}
 	free = bp->b_addr;
 	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 	/*
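Throughout these hunks the mappedbno argument carries the read policy down to xfs_da_read_buf(): the -1 form used by xfs_dir2_free_read() requires the block to exist, while the -2 form behind the new xfs_dir2_free_try_read() appears to return success with a NULL buffer when the block is a hole, which is what the xfs_dir2_node_trim_free() hunk relies on. A sketch of the calling convention (tp, dp and fo stand in for a real caller's state):

	struct xfs_buf	*bp;
	int		error;

	error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
	if (error)
		return error;	/* genuine read failure or corruption */
	if (!bp)
		return 0;	/* hole in the freespace index: nothing to do */

	/* ... bp->b_addr has already passed the attached read verifier ... */
	xfs_trans_brelse(tp, bp);
	return 0;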
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 3523d3e15aa8..7da79f6515fd 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
		const unsigned char *name, int len);
 
 /* xfs_dir2_block.c */
+extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
+
 extern int xfs_dir2_block_addname(struct xfs_da_args *args);
 extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
		xfs_off_t *offset, filldir_t filldir);
@@ -41,10 +43,19 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 
 /* xfs_dir2_data.c */
 #ifdef DEBUG
-extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+#define	xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp)
 #else
 #define	xfs_dir2_data_check(dp,bp)
 #endif
+
+extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
+
+extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t bno, xfs_daddr_t mapped_bno);
+
 extern struct xfs_dir2_data_free *
 xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
		struct xfs_dir2_data_unused *dup, int *loghead);
@@ -66,6 +77,10 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 /* xfs_dir2_leaf.c */
+extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
+
+extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
		struct xfs_buf *dbp);
 extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
 extern int xfs_dir2_node_replace(struct xfs_da_args *args);
 extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
		int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t fbno, struct xfs_buf **bpp);
 
 /* xfs_dir2_sf.c */
 extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
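Note that __xfs_dir2_data_check() now returns an int rather than asserting, so code outside DEBUG builds (such as the verifiers) can react to a corrupt block; the xfs_dir2_data_check() macro keeps the old fire-and-forget form. A sketch of a caller using the int-returning form directly (the surrounding error handling is illustrative, not from the patch):

	/* act on a corrupt data block instead of asserting */
	error = __xfs_dir2_data_check(dp, bp);
	if (error) {
		xfs_trans_brelse(tp, bp);	/* drop the bad buffer */
		return XFS_ERROR(EFSCORRUPTED);
	}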
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index bf27fcca4843..9e1bf5294c91 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk(
	xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
 }
 
+static void
+xfs_dquot_buf_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	struct xfs_disk_dquot	*ddq;
+	xfs_dqid_t		id = 0;
+	int			i;
+
+	/*
+	 * On the first read of the buffer, verify that each dquot is valid.
+	 * We don't know what the id of the dquot is supposed to be, just that
+	 * they should be increasing monotonically within the buffer.  If the
+	 * first id is corrupt, then it will fail on the second dquot in the
+	 * buffer so corruptions could point to the wrong dquot in this case.
+	 */
+	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+		int	error;
+
+		ddq = &d[i].dd_diskdq;
+
+		if (i == 0)
+			id = be32_to_cpu(ddq->d_id);
+
+		error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+					"xfs_dquot_read_verify");
+		if (error) {
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			break;
+		}
+	}
+}
+
+static void
+xfs_dquot_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dquot_buf_verify(bp);
+}
+
+void
+xfs_dquot_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dquot_buf_verify(bp);
+}
 
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+	.verify_read = xfs_dquot_buf_read_verify,
+	.verify_write = xfs_dquot_buf_write_verify,
+};
 
 /*
  * Allocate a block and fill it with dquots.
@@ -315,6 +367,7 @@ xfs_qm_dqalloc(
	error = xfs_buf_geterror(bp);
	if (error)
		goto error1;
+	bp->b_ops = &xfs_dquot_buf_ops;
 
	/*
	 * Make a chunk of dquots out of this buffer and log
@@ -359,6 +412,51 @@ xfs_qm_dqalloc(
 
	return (error);
 }
+STATIC int
+xfs_qm_dqrepair(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_dquot	*dqp,
+	xfs_dqid_t		firstid,
+	struct xfs_buf		**bpp)
+{
+	int			error;
+	struct xfs_disk_dquot	*ddq;
+	struct xfs_dqblk	*d;
+	int			i;
+
+	/*
+	 * Read the buffer without verification so we get the corrupted
+	 * buffer returned to us.  Make sure we verify it on write, though.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
+				   mp->m_quotainfo->qi_dqchunklen,
+				   0, bpp, NULL);
+
+	if (error) {
+		ASSERT(*bpp == NULL);
+		return XFS_ERROR(error);
+	}
+	(*bpp)->b_ops = &xfs_dquot_buf_ops;
+
+	ASSERT(xfs_buf_islocked(*bpp));
+	d = (struct xfs_dqblk *)(*bpp)->b_addr;
+
+	/* Do the actual repair of dquots in this buffer */
+	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+		ddq = &d[i].dd_diskdq;
+		error = xfs_qm_dqcheck(mp, ddq, firstid + i,
+				       dqp->dq_flags & XFS_DQ_ALLTYPES,
+				       XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
+		if (error) {
+			/* repair failed, we're screwed */
+			xfs_trans_brelse(tp, *bpp);
+			return XFS_ERROR(EIO);
+		}
+	}
+
+	return 0;
+}
 
 /*
  * Maps a dquot to the buffer containing its on-disk version.
@@ -378,7 +476,6 @@ xfs_qm_dqtobp(
	xfs_buf_t	*bp;
	xfs_inode_t	*quotip = XFS_DQ_TO_QIP(dqp);
	xfs_mount_t	*mp = dqp->q_mount;
-	xfs_disk_dquot_t *ddq;
	xfs_dqid_t	id = be32_to_cpu(dqp->q_core.d_id);
	xfs_trans_t	*tp = (tpp ? *tpp : NULL);
 
@@ -439,33 +536,24 @@ xfs_qm_dqtobp(
		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
					   dqp->q_blkno,
					   mp->m_quotainfo->qi_dqchunklen,
-					   0, &bp);
-		if (error || !bp)
-			return XFS_ERROR(error);
-	}
-
-	ASSERT(xfs_buf_islocked(bp));
+					   0, &bp, &xfs_dquot_buf_ops);
 
-	/*
-	 * calculate the location of the dquot inside the buffer.
-	 */
-	ddq = bp->b_addr + dqp->q_bufoffset;
+		if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+			xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
+					mp->m_quotainfo->qi_dqperchunk;
+			ASSERT(bp == NULL);
+			error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
+		}
 
-	/*
-	 * A simple sanity check in case we got a corrupted dquot...
-	 */
-	error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
-			flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
-			"dqtobp");
-	if (error) {
-		if (!(flags & XFS_QMOPT_DQREPAIR)) {
-			xfs_trans_brelse(tp, bp);
-			return XFS_ERROR(EIO);
+		if (error) {
+			ASSERT(bp == NULL);
+			return XFS_ERROR(error);
		}
	}
 
+	ASSERT(xfs_buf_islocked(bp));
	*O_bpp = bp;
-	*O_ddpp = ddq;
+	*O_ddpp = bp->b_addr + dqp->q_bufoffset;
 
	return (0);
 }
@@ -920,7 +1008,7 @@ xfs_qm_dqflush(
	 * Get the buffer containing the on-disk dquot
	 */
	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
-				   mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+				   mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
	if (error)
		goto out_unlock;
 
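The repair path above is deliberately two-phase: the buffer is first read with a NULL ops pointer so the read verifier cannot reject the corrupted contents, xfs_dquot_buf_ops is attached immediately so any later writeback is verified, and each dquot is then fixed in place with xfs_qm_dqcheck(..., XFS_QMOPT_DQREPAIR, ...). A condensed sketch of the flow (blkno stands in for dqp->q_blkno):

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, blkno,
				   mp->m_quotainfo->qi_dqchunklen,
				   0, &bp, NULL);	/* NULL ops: skip the read verifier */
	if (error)
		return XFS_ERROR(error);
	bp->b_ops = &xfs_dquot_buf_ops;	/* verify the repaired contents on writeback */
	/* ... xfs_qm_dqcheck(..., XFS_QMOPT_DQREPAIR, ...) on each dquot ... */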
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 7d20af27346d..c694a8469c4a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
	return dqp;
 }
 
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+
 #endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 8c6d1d70278c..a83611849cee 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -29,6 +29,7 @@
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aa473fa640a2..67284edb84d7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -31,6 +31,8 @@
 #include "xfs_error.h"
 #include "xfs_vnodeops.h"
 #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
 #include "xfs_ioctl.h"
 #include "xfs_trace.h"
 
@@ -84,7 +86,7 @@ xfs_rw_ilock_demote(
  * valid before the operation, it will be read from disk before
  * being partially zeroed.
  */
-STATIC int
+int
 xfs_iozero(
	struct xfs_inode	*ip,	/* inode			*/
	loff_t			pos,	/* offset in file		*/
@@ -255,15 +257,14 @@ xfs_file_aio_read(
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;
-		if ((iocb->ki_pos & target->bt_smask) ||
-		    (size & target->bt_smask)) {
-			if (iocb->ki_pos == i_size_read(inode))
+		if ((pos & target->bt_smask) || (size & target->bt_smask)) {
+			if (pos == i_size_read(inode))
				return 0;
			return -XFS_ERROR(EINVAL);
		}
	}
 
-	n = mp->m_super->s_maxbytes - iocb->ki_pos;
+	n = mp->m_super->s_maxbytes - pos;
	if (n <= 0 || size == 0)
		return 0;
 
@@ -289,20 +290,21 @@ xfs_file_aio_read(
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
		if (inode->i_mapping->nrpages) {
-			ret = -xfs_flushinval_pages(ip,
-					(iocb->ki_pos & PAGE_CACHE_MASK),
-					-1, FI_REMAPF_LOCKED);
+			ret = -filemap_write_and_wait_range(
+							VFS_I(ip)->i_mapping,
+							pos, -1);
			if (ret) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return ret;
			}
+			truncate_pagecache_range(VFS_I(ip), pos, -1);
		}
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
	}
 
-	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+	trace_xfs_file_read(ip, size, pos, ioflags);
 
-	ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+	ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
	if (ret > 0)
		XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -670,10 +672,11 @@ xfs_file_dio_aio_write(
		goto out;
 
	if (mapping->nrpages) {
-		ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
-						FI_REMAPF_LOCKED);
+		ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+						    pos, -1);
		if (ret)
			goto out;
+		truncate_pagecache_range(VFS_I(ip), pos, -1);
	}
 
	/*
@@ -728,16 +731,17 @@ xfs_file_buffered_aio_write(
 write_retry:
	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-			pos, &iocb->ki_pos, count, ret);
+			pos, &iocb->ki_pos, count, 0);
+
	/*
-	 * if we just got an ENOSPC, flush the inode now we aren't holding any
-	 * page locks and retry *once*
+	 * If we just got an ENOSPC, try to write back all dirty inodes to
+	 * convert delalloc space to free up some of the excess reserved
+	 * metadata space.
	 */
	if (ret == -ENOSPC && !enospc) {
		enospc = 1;
-		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-		if (!ret)
-			goto write_retry;
+		xfs_flush_inodes(ip->i_mount);
+		goto write_retry;
	}
 
	current->backing_dev_info = NULL;
@@ -889,7 +893,7 @@ xfs_dir_open(
	 */
	mode = xfs_ilock_map_shared(ip);
	if (ip->i_d.di_nextents > 0)
-		xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+		xfs_dir2_data_readahead(NULL, ip, 0, -1);
	xfs_iunlock(ip, mode);
	return 0;
 }
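The xfs_flushinval_pages() calls are replaced by their open-coded VFS equivalents: write back any dirty pagecache over the range, then drop it, so the following direct I/O cannot see stale cached pages. The explicit negation matches XFS's positive-errno convention of this era, since the filemap helpers return negative errnos. A sketch of the resulting idiom, assuming ip and pos as in the hunks above:

	if (VFS_I(ip)->i_mapping->nrpages) {
		/* flush dirty pages over the range, then toss the cache */
		ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						    pos, -1);
		if (ret)
			return ret;
		truncate_pagecache_range(VFS_I(ip), pos, -1);
	}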
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c13fed8c394a..6dda3f949b04 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_LOGV2	0x0100	/* log format version 2	*/
 #define XFS_FSOP_GEOM_FLAGS_SECTOR	0x0200	/* sector sizes >1BB	*/
 #define XFS_FSOP_GEOM_FLAGS_ATTR2	0x0400	/* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names	*/
+#define XFS_FSOP_GEOM_FLAGS_PROJID32	0x0800	/* 32-bit project IDs	*/
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names	*/
 #define XFS_FSOP_GEOM_FLAGS_LAZYSB	0x4000	/* lazy superblock counters */
 
 
@@ -339,6 +340,35 @@ typedef struct xfs_error_injection {
 
 
 /*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION		1
+struct xfs_eofblocks {
+	__u32		eof_version;
+	__u32		eof_flags;
+	uid_t		eof_uid;
+	gid_t		eof_gid;
+	prid_t		eof_prid;
+	__u32		pad32;
+	__u64		eof_min_file_size;
+	__u64		pad64[12];
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC		(1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID		(1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID		(1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID		(1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE	(1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_VALID	\
+	(XFS_EOF_FLAGS_SYNC |	\
+	 XFS_EOF_FLAGS_UID |	\
+	 XFS_EOF_FLAGS_GID |	\
+	 XFS_EOF_FLAGS_PRID |	\
+	 XFS_EOF_FLAGS_MINFILESIZE)
+
+
+/*
  * The user-level Handle Request interface structure.
  */
 typedef struct xfs_fsop_handlereq {
@@ -456,6 +486,7 @@ typedef struct xfs_handle {
 /*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
 #define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_eofblocks)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
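A hypothetical userspace sketch of driving the new speculative-preallocation trim interface. The header providing these definitions and the choice of file descriptor (presumably any fd on the target filesystem) are assumptions, and the filter values are only an example:

	#include <sys/ioctl.h>
	#include <stdio.h>
	#include <xfs/xfs.h>	/* assumed to carry the new definitions */

	/* trim post-EOF preallocation for uid 1000's files of at least 1 MiB */
	struct xfs_eofblocks eofb = {
		.eof_version		= XFS_EOFBLOCKS_VERSION,
		.eof_flags		= XFS_EOF_FLAGS_SYNC | XFS_EOF_FLAGS_UID |
					  XFS_EOF_FLAGS_MINFILESIZE,
		.eof_uid		= 1000,
		.eof_min_file_size	= 1024 * 1024,
	};

	if (ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb) < 0)
		perror("XFS_IOC_FREE_EOFBLOCKS");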
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
deleted file mode 100644
index 652b875a9d4c..000000000000
--- a/fs/xfs/xfs_fs_subr.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trace.h"
-
-/*
- * note: all filemap functions return negative error codes. These
- * need to be inverted before returning to the xfs core functions.
- */
-void
-xfs_tosspages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	int		fiopt)
-{
-	/* can't toss partial tail pages, so mask them out */
-	last &= ~(PAGE_SIZE - 1);
-	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
-}
-
-int
-xfs_flushinval_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	int		fiopt)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-	int		ret = 0;
-
-	trace_xfs_pagecache_inval(ip, first, last);
-
-	xfs_iflags_clear(ip, XFS_ITRUNCATED);
-	ret = filemap_write_and_wait_range(mapping, first,
-				last == -1 ? LLONG_MAX : last);
-	if (!ret)
-		truncate_inode_pages_range(mapping, first, last);
-	return -ret;
-}
-
-int
-xfs_flush_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	uint64_t	flags,
-	int		fiopt)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-	int		ret = 0;
-	int		ret2;
-
-	xfs_iflags_clear(ip, XFS_ITRUNCATED);
-	ret = -filemap_fdatawrite_range(mapping, first,
-				last == -1 ? LLONG_MAX : last);
-	if (flags & XBF_ASYNC)
-		return ret;
-	ret2 = xfs_wait_on_pages(ip, first, last);
-	if (!ret)
-		ret = ret2;
-	return ret;
-}
-
-int
-xfs_wait_on_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-
-	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
-		return -filemap_fdatawait_range(mapping, first,
-			last == -1 ? XFS_ISIZE(ip) - 1 : last);
-	}
-	return 0;
-}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c25b094efbf7..94eaeedc5498 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -97,7 +97,9 @@ xfs_fs_geometry(
			(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
				XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
			(xfs_sb_version_hasattr2(&mp->m_sb) ?
-				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
+				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
+			(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
			mp->m_sb.sb_logsectsize : BBSIZE;
		geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -112,18 +114,40 @@ xfs_fs_geometry(
	return 0;
 }
 
+static struct xfs_buf *
+xfs_growfs_get_hdr_buf(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	int			flags,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;
+
+	bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+	if (!bp)
+		return NULL;
+
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	bp->b_bn = blkno;
+	bp->b_maps[0].bm_bn = blkno;
+	bp->b_ops = ops;
+
+	return bp;
+}
+
 static int
 xfs_growfs_data_private(
	xfs_mount_t		*mp,		/* mount point for filesystem */
	xfs_growfs_data_t	*in)		/* growfs data input struct */
 {
	xfs_agf_t		*agf;
+	struct xfs_agfl		*agfl;
	xfs_agi_t		*agi;
	xfs_agnumber_t		agno;
	xfs_extlen_t		agsize;
	xfs_extlen_t		tmpsize;
	xfs_alloc_rec_t		*arec;
-	struct xfs_btree_block	*block;
	xfs_buf_t		*bp;
	int			bucket;
	int			dpct;
@@ -146,9 +170,14 @@ xfs_growfs_data_private(
	dpct = pct - mp->m_sb.sb_imax_pct;
	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-				XFS_FSS_TO_BB(mp, 1), 0);
+				XFS_FSS_TO_BB(mp, 1), 0, NULL);
	if (!bp)
		return EIO;
+	if (bp->b_error) {
+		int	error = bp->b_error;
+		xfs_buf_relse(bp);
+		return error;
+	}
	xfs_buf_relse(bp);
 
	new = nb;	/* use new as a temporary here */
@@ -186,17 +215,18 @@ xfs_growfs_data_private(
	nfree = 0;
	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
		/*
-		 * AG freelist header block
+		 * AG freespace header block
		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			  XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			  XFS_FSS_TO_BB(mp, 1), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agf_buf_ops);
		if (!bp) {
			error = ENOMEM;
			goto error0;
		}
+
		agf = XFS_BUF_TO_AGF(bp);
-		memset(agf, 0, mp->m_sb.sb_sectsize);
		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
		agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
		agf->agf_seqno = cpu_to_be32(agno);
@@ -223,17 +253,39 @@ xfs_growfs_data_private(
			goto error0;
 
		/*
+		 * AG freelist header block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agfl_buf_ops);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
+
+		agfl = XFS_BUF_TO_AGFL(bp);
+		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+			agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
		 * AG inode header block
		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			  XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			  XFS_FSS_TO_BB(mp, 1), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agi_buf_ops);
		if (!bp) {
			error = ENOMEM;
			goto error0;
		}
+
		agi = XFS_BUF_TO_AGI(bp);
-		memset(agi, 0, mp->m_sb.sb_sectsize);
		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
		agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
		agi->agi_seqno = cpu_to_be32(agno);
@@ -254,24 +306,22 @@ xfs_growfs_data_private(
		/*
		 * BNO btree root block
		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_allocbt_buf_ops);
+
		if (!bp) {
			error = ENOMEM;
			goto error0;
		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = cpu_to_be16(1);
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+		xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
		arec->ar_blockcount = cpu_to_be32(
			agsize - be32_to_cpu(arec->ar_startblock));
+
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
		if (error)
@@ -280,25 +330,22 @@ xfs_growfs_data_private(
		/*
		 * CNT btree root block
		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_allocbt_buf_ops);
		if (!bp) {
			error = ENOMEM;
			goto error0;
		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = cpu_to_be16(1);
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+		xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
		arec->ar_blockcount = cpu_to_be32(
			agsize - be32_to_cpu(arec->ar_startblock));
		nfree += be32_to_cpu(arec->ar_blockcount);
+
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
		if (error)
@@ -307,20 +354,17 @@ xfs_growfs_data_private(
		/*
		 * INO btree root block
		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_inobt_buf_ops);
		if (!bp) {
			error = ENOMEM;
			goto error0;
		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = 0;
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+
+		xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
+
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
		if (error)
@@ -399,9 +443,28 @@ xfs_growfs_data_private(
 
	/* update secondary superblocks. */
	for (agno = 1; agno < nagcount; agno++) {
-		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+		error = 0;
+		/*
+		 * new secondary superblocks need to be zeroed, not read from
+		 * disk as the contents of the new area we are growing into is
+		 * completely unknown.
+		 */
+		if (agno < oagcount) {
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
-				  XFS_FSS_TO_BB(mp, 1), 0, &bp);
+				  XFS_FSS_TO_BB(mp, 1), 0, &bp,
+				  &xfs_sb_buf_ops);
+		} else {
+			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  XFS_FSS_TO_BB(mp, 1), 0);
+			if (bp) {
+				bp->b_ops = &xfs_sb_buf_ops;
+				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+			} else
+				error = ENOMEM;
+		}
+
		if (error) {
			xfs_warn(mp,
		"error %d reading secondary superblock for ag %d",
@@ -409,6 +472,7 @@ xfs_growfs_data_private(
			break;
		}
		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+
		/*
		 * If we get an error writing out the alternate superblocks,
		 * just issue a warning and continue.  The real work is
@@ -423,7 +487,7 @@ xfs_growfs_data_private(
			break;	/* no point in continuing */
		}
	}
-	return 0;
+	return error;
 
  error0:
	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
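xfs_growfs_get_hdr_buf() folds four copies of get-buffer-plus-memset into one helper: an uncached buffer (so no perag is attached, which is why the AGI verifier below tolerates a NULL b_pag), zeroed, with the matching verifier installed so each freshly built header is checked as it is written. A sketch of one use, following the AGF hunk above:

	bp = xfs_growfs_get_hdr_buf(mp,
			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &xfs_agf_buf_ops);
	if (!bp)
		return ENOMEM;		/* positive errno, XFS-internal convention */
	/* ... fill in the AGF fields; the buffer is already zeroed ... */
	error = xfs_bwrite(bp);		/* the write verifier runs here */
	xfs_buf_relse(bp);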
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b9..5399ef222dd7 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,7 +21,8 @@
 /*
  * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
  * other XFS code uses these values.  Times are measured in centisecs (i.e.
- * 100ths of a second).
+ * 100ths of a second) with the exception of eofb_timer, which is measured in
+ * seconds.
  */
 xfs_param_t xfs_params = {
			  /* MIN		DFLT		MAX */
@@ -40,4 +41,5 @@ xfs_param_t xfs_params = {
	.rotorstep	= {	1,		1,		255	},
	.inherit_nodfrg	= {	0,		1,		1	},
	.fstrm_timer	= {	1,		30*100,		3600*100},
+	.eofb_timer	= {	1,		300,		3600*24},
 };
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 445bf1aef31c..a815412eab80 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,7 +200,8 @@ xfs_ialloc_inode_init(
		 */
		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-					 mp->m_bsize * blks_per_cluster, 0);
+					 mp->m_bsize * blks_per_cluster,
+					 XBF_UNMAPPED);
		if (!fbuf)
			return ENOMEM;
		/*
@@ -210,6 +211,7 @@ xfs_ialloc_inode_init(
		 * to log a whole cluster of inodes instead of all the
		 * individual transactions causing a lot of log traffic.
		 */
+		fbuf->b_ops = &xfs_inode_buf_ops;
		xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
		for (i = 0; i < ninodes; i++) {
			int	ioffset = i << mp->m_sb.sb_inodelog;
@@ -250,6 +252,7 @@ xfs_ialloc_ag_alloc(
						/* boundary */
	struct xfs_perag *pag;
 
+	memset(&args, 0, sizeof(args));
	args.tp = tp;
	args.mp = tp->t_mountp;
 
@@ -876,9 +879,9 @@ error0:
  * This function is designed to be called twice if it has to do an allocation
  * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
  * If an inode is available without having to perform an allocation, an inode
- * number is returned.  In this case, *IO_agbp would be NULL.  If an allocation
- * needes to be done, xfs_dialloc would return the current AGI buffer in
- * *IO_agbp.  The caller should then commit the current transaction, allocate a
+ * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
  * new transaction, and call xfs_dialloc() again, passing in the previous value
  * of *IO_agbp.  IO_agbp should be held across the transactions.  Since the AGI
  * buffer is locked across the two calls, the second call is guaranteed to have
@@ -1471,6 +1474,57 @@ xfs_check_agi_unlinked(
 #define xfs_check_agi_unlinked(agi)
 #endif
 
+static void
+xfs_agi_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agi	*agi = XFS_BUF_TO_AGI(bp);
+	int		agi_ok;
+
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
+		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag)
+		agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
+						bp->b_pag->pag_agno;
+
+	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+			XFS_RANDOM_IALLOC_READ_AGI))) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+	xfs_check_agi_unlinked(agi);
+}
+
+static void
+xfs_agi_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agi_verify(bp);
+}
+
+static void
+xfs_agi_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agi_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+	.verify_read = xfs_agi_read_verify,
+	.verify_write = xfs_agi_write_verify,
+};
+
 /*
  * Read in the allocation group header (inode allocation section)
  */
@@ -1481,38 +1535,18 @@ xfs_read_agi(
	xfs_agnumber_t		agno,	/* allocation group number */
	struct xfs_buf		**bpp)	/* allocation group hdr buf */
 {
-	struct xfs_agi		*agi;	/* allocation group header */
-	int			agi_ok;	/* agi is consistent */
	int			error;
 
	ASSERT(agno != NULLAGNUMBER);
 
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, bpp);
+			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
	if (error)
		return error;
 
	ASSERT(!xfs_buf_geterror(*bpp));
-	agi = XFS_BUF_TO_AGI(*bpp);
-
-	/*
-	 * Validate the magic number of the agi block.
-	 */
-	agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
-		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
-		be32_to_cpu(agi->agi_seqno) == agno;
-	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
-			XFS_RANDOM_IALLOC_READ_AGI))) {
-		XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
-				     mp, agi);
-		xfs_trans_brelse(tp, *bpp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
-
	xfs_buf_set_ref(*bpp, XFS_AGI_REF);
-
-	xfs_check_agi_unlinked(agi);
	return 0;
 }
 
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 1fd6ea4e9c91..c8da3df271e6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
 /*
  * Get the data from the pointed-to record.
  */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
		xfs_inobt_rec_incore_t *rec, int *stat);
 
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 2b8b7a37aa18..bec344b36507 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -33,6 +33,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 STATIC int
@@ -181,6 +182,59 @@ xfs_inobt_key_diff(
			  cur->bc_rec.i.ir_startino;
 }
 
+void
+xfs_inobt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	unsigned int		level;
+	int			sblock_ok; /* block passes checks */
+
+	/* magic number and level verification */
+	level = be16_to_cpu(block->bb_level);
+	sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
+		    level < mp->m_in_maxlevels;
+
+	/* numrecs verification */
+	sblock_ok = sblock_ok &&
+		be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
+
+	/* sibling pointer verification */
+	sblock_ok = sblock_ok &&
+		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_leftsib &&
+		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_rightsib;
+
+	if (!sblock_ok) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+}
+
+static void
+xfs_inobt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inobt_verify(bp);
+}
+
+static void
+xfs_inobt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inobt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+	.verify_read = xfs_inobt_read_verify,
+	.verify_write = xfs_inobt_write_verify,
+};
+
 #ifdef DEBUG
 STATIC int
 xfs_inobt_keys_inorder(
@@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
	.key_diff		= xfs_inobt_key_diff,
+	.buf_ops		= &xfs_inobt_buf_ops,
 #ifdef DEBUG
	.keys_inorder		= xfs_inobt_keys_inorder,
	.recs_inorder		= xfs_inobt_recs_inorder,
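The sibling checks in xfs_inobt_verify() encode an invariant of short-form btree blocks: a sibling pointer is either NULLAGBLOCK or an in-range AG block number, and never zero, since block zero of an AG holds the superblock. The same predicate, factored into a helper for clarity (xfs_sibling_ok() is illustrative, not in the patch):

	static bool
	xfs_sibling_ok(
		struct xfs_mount	*mp,
		__be32			sib)
	{
		/* NULLAGBLOCK marks "no sibling"; 0 can never be a valid block */
		if (sib == cpu_to_be32(NULLAGBLOCK))
			return true;
		return sib != 0 &&
		       be32_to_cpu(sib) < mp->m_sb.sb_agblocks;
	}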
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index f782ad0c4769..25c0239a8eab 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
		struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c
index 9500caf15acf..96e344e3e927 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_icache.c
@@ -19,6 +19,7 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_log_priv.h"
22#include "xfs_inum.h" 23#include "xfs_inum.h"
23#include "xfs_trans.h" 24#include "xfs_trans.h"
24#include "xfs_trans_priv.h" 25#include "xfs_trans_priv.h"
@@ -35,11 +36,425 @@
35#include "xfs_quota.h" 36#include "xfs_quota.h"
36#include "xfs_trace.h" 37#include "xfs_trace.h"
37#include "xfs_fsops.h" 38#include "xfs_fsops.h"
39#include "xfs_icache.h"
38 40
39#include <linux/kthread.h> 41#include <linux/kthread.h>
40#include <linux/freezer.h> 42#include <linux/freezer.h>
41 43
42struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ 44STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
45 struct xfs_perag *pag, struct xfs_inode *ip);
46
47/*
48 * Allocate and initialise an xfs_inode.
49 */
50STATIC struct xfs_inode *
51xfs_inode_alloc(
52 struct xfs_mount *mp,
53 xfs_ino_t ino)
54{
55 struct xfs_inode *ip;
56
57 /*
58 * if this didn't occur in transactions, we could use
59 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
60 * code up to do this anyway.
61 */
62 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
63 if (!ip)
64 return NULL;
65 if (inode_init_always(mp->m_super, VFS_I(ip))) {
66 kmem_zone_free(xfs_inode_zone, ip);
67 return NULL;
68 }
69
70 ASSERT(atomic_read(&ip->i_pincount) == 0);
71 ASSERT(!spin_is_locked(&ip->i_flags_lock));
72 ASSERT(!xfs_isiflocked(ip));
73 ASSERT(ip->i_ino == 0);
74
75 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
76
77 /* initialise the xfs inode */
78 ip->i_ino = ino;
79 ip->i_mount = mp;
80 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
81 ip->i_afp = NULL;
82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83 ip->i_flags = 0;
84 ip->i_delayed_blks = 0;
85 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
86
87 return ip;
88}
89
90STATIC void
91xfs_inode_free_callback(
92 struct rcu_head *head)
93{
94 struct inode *inode = container_of(head, struct inode, i_rcu);
95 struct xfs_inode *ip = XFS_I(inode);
96
97 kmem_zone_free(xfs_inode_zone, ip);
98}
99
100STATIC void
101xfs_inode_free(
102 struct xfs_inode *ip)
103{
104 switch (ip->i_d.di_mode & S_IFMT) {
105 case S_IFREG:
106 case S_IFDIR:
107 case S_IFLNK:
108 xfs_idestroy_fork(ip, XFS_DATA_FORK);
109 break;
110 }
111
112 if (ip->i_afp)
113 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
114
115 if (ip->i_itemp) {
116 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
117 xfs_inode_item_destroy(ip);
118 ip->i_itemp = NULL;
119 }
120
121 /* asserts to verify all state is correct here */
122 ASSERT(atomic_read(&ip->i_pincount) == 0);
123 ASSERT(!spin_is_locked(&ip->i_flags_lock));
124 ASSERT(!xfs_isiflocked(ip));
125
126 /*
127 * Because we use RCU freeing we need to ensure the inode always
128 * appears to be reclaimed with an invalid inode number when in the
129 * free state. The ip->i_flags_lock provides the barrier against lookup
130 * races.
131 */
132 spin_lock(&ip->i_flags_lock);
133 ip->i_flags = XFS_IRECLAIM;
134 ip->i_ino = 0;
135 spin_unlock(&ip->i_flags_lock);
136
137 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138}
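The free path above is the classic RCU deferred-free idiom: invalidate the object's identity under its lock, then hand the memory to call_rcu() so that no reader can see it recycled within a grace period. A minimal kernel-style sketch of the idiom, with hypothetical my_obj names that are not part of this patch:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_obj {
	int			id;	/* 0 means "being freed" */
	spinlock_t		lock;
	struct rcu_head		rcu;
};

static void my_obj_free_rcu(struct rcu_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, rcu);

	kfree(obj);
}

static void my_obj_free(struct my_obj *obj)
{
	/*
	 * Clear the id under the lock before the grace period starts
	 * so that concurrent RCU readers can detect the free, exactly
	 * as xfs_inode_free() zeroes ip->i_ino above.
	 */
	spin_lock(&obj->lock);
	obj->id = 0;
	spin_unlock(&obj->lock);
	call_rcu(&obj->rcu, my_obj_free_rcu);
}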
139
140/*
141 * Check the validity of the inode we just found in the cache
142 */
143static int
144xfs_iget_cache_hit(
145 struct xfs_perag *pag,
146 struct xfs_inode *ip,
147 xfs_ino_t ino,
148 int flags,
149 int lock_flags) __releases(RCU)
150{
151 struct inode *inode = VFS_I(ip);
152 struct xfs_mount *mp = ip->i_mount;
153 int error;
154
155 /*
156 * check for re-use of an inode within an RCU grace period due to the
157 * radix tree nodes not being updated yet. We monitor for this by
158 * setting the inode number to zero before freeing the inode structure.
159 * If the inode has been reallocated and set up, then the inode number
160 * will not match, so check for that, too.
161 */
162 spin_lock(&ip->i_flags_lock);
163 if (ip->i_ino != ino) {
164 trace_xfs_iget_skip(ip);
165 XFS_STATS_INC(xs_ig_frecycle);
166 error = EAGAIN;
167 goto out_error;
168 }
169
170
171 /*
172 * If we are racing with another cache hit that is currently
173 * instantiating this inode or currently recycling it out of
174 * reclaimable state, wait for the initialisation to complete
175 * before continuing.
176 *
177 * XXX(hch): eventually we should do something equivalent to
178 * wait_on_inode to wait for these flags to be cleared
179 * instead of polling for it.
180 */
181 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
182 trace_xfs_iget_skip(ip);
183 XFS_STATS_INC(xs_ig_frecycle);
184 error = EAGAIN;
185 goto out_error;
186 }
187
188 /*
189 * If lookup is racing with unlink return an error immediately.
190 */
191 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
192 error = ENOENT;
193 goto out_error;
194 }
195
196 /*
197 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
198 * Need to carefully get it back into usable state.
199 */
200 if (ip->i_flags & XFS_IRECLAIMABLE) {
201 trace_xfs_iget_reclaim(ip);
202
203 /*
204 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
205 * from stomping over us while we recycle the inode. We can't
206 * clear the radix tree reclaimable tag yet as it requires
207 * pag_ici_lock to be held exclusive.
208 */
209 ip->i_flags |= XFS_IRECLAIM;
210
211 spin_unlock(&ip->i_flags_lock);
212 rcu_read_unlock();
213
214 error = -inode_init_always(mp->m_super, inode);
215 if (error) {
216 /*
217 * Re-initializing the inode failed, and we are in deep
218 * trouble. Try to re-add it to the reclaim list.
219 */
220 rcu_read_lock();
221 spin_lock(&ip->i_flags_lock);
222
223 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
224 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
225 trace_xfs_iget_reclaim_fail(ip);
226 goto out_error;
227 }
228
229 spin_lock(&pag->pag_ici_lock);
230 spin_lock(&ip->i_flags_lock);
231
232 /*
233 * Clear the per-lifetime state in the inode as we are now
234 * effectively a new inode and need to return to the initial
235 * state before reuse occurs.
236 */
237 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
238 ip->i_flags |= XFS_INEW;
239 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
240 inode->i_state = I_NEW;
241
242 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
243 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
244
245 spin_unlock(&ip->i_flags_lock);
246 spin_unlock(&pag->pag_ici_lock);
247 } else {
248 /* If the VFS inode is being torn down, pause and try again. */
249 if (!igrab(inode)) {
250 trace_xfs_iget_skip(ip);
251 error = EAGAIN;
252 goto out_error;
253 }
254
255 /* We've got a live one. */
256 spin_unlock(&ip->i_flags_lock);
257 rcu_read_unlock();
258 trace_xfs_iget_hit(ip);
259 }
260
261 if (lock_flags != 0)
262 xfs_ilock(ip, lock_flags);
263
264 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
265 XFS_STATS_INC(xs_ig_found);
266
267 return 0;
268
269out_error:
270 spin_unlock(&ip->i_flags_lock);
271 rcu_read_unlock();
272 return error;
273}
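xfs_iget_cache_hit() is the reader side of that idiom: the lookup runs under rcu_read_lock() and must revalidate the identity under the object's lock before trusting it. Continuing the hypothetical my_obj sketch from above:

#include <linux/radix-tree.h>

static struct my_obj *my_obj_lookup(struct radix_tree_root *root,
				    unsigned long index, int wanted_id)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = radix_tree_lookup(root, index);
	if (obj) {
		spin_lock(&obj->lock);
		if (obj->id != wanted_id)
			obj = NULL;	/* freed or reused in this grace period; retry */
		spin_unlock(&obj->lock);
	}
	rcu_read_unlock();
	return obj;
}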
274
275
276static int
277xfs_iget_cache_miss(
278 struct xfs_mount *mp,
279 struct xfs_perag *pag,
280 xfs_trans_t *tp,
281 xfs_ino_t ino,
282 struct xfs_inode **ipp,
283 int flags,
284 int lock_flags)
285{
286 struct xfs_inode *ip;
287 int error;
288 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
289 int iflags;
290
291 ip = xfs_inode_alloc(mp, ino);
292 if (!ip)
293 return ENOMEM;
294
295 error = xfs_iread(mp, tp, ip, flags);
296 if (error)
297 goto out_destroy;
298
299 trace_xfs_iget_miss(ip);
300
301 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
302 error = ENOENT;
303 goto out_destroy;
304 }
305
306 /*
307 * Preload the radix tree so we can insert safely under the
308 * write spinlock. Note that we cannot sleep inside the preload
309 * region. Since we can be called from transaction context, don't
310 * recurse into the file system.
311 */
312 if (radix_tree_preload(GFP_NOFS)) {
313 error = EAGAIN;
314 goto out_destroy;
315 }
316
317 /*
318 * Because the inode hasn't been added to the radix-tree yet it can't
319 * be found by another thread, so we can do the non-sleeping lock here.
320 */
321 if (lock_flags) {
322 if (!xfs_ilock_nowait(ip, lock_flags))
323 BUG();
324 }
325
326 /*
327 * These values must be set before inserting the inode into the radix
328 * tree as the moment it is inserted a concurrent lookup (allowed by the
329 * RCU locking mechanism) can find it and that lookup must see that this
330 * is an inode currently under construction (i.e. that XFS_INEW is set).
331 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
332 * memory barrier that ensures this detection works correctly at lookup
333 * time.
334 */
335 iflags = XFS_INEW;
336 if (flags & XFS_IGET_DONTCACHE)
337 iflags |= XFS_IDONTCACHE;
338 ip->i_udquot = ip->i_gdquot = NULL;
339 xfs_iflags_set(ip, iflags);
340
341 /* insert the new inode */
342 spin_lock(&pag->pag_ici_lock);
343 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
344 if (unlikely(error)) {
345 WARN_ON(error != -EEXIST);
346 XFS_STATS_INC(xs_ig_dup);
347 error = EAGAIN;
348 goto out_preload_end;
349 }
350 spin_unlock(&pag->pag_ici_lock);
351 radix_tree_preload_end();
352
353 *ipp = ip;
354 return 0;
355
356out_preload_end:
357 spin_unlock(&pag->pag_ici_lock);
358 radix_tree_preload_end();
359 if (lock_flags)
360 xfs_iunlock(ip, lock_flags);
361out_destroy:
362 __destroy_inode(VFS_I(ip));
363 xfs_inode_free(ip);
364 return error;
365}
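The preload pattern in the miss path generalises beyond inode caching: radix_tree_preload() reserves tree nodes up front with the given gfp mask, so the insert itself never allocates and is safe under a spinlock. A hedged sketch with hypothetical names:

static int my_cache_insert(struct radix_tree_root *root, spinlock_t *lock,
			   unsigned long index, void *item)
{
	int error;

	/* GFP_NOFS because this can run in transaction context */
	error = radix_tree_preload(GFP_NOFS);
	if (error)
		return error;

	spin_lock(lock);
	error = radix_tree_insert(root, index, item);
	spin_unlock(lock);
	radix_tree_preload_end();
	return error;
}

radix_tree_preload() leaves preemption disabled on success, which is why the matching radix_tree_preload_end() must follow with no sleep in between.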
366
367/*
368 * Look up an inode by number in the given file system.
369 * The inode is looked up in the cache held in each AG.
370 * If the inode is found in the cache, initialise the vfs inode
371 * if necessary.
372 *
373 * If it is not in core, read it in from the file system's device,
374 * add it to the cache and initialise the vfs inode.
375 *
376 * The inode is locked according to the value of the lock_flags parameter.
377 * This flag parameter indicates how and if the inode's IO lock and inode lock
378 * should be taken.
379 *
380 * mp -- the mount point structure for the current file system. It points
381 * to the inode hash table.
382 * tp -- a pointer to the current transaction if there is one. This is
383 * simply passed through to the xfs_iread() call.
384 * ino -- the number of the inode desired. This is the unique identifier
385 * within the file system for the inode being requested.
386 * lock_flags -- flags indicating how to lock the inode. See the comment
387 * for xfs_ilock() for a list of valid values.
388 */
389int
390xfs_iget(
391 xfs_mount_t *mp,
392 xfs_trans_t *tp,
393 xfs_ino_t ino,
394 uint flags,
395 uint lock_flags,
396 xfs_inode_t **ipp)
397{
398 xfs_inode_t *ip;
399 int error;
400 xfs_perag_t *pag;
401 xfs_agino_t agino;
402
403 /*
404 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
405 * doesn't get freed while it's being referenced during a
406 * radix tree traversal here. It assumes this function
407 * acquires only the ILOCK (and therefore it has no need to
408 * involve the IOLOCK in this synchronization).
409 */
410 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
411
412 /* reject inode numbers outside existing AGs */
413 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
414 return EINVAL;
415
416 /* get the perag structure and ensure that it's inode capable */
417 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
418 agino = XFS_INO_TO_AGINO(mp, ino);
419
420again:
421 error = 0;
422 rcu_read_lock();
423 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
424
425 if (ip) {
426 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
427 if (error)
428 goto out_error_or_again;
429 } else {
430 rcu_read_unlock();
431 XFS_STATS_INC(xs_ig_missed);
432
433 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
434 flags, lock_flags);
435 if (error)
436 goto out_error_or_again;
437 }
438 xfs_perag_put(pag);
439
440 *ipp = ip;
441
442 /*
443 * If we have a real type for an on-disk inode, we can set ops(&unlock)
444 * now. If it's a new inode being created, xfs_ialloc will handle it.
445 */
446 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
447 xfs_setup_inode(ip);
448 return 0;
449
450out_error_or_again:
451 if (error == EAGAIN) {
452 delay(1);
453 goto again;
454 }
455 xfs_perag_put(pag);
456 return error;
457}
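A typical caller of the relocated xfs_iget(), sketched for illustration with the surrounding error handling elided:

	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
	if (error)
		return error;
	/* ... read inode state under the shared ILOCK ... */
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	IRELE(ip);

Note the positive errno convention (EINVAL, EAGAIN, ENOMEM) used throughout this file, and that EAGAIN never escapes: the out_error_or_again label retries internally.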
43 458
44/* 459/*
45 * The inode lookup is done in batches to keep the amount of lock traffic and 460 * The inode lookup is done in batches to keep the amount of lock traffic and
@@ -101,8 +516,11 @@ xfs_inode_ag_walk(
101 struct xfs_mount *mp, 516 struct xfs_mount *mp,
102 struct xfs_perag *pag, 517 struct xfs_perag *pag,
103 int (*execute)(struct xfs_inode *ip, 518 int (*execute)(struct xfs_inode *ip,
104 struct xfs_perag *pag, int flags), 519 struct xfs_perag *pag, int flags,
105 int flags) 520 void *args),
521 int flags,
522 void *args,
523 int tag)
106{ 524{
107 uint32_t first_index; 525 uint32_t first_index;
108 int last_error = 0; 526 int last_error = 0;
@@ -121,9 +539,17 @@ restart:
121 int i; 539 int i;
122 540
123 rcu_read_lock(); 541 rcu_read_lock();
124 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 542
543 if (tag == -1)
544 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
125 (void **)batch, first_index, 545 (void **)batch, first_index,
126 XFS_LOOKUP_BATCH); 546 XFS_LOOKUP_BATCH);
547 else
548 nr_found = radix_tree_gang_lookup_tag(
549 &pag->pag_ici_root,
550 (void **) batch, first_index,
551 XFS_LOOKUP_BATCH, tag);
552
127 if (!nr_found) { 553 if (!nr_found) {
128 rcu_read_unlock(); 554 rcu_read_unlock();
129 break; 555 break;
@@ -164,7 +590,7 @@ restart:
164 for (i = 0; i < nr_found; i++) { 590 for (i = 0; i < nr_found; i++) {
165 if (!batch[i]) 591 if (!batch[i])
166 continue; 592 continue;
167 error = execute(batch[i], pag, flags); 593 error = execute(batch[i], pag, flags, args);
168 IRELE(batch[i]); 594 IRELE(batch[i]);
169 if (error == EAGAIN) { 595 if (error == EAGAIN) {
170 skipped++; 596 skipped++;
@@ -189,12 +615,40 @@ restart:
189 return last_error; 615 return last_error;
190} 616}
191 617
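With the walker now taking a void *args cookie and an optional radix tree tag, an execute callback has this shape; my_count_inode below is a hypothetical example, not part of the patch:

STATIC int
my_count_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags,
	void			*args)
{
	atomic_inc((atomic_t *)args);
	return 0;
}

	/* untagged walk over every cached inode */
	atomic_t	count = ATOMIC_INIT(0);

	error = xfs_inode_ag_iterator(mp, my_count_inode, 0, &count);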
618/*
619 * Background scanning to trim post-EOF preallocated space. This is queued
620 * based on the 'background_prealloc_discard_period' tunable (5m by default).
621 */
622STATIC void
623xfs_queue_eofblocks(
624 struct xfs_mount *mp)
625{
626 rcu_read_lock();
627 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
628 queue_delayed_work(mp->m_eofblocks_workqueue,
629 &mp->m_eofblocks_work,
630 msecs_to_jiffies(xfs_eofb_secs * 1000));
631 rcu_read_unlock();
632}
633
634void
635xfs_eofblocks_worker(
636 struct work_struct *work)
637{
638 struct xfs_mount *mp = container_of(to_delayed_work(work),
639 struct xfs_mount, m_eofblocks_work);
640 xfs_icache_free_eofblocks(mp, NULL);
641 xfs_queue_eofblocks(mp);
642}
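xfs_eofblocks_worker() is a self-rearming delayed work item: each pass re-queues itself, and xfs_queue_eofblocks() only does so while the perag tree still carries the EOFBLOCKS tag, so the work idles away when there is nothing to scan. The general shape, as a hypothetical standalone sketch:

static struct delayed_work my_scan_work;

static void my_do_scan(void);		/* hypothetical scan pass */
static bool my_more_work(void);		/* hypothetical re-arm check */

static void my_scan_worker(struct work_struct *work)
{
	my_do_scan();
	if (my_more_work())
		schedule_delayed_work(&my_scan_work,
				      msecs_to_jiffies(5 * 60 * 1000));
}

	/* setup */
	INIT_DELAYED_WORK(&my_scan_work, my_scan_worker);
	schedule_delayed_work(&my_scan_work, 0);

	/* teardown; also stops the self re-queue from racing */
	cancel_delayed_work_sync(&my_scan_work);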
643
192int 644int
193xfs_inode_ag_iterator( 645xfs_inode_ag_iterator(
194 struct xfs_mount *mp, 646 struct xfs_mount *mp,
195 int (*execute)(struct xfs_inode *ip, 647 int (*execute)(struct xfs_inode *ip,
196 struct xfs_perag *pag, int flags), 648 struct xfs_perag *pag, int flags,
197 int flags) 649 void *args),
650 int flags,
651 void *args)
198{ 652{
199 struct xfs_perag *pag; 653 struct xfs_perag *pag;
200 int error = 0; 654 int error = 0;
@@ -204,7 +658,7 @@ xfs_inode_ag_iterator(
204 ag = 0; 658 ag = 0;
205 while ((pag = xfs_perag_get(mp, ag))) { 659 while ((pag = xfs_perag_get(mp, ag))) {
206 ag = pag->pag_agno + 1; 660 ag = pag->pag_agno + 1;
207 error = xfs_inode_ag_walk(mp, pag, execute, flags); 661 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
208 xfs_perag_put(pag); 662 xfs_perag_put(pag);
209 if (error) { 663 if (error) {
210 last_error = error; 664 last_error = error;
@@ -215,224 +669,50 @@ xfs_inode_ag_iterator(
215 return XFS_ERROR(last_error); 669 return XFS_ERROR(last_error);
216} 670}
217 671
218STATIC int
219xfs_sync_inode_data(
220 struct xfs_inode *ip,
221 struct xfs_perag *pag,
222 int flags)
223{
224 struct inode *inode = VFS_I(ip);
225 struct address_space *mapping = inode->i_mapping;
226 int error = 0;
227
228 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
229 return 0;
230
231 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
232 if (flags & SYNC_TRYLOCK)
233 return 0;
234 xfs_ilock(ip, XFS_IOLOCK_SHARED);
235 }
236
237 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
238 0 : XBF_ASYNC, FI_NONE);
239 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
240 return error;
241}
242
243/*
244 * Write out pagecache data for the whole filesystem.
245 */
246STATIC int
247xfs_sync_data(
248 struct xfs_mount *mp,
249 int flags)
250{
251 int error;
252
253 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
254
255 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
256 if (error)
257 return XFS_ERROR(error);
258
259 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
260 return 0;
261}
262
263STATIC int
264xfs_sync_fsdata(
265 struct xfs_mount *mp)
266{
267 struct xfs_buf *bp;
268 int error;
269
270 /*
271 * If the buffer is pinned then push on the log so we won't get stuck
272 * waiting in the write for someone, maybe ourselves, to flush the log.
273 *
274 * Even though we just pushed the log above, we did not have the
275 * superblock buffer locked at that point so it can become pinned in
276 * between there and here.
277 */
278 bp = xfs_getsb(mp, 0);
279 if (xfs_buf_ispinned(bp))
280 xfs_log_force(mp, 0);
281 error = xfs_bwrite(bp);
282 xfs_buf_relse(bp);
283 return error;
284}
285
286/*
287 * When remounting a filesystem read-only or freezing the filesystem, we have
288 * two phases to execute. This first phase is syncing the data before we
289 * quiesce the filesystem, and the second is flushing all the inodes out after
290 * we've waited for all the transactions created by the first phase to
291 * complete. The second phase ensures that the inodes are written to their
292 * location on disk rather than just existing in transactions in the log. This
293 * means after a quiesce there is no log replay required to write the inodes to
294 * disk (this is the main difference between a sync and a quiesce).
295 */
296/*
297 * First stage of freeze - no writers will make progress now we are here,
298 * so we flush delwri and delalloc buffers here, then wait for all I/O to
299 * complete. Data is frozen at that point. Metadata is not frozen,
300 * transactions can still occur here so don't bother emptying the AIL
301 * because it'll just get dirty again.
302 */
303int 672int
304xfs_quiesce_data( 673xfs_inode_ag_iterator_tag(
305 struct xfs_mount *mp) 674 struct xfs_mount *mp,
306{ 675 int (*execute)(struct xfs_inode *ip,
307 int error, error2 = 0; 676 struct xfs_perag *pag, int flags,
308 677 void *args),
309 /* force out the log */ 678 int flags,
310 xfs_log_force(mp, XFS_LOG_SYNC); 679 void *args,
311 680 int tag)
312 /* write superblock and hoover up shutdown errors */
313 error = xfs_sync_fsdata(mp);
314
315 /* mark the log as covered if needed */
316 if (xfs_log_need_covered(mp))
317 error2 = xfs_fs_log_dummy(mp);
318
319 return error ? error : error2;
320}
321
322/*
323 * Second stage of a quiesce. The data is already synced, now we have to take
324 * care of the metadata. New transactions are already blocked, so we need to
325 * wait for any remaining transactions to drain out before proceeding.
326 */
327void
328xfs_quiesce_attr(
329 struct xfs_mount *mp)
330{
331 int error = 0;
332
333 /* wait for all modifications to complete */
334 while (atomic_read(&mp->m_active_trans) > 0)
335 delay(100);
336
337 /* reclaim inodes to do any IO before the freeze completes */
338 xfs_reclaim_inodes(mp, 0);
339 xfs_reclaim_inodes(mp, SYNC_WAIT);
340
341 /* flush all pending changes from the AIL */
342 xfs_ail_push_all_sync(mp->m_ail);
343
344 /*
345 * Just warn here till VFS can correctly support
346 * read-only remount without racing.
347 */
348 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
349
350 /* Push the superblock and write an unmount record */
351 error = xfs_log_sbcount(mp);
352 if (error)
353 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
354 "Frozen image may not be consistent.");
355 xfs_log_unmount_write(mp);
356
357 /*
358 * At this point we might have modified the superblock again and thus
359 * added an item to the AIL, so flush it again.
360 */
361 xfs_ail_push_all_sync(mp->m_ail);
362
363 /*
364 * The superblock buffer is uncached and xfsaild_push() will lock and
365 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
366 * here but a lock on the superblock buffer will block until iodone()
367 * has completed.
368 */
369 xfs_buf_lock(mp->m_sb_bp);
370 xfs_buf_unlock(mp->m_sb_bp);
371}
372
373static void
374xfs_syncd_queue_sync(
375 struct xfs_mount *mp)
376{
377 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
378 msecs_to_jiffies(xfs_syncd_centisecs * 10));
379}
380
381/*
382 * Every sync period we need to unpin all items, reclaim inodes and sync
383 * disk quotas. We might need to cover the log to indicate that the
384 * filesystem is idle and not frozen.
385 */
386STATIC void
387xfs_sync_worker(
388 struct work_struct *work)
389{ 681{
390 struct xfs_mount *mp = container_of(to_delayed_work(work), 682 struct xfs_perag *pag;
391 struct xfs_mount, m_sync_work); 683 int error = 0;
392 int error; 684 int last_error = 0;
393 685 xfs_agnumber_t ag;
394 /*
395 * We shouldn't write/force the log if we are in the mount/unmount
396 * process or on a read only filesystem. The workqueue still needs to be
397 * active in both cases, however, because it is used for inode reclaim
398 * during these times. Use the MS_ACTIVE flag to avoid doing anything
399 * during mount. Doing work during unmount is avoided by calling
400 * cancel_delayed_work_sync on this work queue before tearing down
401 * the ail and the log in xfs_log_unmount.
402 */
403 if (!(mp->m_super->s_flags & MS_ACTIVE) &&
404 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
405 /* dgc: errors ignored here */
406 if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
407 xfs_log_need_covered(mp))
408 error = xfs_fs_log_dummy(mp);
409 else
410 xfs_log_force(mp, 0);
411 686
412 /* start pushing all the metadata that is currently 687 ag = 0;
413 * dirty */ 688 while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
414 xfs_ail_push_all(mp->m_ail); 689 ag = pag->pag_agno + 1;
690 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
691 xfs_perag_put(pag);
692 if (error) {
693 last_error = error;
694 if (error == EFSCORRUPTED)
695 break;
696 }
415 } 697 }
416 698 return XFS_ERROR(last_error);
417 /* queue us up again */
418 xfs_syncd_queue_sync(mp);
419} 699}
420 700
421/* 701/*
422 * Queue a new inode reclaim pass if there are reclaimable inodes and there 702 * Queue a new inode reclaim pass if there are reclaimable inodes and there
423 * isn't a reclaim pass already in progress. By default it runs every 5s based 703 * isn't a reclaim pass already in progress. By default it runs every 5s based
424 * on the xfs syncd work default of 30s. Perhaps this should have its own 704 * on the xfs periodic sync default of 30s. Perhaps this should have its own
425 * tunable, but that can be done if this method proves to be ineffective or too 705 * tunable, but that can be done if this method proves to be ineffective or too
426 * aggressive. 706 * aggressive.
427 */ 707 */
428static void 708static void
429xfs_syncd_queue_reclaim( 709xfs_reclaim_work_queue(
430 struct xfs_mount *mp) 710 struct xfs_mount *mp)
431{ 711{
432 712
433 rcu_read_lock(); 713 rcu_read_lock();
434 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 714 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
435 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, 715 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
436 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 716 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
437 } 717 }
438 rcu_read_unlock(); 718 rcu_read_unlock();
@@ -445,7 +725,7 @@ xfs_syncd_queue_reclaim(
445 * goes low. It scans as quickly as possible avoiding locked inodes or those 725 * goes low. It scans as quickly as possible avoiding locked inodes or those
446 * already being flushed, and once done schedules a future pass. 726 * already being flushed, and once done schedules a future pass.
447 */ 727 */
448STATIC void 728void
449xfs_reclaim_worker( 729xfs_reclaim_worker(
450 struct work_struct *work) 730 struct work_struct *work)
451{ 731{
@@ -453,65 +733,10 @@ xfs_reclaim_worker(
453 struct xfs_mount, m_reclaim_work); 733 struct xfs_mount, m_reclaim_work);
454 734
455 xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 735 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
456 xfs_syncd_queue_reclaim(mp); 736 xfs_reclaim_work_queue(mp);
457} 737}
458 738
459/* 739static void
460 * Flush delayed allocate data, attempting to free up reserved space
461 * from existing allocations. At this point a new allocation attempt
462 * has failed with ENOSPC and we are in the process of scratching our
463 * heads, looking about for more room.
464 *
465 * Queue a new data flush if there isn't one already in progress and
466 * wait for completion of the flush. This means that we only ever have one
467 * inode flush in progress no matter how many ENOSPC events are occurring and
468 * so will prevent the system from bogging down due to every concurrent
469 * ENOSPC event scanning all the active inodes in the system for writeback.
470 */
471void
472xfs_flush_inodes(
473 struct xfs_inode *ip)
474{
475 struct xfs_mount *mp = ip->i_mount;
476
477 queue_work(xfs_syncd_wq, &mp->m_flush_work);
478 flush_work(&mp->m_flush_work);
479}
480
481STATIC void
482xfs_flush_worker(
483 struct work_struct *work)
484{
485 struct xfs_mount *mp = container_of(work,
486 struct xfs_mount, m_flush_work);
487
488 xfs_sync_data(mp, SYNC_TRYLOCK);
489 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
490}
491
492int
493xfs_syncd_init(
494 struct xfs_mount *mp)
495{
496 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
497 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
498 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
499
500 xfs_syncd_queue_sync(mp);
501
502 return 0;
503}
504
505void
506xfs_syncd_stop(
507 struct xfs_mount *mp)
508{
509 cancel_delayed_work_sync(&mp->m_sync_work);
510 cancel_delayed_work_sync(&mp->m_reclaim_work);
511 cancel_work_sync(&mp->m_flush_work);
512}
513
514void
515__xfs_inode_set_reclaim_tag( 740__xfs_inode_set_reclaim_tag(
516 struct xfs_perag *pag, 741 struct xfs_perag *pag,
517 struct xfs_inode *ip) 742 struct xfs_inode *ip)
@@ -529,7 +754,7 @@ __xfs_inode_set_reclaim_tag(
529 spin_unlock(&ip->i_mount->m_perag_lock); 754 spin_unlock(&ip->i_mount->m_perag_lock);
530 755
531 /* schedule periodic background inode reclaim */ 756 /* schedule periodic background inode reclaim */
532 xfs_syncd_queue_reclaim(ip->i_mount); 757 xfs_reclaim_work_queue(ip->i_mount);
533 758
534 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 759 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
535 -1, _RET_IP_); 760 -1, _RET_IP_);
@@ -577,7 +802,7 @@ __xfs_inode_clear_reclaim(
577 } 802 }
578} 803}
579 804
580void 805STATIC void
581__xfs_inode_clear_reclaim_tag( 806__xfs_inode_clear_reclaim_tag(
582 xfs_mount_t *mp, 807 xfs_mount_t *mp,
583 xfs_perag_t *pag, 808 xfs_perag_t *pag,
@@ -787,9 +1012,9 @@ out:
787 /* 1012 /*
788 * We could return EAGAIN here to make reclaim rescan the inode tree in 1013 * We could return EAGAIN here to make reclaim rescan the inode tree in
789 * a short while. However, this just burns CPU time scanning the tree 1014 * a short while. However, this just burns CPU time scanning the tree
790 * waiting for IO to complete and xfssyncd never goes back to the idle 1015 * waiting for IO to complete and the reclaim work never goes back to
791 * state. Instead, return 0 to let the next scheduled background reclaim 1016 * the idle state. Instead, return 0 to let the next scheduled
792 * attempt to reclaim the inode again. 1017 * background reclaim attempt to reclaim the inode again.
793 */ 1018 */
794 return 0; 1019 return 0;
795} 1020}
@@ -800,7 +1025,7 @@ out:
800 * then a shut down during filesystem unmount reclaim walk leak all the 1025 * then a shut down during filesystem unmount reclaim walk leak all the
801 * unreclaimed inodes. 1026 * unreclaimed inodes.
802 */ 1027 */
803int 1028STATIC int
804xfs_reclaim_inodes_ag( 1029xfs_reclaim_inodes_ag(
805 struct xfs_mount *mp, 1030 struct xfs_mount *mp,
806 int flags, 1031 int flags,
@@ -945,7 +1170,7 @@ xfs_reclaim_inodes_nr(
945 int nr_to_scan) 1170 int nr_to_scan)
946{ 1171{
947 /* kick background reclaimer and push the AIL */ 1172 /* kick background reclaimer and push the AIL */
948 xfs_syncd_queue_reclaim(mp); 1173 xfs_reclaim_work_queue(mp);
949 xfs_ail_push_all(mp->m_ail); 1174 xfs_ail_push_all(mp->m_ail);
950 1175
951 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 1176 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
@@ -971,3 +1196,146 @@ xfs_reclaim_inodes_count(
971 return reclaimable; 1196 return reclaimable;
972} 1197}
973 1198
1199STATIC int
1200xfs_inode_match_id(
1201 struct xfs_inode *ip,
1202 struct xfs_eofblocks *eofb)
1203{
1204 if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
1205 ip->i_d.di_uid != eofb->eof_uid)
1206 return 0;
1207
1208 if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
1209 ip->i_d.di_gid != eofb->eof_gid)
1210 return 0;
1211
1212 if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
1213 xfs_get_projid(ip) != eofb->eof_prid)
1214 return 0;
1215
1216 return 1;
1217}
1218
1219STATIC int
1220xfs_inode_free_eofblocks(
1221 struct xfs_inode *ip,
1222 struct xfs_perag *pag,
1223 int flags,
1224 void *args)
1225{
1226 int ret;
1227 struct xfs_eofblocks *eofb = args;
1228
1229 if (!xfs_can_free_eofblocks(ip, false)) {
1230 /* inode could be preallocated or append-only */
1231 trace_xfs_inode_free_eofblocks_invalid(ip);
1232 xfs_inode_clear_eofblocks_tag(ip);
1233 return 0;
1234 }
1235
1236 /*
1237 * If the mapping is dirty the operation can block and wait for some
1238 * time. Unless we are waiting, skip it.
1239 */
1240 if (!(flags & SYNC_WAIT) &&
1241 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1242 return 0;
1243
1244 if (eofb) {
1245 if (!xfs_inode_match_id(ip, eofb))
1246 return 0;
1247
1248 /* skip the inode if the file size is too small */
1249 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1250 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1251 return 0;
1252 }
1253
1254 ret = xfs_free_eofblocks(ip->i_mount, ip, true);
1255
1256 /* don't revisit the inode if we're not waiting */
1257 if (ret == EAGAIN && !(flags & SYNC_WAIT))
1258 ret = 0;
1259
1260 return ret;
1261}
1262
1263int
1264xfs_icache_free_eofblocks(
1265 struct xfs_mount *mp,
1266 struct xfs_eofblocks *eofb)
1267{
1268 int flags = SYNC_TRYLOCK;
1269
1270 if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
1271 flags = SYNC_WAIT;
1272
1273 return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
1274 eofb, XFS_ICI_EOFBLOCKS_TAG);
1275}
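As a usage sketch, a caller wanting a synchronous trim of one user's preallocations would fill in xfs_eofblocks like this (the uid variable is assumed):

	struct xfs_eofblocks	eofb = {
		.eof_flags	= XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_SYNC,
		.eof_uid	= uid,
	};

	error = xfs_icache_free_eofblocks(mp, &eofb);

XFS_EOF_FLAGS_SYNC switches the walk from SYNC_TRYLOCK to SYNC_WAIT, so inodes with dirty mappings are flushed rather than skipped.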
1276
1277void
1278xfs_inode_set_eofblocks_tag(
1279 xfs_inode_t *ip)
1280{
1281 struct xfs_mount *mp = ip->i_mount;
1282 struct xfs_perag *pag;
1283 int tagged;
1284
1285 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1286 spin_lock(&pag->pag_ici_lock);
1287 trace_xfs_inode_set_eofblocks_tag(ip);
1288
1289 tagged = radix_tree_tagged(&pag->pag_ici_root,
1290 XFS_ICI_EOFBLOCKS_TAG);
1291 radix_tree_tag_set(&pag->pag_ici_root,
1292 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1293 XFS_ICI_EOFBLOCKS_TAG);
1294 if (!tagged) {
1295 /* propagate the eofblocks tag up into the perag radix tree */
1296 spin_lock(&ip->i_mount->m_perag_lock);
1297 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
1298 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1299 XFS_ICI_EOFBLOCKS_TAG);
1300 spin_unlock(&ip->i_mount->m_perag_lock);
1301
1302 /* kick off background trimming */
1303 xfs_queue_eofblocks(ip->i_mount);
1304
1305 trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
1306 -1, _RET_IP_);
1307 }
1308
1309 spin_unlock(&pag->pag_ici_lock);
1310 xfs_perag_put(pag);
1311}
1312
1313void
1314xfs_inode_clear_eofblocks_tag(
1315 xfs_inode_t *ip)
1316{
1317 struct xfs_mount *mp = ip->i_mount;
1318 struct xfs_perag *pag;
1319
1320 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1321 spin_lock(&pag->pag_ici_lock);
1322 trace_xfs_inode_clear_eofblocks_tag(ip);
1323
1324 radix_tree_tag_clear(&pag->pag_ici_root,
1325 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1326 XFS_ICI_EOFBLOCKS_TAG);
1327 if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
1328 /* clear the eofblocks tag from the perag radix tree */
1329 spin_lock(&ip->i_mount->m_perag_lock);
1330 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
1331 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1332 XFS_ICI_EOFBLOCKS_TAG);
1333 spin_unlock(&ip->i_mount->m_perag_lock);
1334 trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
1335 -1, _RET_IP_);
1336 }
1337
1338 spin_unlock(&pag->pag_ici_lock);
1339 xfs_perag_put(pag);
1340}
1341
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h
index 941202e7ac6e..e0f138c70a2f 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_icache.h
@@ -24,28 +24,30 @@ struct xfs_perag;
24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
26 26
27extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ 27int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
28 uint flags, uint lock_flags, xfs_inode_t **ipp);
28 29
29int xfs_syncd_init(struct xfs_mount *mp); 30void xfs_reclaim_worker(struct work_struct *work);
30void xfs_syncd_stop(struct xfs_mount *mp);
31
32int xfs_quiesce_data(struct xfs_mount *mp);
33void xfs_quiesce_attr(struct xfs_mount *mp);
34
35void xfs_flush_inodes(struct xfs_inode *ip);
36 31
37int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 32int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38int xfs_reclaim_inodes_count(struct xfs_mount *mp); 33int xfs_reclaim_inodes_count(struct xfs_mount *mp);
39void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 34void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
40 35
41void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 36void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
42void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); 37
43void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 38void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
44 struct xfs_inode *ip); 39void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
41void xfs_eofblocks_worker(struct work_struct *);
45 42
46int xfs_sync_inode_grab(struct xfs_inode *ip); 43int xfs_sync_inode_grab(struct xfs_inode *ip);
47int xfs_inode_ag_iterator(struct xfs_mount *mp, 44int xfs_inode_ag_iterator(struct xfs_mount *mp,
48 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 45 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
49 int flags); 46 int flags, void *args),
47 int flags, void *args);
48int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
49 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
50 int flags, void *args),
51 int flags, void *args, int tag);
50 52
51#endif 53#endif
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index 784a803383ec..000000000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,705 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_acl.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_alloc_btree.h"
30#include "xfs_ialloc_btree.h"
31#include "xfs_dinode.h"
32#include "xfs_inode.h"
33#include "xfs_btree.h"
34#include "xfs_ialloc.h"
35#include "xfs_quota.h"
36#include "xfs_utils.h"
37#include "xfs_trans_priv.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
40#include "xfs_trace.h"
41
42
43/*
44 * Allocate and initialise an xfs_inode.
45 */
46STATIC struct xfs_inode *
47xfs_inode_alloc(
48 struct xfs_mount *mp,
49 xfs_ino_t ino)
50{
51 struct xfs_inode *ip;
52
53 /*
54 * If this didn't occur in transaction context, we could use
55 * KM_MAYFAIL and return NULL here on ENOMEM. The code is set
56 * up to do this anyway.
57 */
58 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
59 if (!ip)
60 return NULL;
61 if (inode_init_always(mp->m_super, VFS_I(ip))) {
62 kmem_zone_free(xfs_inode_zone, ip);
63 return NULL;
64 }
65
66 ASSERT(atomic_read(&ip->i_pincount) == 0);
67 ASSERT(!spin_is_locked(&ip->i_flags_lock));
68 ASSERT(!xfs_isiflocked(ip));
69 ASSERT(ip->i_ino == 0);
70
71 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
72
73 /* initialise the xfs inode */
74 ip->i_ino = ino;
75 ip->i_mount = mp;
76 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
77 ip->i_afp = NULL;
78 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
79 ip->i_flags = 0;
80 ip->i_delayed_blks = 0;
81 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
82
83 return ip;
84}
85
86STATIC void
87xfs_inode_free_callback(
88 struct rcu_head *head)
89{
90 struct inode *inode = container_of(head, struct inode, i_rcu);
91 struct xfs_inode *ip = XFS_I(inode);
92
93 kmem_zone_free(xfs_inode_zone, ip);
94}
95
96void
97xfs_inode_free(
98 struct xfs_inode *ip)
99{
100 switch (ip->i_d.di_mode & S_IFMT) {
101 case S_IFREG:
102 case S_IFDIR:
103 case S_IFLNK:
104 xfs_idestroy_fork(ip, XFS_DATA_FORK);
105 break;
106 }
107
108 if (ip->i_afp)
109 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
110
111 if (ip->i_itemp) {
112 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
113 xfs_inode_item_destroy(ip);
114 ip->i_itemp = NULL;
115 }
116
117 /* asserts to verify all state is correct here */
118 ASSERT(atomic_read(&ip->i_pincount) == 0);
119 ASSERT(!spin_is_locked(&ip->i_flags_lock));
120 ASSERT(!xfs_isiflocked(ip));
121
122 /*
123 * Because we use RCU freeing we need to ensure the inode always
124 * appears to be reclaimed with an invalid inode number when in the
125 * free state. The ip->i_flags_lock provides the barrier against lookup
126 * races.
127 */
128 spin_lock(&ip->i_flags_lock);
129 ip->i_flags = XFS_IRECLAIM;
130 ip->i_ino = 0;
131 spin_unlock(&ip->i_flags_lock);
132
133 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
134}
135
136/*
137 * Check the validity of the inode we just found in the cache
138 */
139static int
140xfs_iget_cache_hit(
141 struct xfs_perag *pag,
142 struct xfs_inode *ip,
143 xfs_ino_t ino,
144 int flags,
145 int lock_flags) __releases(RCU)
146{
147 struct inode *inode = VFS_I(ip);
148 struct xfs_mount *mp = ip->i_mount;
149 int error;
150
151 /*
152 * check for re-use of an inode within an RCU grace period due to the
153 * radix tree nodes not being updated yet. We monitor for this by
154 * setting the inode number to zero before freeing the inode structure.
155 * If the inode has been reallocated and set up, then the inode number
156 * will not match, so check for that, too.
157 */
158 spin_lock(&ip->i_flags_lock);
159 if (ip->i_ino != ino) {
160 trace_xfs_iget_skip(ip);
161 XFS_STATS_INC(xs_ig_frecycle);
162 error = EAGAIN;
163 goto out_error;
164 }
165
166
167 /*
168 * If we are racing with another cache hit that is currently
169 * instantiating this inode or currently recycling it out of
170 * reclaimable state, wait for the initialisation to complete
171 * before continuing.
172 *
173 * XXX(hch): eventually we should do something equivalent to
174 * wait_on_inode to wait for these flags to be cleared
175 * instead of polling for it.
176 */
177 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
178 trace_xfs_iget_skip(ip);
179 XFS_STATS_INC(xs_ig_frecycle);
180 error = EAGAIN;
181 goto out_error;
182 }
183
184 /*
185 * If lookup is racing with unlink return an error immediately.
186 */
187 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
188 error = ENOENT;
189 goto out_error;
190 }
191
192 /*
193 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
194 * Need to carefully get it back into usable state.
195 */
196 if (ip->i_flags & XFS_IRECLAIMABLE) {
197 trace_xfs_iget_reclaim(ip);
198
199 /*
200 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
201 * from stomping over us while we recycle the inode. We can't
202 * clear the radix tree reclaimable tag yet as it requires
203 * pag_ici_lock to be held exclusive.
204 */
205 ip->i_flags |= XFS_IRECLAIM;
206
207 spin_unlock(&ip->i_flags_lock);
208 rcu_read_unlock();
209
210 error = -inode_init_always(mp->m_super, inode);
211 if (error) {
212 /*
213 * Re-initializing the inode failed, and we are in deep
214 * trouble. Try to re-add it to the reclaim list.
215 */
216 rcu_read_lock();
217 spin_lock(&ip->i_flags_lock);
218
219 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
220 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
221 trace_xfs_iget_reclaim_fail(ip);
222 goto out_error;
223 }
224
225 spin_lock(&pag->pag_ici_lock);
226 spin_lock(&ip->i_flags_lock);
227
228 /*
229 * Clear the per-lifetime state in the inode as we are now
230 * effectively a new inode and need to return to the initial
231 * state before reuse occurs.
232 */
233 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
234 ip->i_flags |= XFS_INEW;
235 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
236 inode->i_state = I_NEW;
237
238 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
239 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
240
241 spin_unlock(&ip->i_flags_lock);
242 spin_unlock(&pag->pag_ici_lock);
243 } else {
244 /* If the VFS inode is being torn down, pause and try again. */
245 if (!igrab(inode)) {
246 trace_xfs_iget_skip(ip);
247 error = EAGAIN;
248 goto out_error;
249 }
250
251 /* We've got a live one. */
252 spin_unlock(&ip->i_flags_lock);
253 rcu_read_unlock();
254 trace_xfs_iget_hit(ip);
255 }
256
257 if (lock_flags != 0)
258 xfs_ilock(ip, lock_flags);
259
260 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
261 XFS_STATS_INC(xs_ig_found);
262
263 return 0;
264
265out_error:
266 spin_unlock(&ip->i_flags_lock);
267 rcu_read_unlock();
268 return error;
269}
270
271
272static int
273xfs_iget_cache_miss(
274 struct xfs_mount *mp,
275 struct xfs_perag *pag,
276 xfs_trans_t *tp,
277 xfs_ino_t ino,
278 struct xfs_inode **ipp,
279 int flags,
280 int lock_flags)
281{
282 struct xfs_inode *ip;
283 int error;
284 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
285 int iflags;
286
287 ip = xfs_inode_alloc(mp, ino);
288 if (!ip)
289 return ENOMEM;
290
291 error = xfs_iread(mp, tp, ip, flags);
292 if (error)
293 goto out_destroy;
294
295 trace_xfs_iget_miss(ip);
296
297 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
298 error = ENOENT;
299 goto out_destroy;
300 }
301
302 /*
303 * Preload the radix tree so we can insert safely under the
304 * write spinlock. Note that we cannot sleep inside the preload
305 * region. Since we can be called from transaction context, don't
306 * recurse into the file system.
307 */
308 if (radix_tree_preload(GFP_NOFS)) {
309 error = EAGAIN;
310 goto out_destroy;
311 }
312
313 /*
314 * Because the inode hasn't been added to the radix-tree yet it can't
315 * be found by another thread, so we can do the non-sleeping lock here.
316 */
317 if (lock_flags) {
318 if (!xfs_ilock_nowait(ip, lock_flags))
319 BUG();
320 }
321
322 /*
323 * These values must be set before inserting the inode into the radix
324 * tree as the moment it is inserted a concurrent lookup (allowed by the
325 * RCU locking mechanism) can find it and that lookup must see that this
326 * is an inode currently under construction (i.e. that XFS_INEW is set).
327 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
328 * memory barrier that ensures this detection works correctly at lookup
329 * time.
330 */
331 iflags = XFS_INEW;
332 if (flags & XFS_IGET_DONTCACHE)
333 iflags |= XFS_IDONTCACHE;
334 ip->i_udquot = ip->i_gdquot = NULL;
335 xfs_iflags_set(ip, iflags);
336
337 /* insert the new inode */
338 spin_lock(&pag->pag_ici_lock);
339 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
340 if (unlikely(error)) {
341 WARN_ON(error != -EEXIST);
342 XFS_STATS_INC(xs_ig_dup);
343 error = EAGAIN;
344 goto out_preload_end;
345 }
346 spin_unlock(&pag->pag_ici_lock);
347 radix_tree_preload_end();
348
349 *ipp = ip;
350 return 0;
351
352out_preload_end:
353 spin_unlock(&pag->pag_ici_lock);
354 radix_tree_preload_end();
355 if (lock_flags)
356 xfs_iunlock(ip, lock_flags);
357out_destroy:
358 __destroy_inode(VFS_I(ip));
359 xfs_inode_free(ip);
360 return error;
361}
362
363/*
364 * Look up an inode by number in the given file system.
365 * The inode is looked up in the cache held in each AG.
366 * If the inode is found in the cache, initialise the vfs inode
367 * if necessary.
368 *
369 * If it is not in core, read it in from the file system's device,
370 * add it to the cache and initialise the vfs inode.
371 *
372 * The inode is locked according to the value of the lock_flags parameter.
373 * This flag parameter indicates how and if the inode's IO lock and inode lock
374 * should be taken.
375 *
376 * mp -- the mount point structure for the current file system. It points
377 * to the inode hash table.
378 * tp -- a pointer to the current transaction if there is one. This is
379 * simply passed through to the xfs_iread() call.
380 * ino -- the number of the inode desired. This is the unique identifier
381 * within the file system for the inode being requested.
382 * lock_flags -- flags indicating how to lock the inode. See the comment
383 * for xfs_ilock() for a list of valid values.
384 */
385int
386xfs_iget(
387 xfs_mount_t *mp,
388 xfs_trans_t *tp,
389 xfs_ino_t ino,
390 uint flags,
391 uint lock_flags,
392 xfs_inode_t **ipp)
393{
394 xfs_inode_t *ip;
395 int error;
396 xfs_perag_t *pag;
397 xfs_agino_t agino;
398
399 /*
400 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
401 * doesn't get freed while it's being referenced during a
402 * radix tree traversal here. It assumes this function
403 * acquires only the ILOCK (and therefore it has no need to
404 * involve the IOLOCK in this synchronization).
405 */
406 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
407
408 /* reject inode numbers outside existing AGs */
409 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
410 return EINVAL;
411
412 /* get the perag structure and ensure that it's inode capable */
413 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
414 agino = XFS_INO_TO_AGINO(mp, ino);
415
416again:
417 error = 0;
418 rcu_read_lock();
419 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
420
421 if (ip) {
422 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
423 if (error)
424 goto out_error_or_again;
425 } else {
426 rcu_read_unlock();
427 XFS_STATS_INC(xs_ig_missed);
428
429 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
430 flags, lock_flags);
431 if (error)
432 goto out_error_or_again;
433 }
434 xfs_perag_put(pag);
435
436 *ipp = ip;
437
438 /*
439 * If we have a real type for an on-disk inode, we can set ops(&unlock)
440 * now. If it's a new inode being created, xfs_ialloc will handle it.
441 */
442 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
443 xfs_setup_inode(ip);
444 return 0;
445
446out_error_or_again:
447 if (error == EAGAIN) {
448 delay(1);
449 goto again;
450 }
451 xfs_perag_put(pag);
452 return error;
453}
454
455/*
456 * This is a wrapper routine around the xfs_ilock() routine
457 * used to centralize some grungy code. It is used in places
458 * that wish to lock the inode solely for reading the extents.
459 * The reason these places can't just call xfs_ilock(SHARED)
460 * is that the inode lock also guards the bringing in of the
461 * extents from disk for a file in b-tree format. If the inode
462 * is in b-tree format, then we need to lock the inode exclusively
463 * until the extents are read in. Locking it exclusively all
464 * the time would limit our parallelism unnecessarily, though.
465 * What we do instead is check to see if the extents have been
466 * read in yet, and only lock the inode exclusively if they
467 * have not.
468 *
469 * The function returns a value which should be given to the
470 * corresponding xfs_iunlock_map_shared(). This value is
471 * the mode in which the lock was actually taken.
472 */
473uint
474xfs_ilock_map_shared(
475 xfs_inode_t *ip)
476{
477 uint lock_mode;
478
479 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
480 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
481 lock_mode = XFS_ILOCK_EXCL;
482 } else {
483 lock_mode = XFS_ILOCK_SHARED;
484 }
485
486 xfs_ilock(ip, lock_mode);
487
488 return lock_mode;
489}
490
491/*
492 * This is simply the unlock routine to go with xfs_ilock_map_shared().
493 * All it does is call xfs_iunlock() with the given lock_mode.
494 */
495void
496xfs_iunlock_map_shared(
497 xfs_inode_t *ip,
498 unsigned int lock_mode)
499{
500 xfs_iunlock(ip, lock_mode);
501}
502
503/*
504 * The xfs inode contains 2 locks: a multi-reader lock called the
505 * i_iolock and a multi-reader lock called the i_lock. This routine
506 * allows either or both of the locks to be obtained.
507 *
508 * The 2 locks should always be ordered so that the IO lock is
509 * obtained first in order to prevent deadlock.
510 *
511 * ip -- the inode being locked
512 * lock_flags -- this parameter indicates the inode's locks
513 * to be locked. It can be:
514 * XFS_IOLOCK_SHARED,
515 * XFS_IOLOCK_EXCL,
516 * XFS_ILOCK_SHARED,
517 * XFS_ILOCK_EXCL,
518 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
519 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
520 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
521 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
522 */
523void
524xfs_ilock(
525 xfs_inode_t *ip,
526 uint lock_flags)
527{
528 /*
529 * You can't set both SHARED and EXCL for the same lock,
530 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
531 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
532 */
533 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
534 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
535 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
536 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
537 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
538
539 if (lock_flags & XFS_IOLOCK_EXCL)
540 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
541 else if (lock_flags & XFS_IOLOCK_SHARED)
542 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
543
544 if (lock_flags & XFS_ILOCK_EXCL)
545 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
546 else if (lock_flags & XFS_ILOCK_SHARED)
547 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
548
549 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
550}
551
552/*
553 * This is just like xfs_ilock(), except that the caller
554 * is guaranteed not to sleep. It returns 1 if it gets
555 * the requested locks and 0 otherwise. If the IO lock is
556 * obtained but the inode lock cannot be, then the IO lock
557 * is dropped before returning.
558 *
559 * ip -- the inode being locked
560 * lock_flags -- this parameter indicates the inode's locks
561 * to be locked. See the comment for xfs_ilock() for a list
562 * of valid values.
563 */
564int
565xfs_ilock_nowait(
566 xfs_inode_t *ip,
567 uint lock_flags)
568{
569 /*
570 * You can't set both SHARED and EXCL for the same lock,
571 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
572 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
573 */
574 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
575 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
576 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
577 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
578 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
579
580 if (lock_flags & XFS_IOLOCK_EXCL) {
581 if (!mrtryupdate(&ip->i_iolock))
582 goto out;
583 } else if (lock_flags & XFS_IOLOCK_SHARED) {
584 if (!mrtryaccess(&ip->i_iolock))
585 goto out;
586 }
587 if (lock_flags & XFS_ILOCK_EXCL) {
588 if (!mrtryupdate(&ip->i_lock))
589 goto out_undo_iolock;
590 } else if (lock_flags & XFS_ILOCK_SHARED) {
591 if (!mrtryaccess(&ip->i_lock))
592 goto out_undo_iolock;
593 }
594 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
595 return 1;
596
597 out_undo_iolock:
598 if (lock_flags & XFS_IOLOCK_EXCL)
599 mrunlock_excl(&ip->i_iolock);
600 else if (lock_flags & XFS_IOLOCK_SHARED)
601 mrunlock_shared(&ip->i_iolock);
602 out:
603 return 0;
604}
605
606/*
607 * xfs_iunlock() is used to drop the inode locks acquired with
608 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
609 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
610 * that we know which locks to drop.
611 *
612 * ip -- the inode being unlocked
613 * lock_flags -- this parameter indicates the inode's locks
614 * to be unlocked. See the comment for xfs_ilock() for a list
615 * of valid values for this parameter.
616 *
617 */
618void
619xfs_iunlock(
620 xfs_inode_t *ip,
621 uint lock_flags)
622{
623 /*
624 * You can't set both SHARED and EXCL for the same lock,
625 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
626 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
627 */
628 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
629 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
630 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
631 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
632 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
633 ASSERT(lock_flags != 0);
634
635 if (lock_flags & XFS_IOLOCK_EXCL)
636 mrunlock_excl(&ip->i_iolock);
637 else if (lock_flags & XFS_IOLOCK_SHARED)
638 mrunlock_shared(&ip->i_iolock);
639
640 if (lock_flags & XFS_ILOCK_EXCL)
641 mrunlock_excl(&ip->i_lock);
642 else if (lock_flags & XFS_ILOCK_SHARED)
643 mrunlock_shared(&ip->i_lock);
644
645 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
646}
647
648/*
649 * give up write locks. the i/o lock cannot be held nested
650 * if it is being demoted.
651 */
652void
653xfs_ilock_demote(
654 xfs_inode_t *ip,
655 uint lock_flags)
656{
657 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
658 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
659
660 if (lock_flags & XFS_ILOCK_EXCL)
661 mrdemote(&ip->i_lock);
662 if (lock_flags & XFS_IOLOCK_EXCL)
663 mrdemote(&ip->i_iolock);
664
665 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
666}
667
668#ifdef DEBUG
669int
670xfs_isilocked(
671 xfs_inode_t *ip,
672 uint lock_flags)
673{
674 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
675 if (!(lock_flags & XFS_ILOCK_SHARED))
676 return !!ip->i_lock.mr_writer;
677 return rwsem_is_locked(&ip->i_lock.mr_lock);
678 }
679
680 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
681 if (!(lock_flags & XFS_IOLOCK_SHARED))
682 return !!ip->i_iolock.mr_writer;
683 return rwsem_is_locked(&ip->i_iolock.mr_lock);
684 }
685
686 ASSERT(0);
687 return 0;
688}
689#endif
690
691void
692__xfs_iflock(
693 struct xfs_inode *ip)
694{
695 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
696 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
697
698 do {
699 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
700 if (xfs_isiflocked(ip))
701 io_schedule();
702 } while (!xfs_iflock_nowait(ip));
703
704 finish_wait(wq, &wait.wait);
705}
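__xfs_iflock(), shown here in its old location, is the standard open-coded wait-on-bit loop: prepare an exclusive waiter, sleep while the bit is set, and loop until the trylock succeeds. Generalised as a hedged sketch with hypothetical names:

static void my_lock_bit(unsigned long *word, int bit)
{
	wait_queue_head_t *wq = bit_waitqueue(word, bit);
	DEFINE_WAIT_BIT(wait, word, bit);

	do {
		prepare_to_wait_exclusive(wq, &wait.wait,
					  TASK_UNINTERRUPTIBLE);
		if (test_bit(bit, word))
			io_schedule();
	} while (test_and_set_bit(bit, word));	/* old value 1: still locked */
	finish_wait(wq, &wait.wait);
}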
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2778258fcfa2..66282dcb821b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
45#include "xfs_filestream.h" 45#include "xfs_filestream.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_trace.h" 47#include "xfs_trace.h"
48#include "xfs_icache.h"
48 49
49kmem_zone_t *xfs_ifork_zone; 50kmem_zone_t *xfs_ifork_zone;
50kmem_zone_t *xfs_inode_zone; 51kmem_zone_t *xfs_inode_zone;
@@ -74,6 +75,256 @@ xfs_get_extsz_hint(
74 return 0; 75 return 0;
75} 76}
76 77
78/*
79 * This is a wrapper routine around the xfs_ilock() routine used to centralize
80 * some grungy code. It is used in places that wish to lock the inode solely
81 * for reading the extents. The reason these places can't just call
82 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
83 * extents from disk for a file in b-tree format. If the inode is in b-tree
84 * format, then we need to lock the inode exclusively until the extents are read
85 * in. Locking it exclusively all the time would limit our parallelism
86 * unnecessarily, though. What we do instead is check to see if the extents
87 * have been read in yet, and only lock the inode exclusively if they have not.
88 *
89 * The function returns a value which should be given to the corresponding
90 * xfs_iunlock_map_shared(). This value is the mode in which the lock was
91 * actually taken.
92 */
93uint
94xfs_ilock_map_shared(
95 xfs_inode_t *ip)
96{
97 uint lock_mode;
98
99 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
100 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
101 lock_mode = XFS_ILOCK_EXCL;
102 } else {
103 lock_mode = XFS_ILOCK_SHARED;
104 }
105
106 xfs_ilock(ip, lock_mode);
107
108 return lock_mode;
109}
110
111/*
112 * This is simply the unlock routine to go with xfs_ilock_map_shared().
113 * All it does is call xfs_iunlock() with the given lock_mode.
114 */
115void
116xfs_iunlock_map_shared(
117 xfs_inode_t *ip,
118 unsigned int lock_mode)
119{
120 xfs_iunlock(ip, lock_mode);
121}
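Callers must treat these two as a pair, feeding the returned mode back to the unlock; a minimal usage sketch:

	uint	lock_mode;

	lock_mode = xfs_ilock_map_shared(ip);
	/* ... walk the in-core extent list ... */
	xfs_iunlock_map_shared(ip, lock_mode);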
122
123/*
124 * The xfs inode contains 2 locks: a multi-reader lock called the
125 * i_iolock and a multi-reader lock called the i_lock. This routine
126 * allows either or both of the locks to be obtained.
127 *
128 * The 2 locks should always be ordered so that the IO lock is
129 * obtained first in order to prevent deadlock.
130 *
131 * ip -- the inode being locked
132 * lock_flags -- this parameter indicates the inode's locks
133 * to be locked. It can be:
134 * XFS_IOLOCK_SHARED,
135 * XFS_IOLOCK_EXCL,
136 * XFS_ILOCK_SHARED,
137 * XFS_ILOCK_EXCL,
138 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
139 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
140 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
141 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
142 */
143void
144xfs_ilock(
145 xfs_inode_t *ip,
146 uint lock_flags)
147{
148 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
149
150 /*
151 * You can't set both SHARED and EXCL for the same lock,
152 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
153 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
154 */
155 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
156 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
157 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
158 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
159 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
160
161 if (lock_flags & XFS_IOLOCK_EXCL)
162 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
163 else if (lock_flags & XFS_IOLOCK_SHARED)
164 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
165
166 if (lock_flags & XFS_ILOCK_EXCL)
167 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
168 else if (lock_flags & XFS_ILOCK_SHARED)
169 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
170}
171
172/*
173 * This is just like xfs_ilock(), except that the caller
174 * is guaranteed not to sleep. It returns 1 if it gets
175 * the requested locks and 0 otherwise. If the IO lock is
176 * obtained but the inode lock cannot be, then the IO lock
177 * is dropped before returning.
178 *
179 * ip -- the inode being locked
180 * lock_flags -- this parameter indicates the inode's locks to be
181 * locked.  See the comment for xfs_ilock() for a list
182 * of valid values.
183 */
184int
185xfs_ilock_nowait(
186 xfs_inode_t *ip,
187 uint lock_flags)
188{
189 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
190
191 /*
192 * You can't set both SHARED and EXCL for the same lock,
193 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
194 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
195 */
196 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
197 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
198 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
199 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
200 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
201
202 if (lock_flags & XFS_IOLOCK_EXCL) {
203 if (!mrtryupdate(&ip->i_iolock))
204 goto out;
205 } else if (lock_flags & XFS_IOLOCK_SHARED) {
206 if (!mrtryaccess(&ip->i_iolock))
207 goto out;
208 }
209 if (lock_flags & XFS_ILOCK_EXCL) {
210 if (!mrtryupdate(&ip->i_lock))
211 goto out_undo_iolock;
212 } else if (lock_flags & XFS_ILOCK_SHARED) {
213 if (!mrtryaccess(&ip->i_lock))
214 goto out_undo_iolock;
215 }
216 return 1;
217
218 out_undo_iolock:
219 if (lock_flags & XFS_IOLOCK_EXCL)
220 mrunlock_excl(&ip->i_iolock);
221 else if (lock_flags & XFS_IOLOCK_SHARED)
222 mrunlock_shared(&ip->i_iolock);
223 out:
224 return 0;
225}
226
227/*
228 * xfs_iunlock() is used to drop the inode locks acquired with
229 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
230 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
231 * that we know which locks to drop.
232 *
233 * ip -- the inode being unlocked
234 * lock_flags -- this parameter indicates the inode's locks to be
235 * unlocked.  See the comment for xfs_ilock() for a list
236 * of valid values for this parameter.
237 *
238 */
239void
240xfs_iunlock(
241 xfs_inode_t *ip,
242 uint lock_flags)
243{
244 /*
245 * You can't set both SHARED and EXCL for the same lock,
246 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
247 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
248 */
249 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
250 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
251 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
252 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
253 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
254 ASSERT(lock_flags != 0);
255
256 if (lock_flags & XFS_IOLOCK_EXCL)
257 mrunlock_excl(&ip->i_iolock);
258 else if (lock_flags & XFS_IOLOCK_SHARED)
259 mrunlock_shared(&ip->i_iolock);
260
261 if (lock_flags & XFS_ILOCK_EXCL)
262 mrunlock_excl(&ip->i_lock);
263 else if (lock_flags & XFS_ILOCK_SHARED)
264 mrunlock_shared(&ip->i_lock);
265
266 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
267}
268
269/*
270 * give up write locks. the i/o lock cannot be held nested
271 * if it is being demoted.
272 */
273void
274xfs_ilock_demote(
275 xfs_inode_t *ip,
276 uint lock_flags)
277{
278 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
279 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
280
281 if (lock_flags & XFS_ILOCK_EXCL)
282 mrdemote(&ip->i_lock);
283 if (lock_flags & XFS_IOLOCK_EXCL)
284 mrdemote(&ip->i_iolock);
285
286 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
287}
288
289#ifdef DEBUG
290int
291xfs_isilocked(
292 xfs_inode_t *ip,
293 uint lock_flags)
294{
295 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
296 if (!(lock_flags & XFS_ILOCK_SHARED))
297 return !!ip->i_lock.mr_writer;
298 return rwsem_is_locked(&ip->i_lock.mr_lock);
299 }
300
301 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
302 if (!(lock_flags & XFS_IOLOCK_SHARED))
303 return !!ip->i_iolock.mr_writer;
304 return rwsem_is_locked(&ip->i_iolock.mr_lock);
305 }
306
307 ASSERT(0);
308 return 0;
309}
310#endif
311
312void
313__xfs_iflock(
314 struct xfs_inode *ip)
315{
316 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
317 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
318
319 do {
320 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
321 if (xfs_isiflocked(ip))
322 io_schedule();
323 } while (!xfs_iflock_nowait(ip));
324
325 finish_wait(wq, &wait.wait);
326}
327
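__xfs_iflock() is the slow path behind the inode flush lock; the fast paths it spins on are atomic bit operations on the same i_flags word. Roughly (a simplified paraphrase of the xfs_inode.h helpers, not the verbatim definitions):

	static inline int xfs_iflock_nowait(struct xfs_inode *ip)
	{
		/* true if we atomically set the flush-lock bit ourselves */
		return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
	}

	static inline int xfs_isiflocked(struct xfs_inode *ip)
	{
		return xfs_iflags_test(ip, XFS_IFLOCK);
	}
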
77#ifdef DEBUG 328#ifdef DEBUG
78/* 329/*
79 * Make sure that the extents in the given memory buffer 330 * Make sure that the extents in the given memory buffer
@@ -131,6 +382,65 @@ xfs_inobp_check(
131} 382}
132#endif 383#endif
133 384
385static void
386xfs_inode_buf_verify(
387 struct xfs_buf *bp)
388{
389 struct xfs_mount *mp = bp->b_target->bt_mount;
390 int i;
391 int ni;
392
393 /*
394 * Validate the magic number and version of every inode in the buffer
395 */
396 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
397 for (i = 0; i < ni; i++) {
398 int di_ok;
399 xfs_dinode_t *dip;
400
401 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
402 (i << mp->m_sb.sb_inodelog));
403 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
404 XFS_DINODE_GOOD_VERSION(dip->di_version);
405 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
406 XFS_ERRTAG_ITOBP_INOTOBP,
407 XFS_RANDOM_ITOBP_INOTOBP))) {
408 xfs_buf_ioerror(bp, EFSCORRUPTED);
409 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
410 mp, dip);
411#ifdef DEBUG
412 xfs_emerg(mp,
413 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
414 (unsigned long long)bp->b_bn, i,
415 be16_to_cpu(dip->di_magic));
416 ASSERT(0);
417#endif
418 }
419 }
420 xfs_inobp_check(mp, bp);
421}
422
423
424static void
425xfs_inode_buf_read_verify(
426 struct xfs_buf *bp)
427{
428 xfs_inode_buf_verify(bp);
429}
430
431static void
432xfs_inode_buf_write_verify(
433 struct xfs_buf *bp)
434{
435 xfs_inode_buf_verify(bp);
436}
437
438const struct xfs_buf_ops xfs_inode_buf_ops = {
439 .verify_read = xfs_inode_buf_read_verify,
440 .verify_write = xfs_inode_buf_write_verify,
441};
442
443
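With the ops table in place, readers attach it at buffer read time and writers inherit it from the buffer, so every inode buffer is validated on both sides of the IO. The read side looks like this (mirroring the xfs_imap_to_bp() hunk that follows):

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
				   (int)imap->im_len, buf_flags | XBF_UNMAPPED,
				   &bp, &xfs_inode_buf_ops);
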
134/* 444/*
135 * This routine is called to map an inode to the buffer containing the on-disk 445 * This routine is called to map an inode to the buffer containing the on-disk
136 * version of the inode. It returns a pointer to the buffer containing the 446 * version of the inode. It returns a pointer to the buffer containing the
@@ -145,71 +455,33 @@ xfs_imap_to_bp(
145 struct xfs_mount *mp, 455 struct xfs_mount *mp,
146 struct xfs_trans *tp, 456 struct xfs_trans *tp,
147 struct xfs_imap *imap, 457 struct xfs_imap *imap,
148 struct xfs_dinode **dipp, 458 struct xfs_dinode **dipp,
149 struct xfs_buf **bpp, 459 struct xfs_buf **bpp,
150 uint buf_flags, 460 uint buf_flags,
151 uint iget_flags) 461 uint iget_flags)
152{ 462{
153 struct xfs_buf *bp; 463 struct xfs_buf *bp;
154 int error; 464 int error;
155 int i;
156 int ni;
157 465
158 buf_flags |= XBF_UNMAPPED; 466 buf_flags |= XBF_UNMAPPED;
159 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 467 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
160 (int)imap->im_len, buf_flags, &bp); 468 (int)imap->im_len, buf_flags, &bp,
469 &xfs_inode_buf_ops);
161 if (error) { 470 if (error) {
162 if (error != EAGAIN) { 471 if (error == EAGAIN) {
163 xfs_warn(mp,
164 "%s: xfs_trans_read_buf() returned error %d.",
165 __func__, error);
166 } else {
167 ASSERT(buf_flags & XBF_TRYLOCK); 472 ASSERT(buf_flags & XBF_TRYLOCK);
473 return error;
168 } 474 }
169 return error;
170 }
171
172 /*
173 * Validate the magic number and version of every inode in the buffer
174 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
175 */
176#ifdef DEBUG
177 ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
178#else /* usual case */
179 ni = 1;
180#endif
181 475
182 for (i = 0; i < ni; i++) { 476 if (error == EFSCORRUPTED &&
183 int di_ok; 477 (iget_flags & XFS_IGET_UNTRUSTED))
184 xfs_dinode_t *dip; 478 return XFS_ERROR(EINVAL);
185 479
186 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 480 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
187 (i << mp->m_sb.sb_inodelog)); 481 __func__, error);
188 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 482 return error;
189 XFS_DINODE_GOOD_VERSION(dip->di_version);
190 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
191 XFS_ERRTAG_ITOBP_INOTOBP,
192 XFS_RANDOM_ITOBP_INOTOBP))) {
193 if (iget_flags & XFS_IGET_UNTRUSTED) {
194 xfs_trans_brelse(tp, bp);
195 return XFS_ERROR(EINVAL);
196 }
197 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
198 mp, dip);
199#ifdef DEBUG
200 xfs_emerg(mp,
201 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
202 (unsigned long long)imap->im_blkno, i,
203 be16_to_cpu(dip->di_magic));
204 ASSERT(0);
205#endif
206 xfs_trans_brelse(tp, bp);
207 return XFS_ERROR(EFSCORRUPTED);
208 }
209 } 483 }
210 484
211 xfs_inobp_check(mp, bp);
212
213 *bpp = bp; 485 *bpp = bp;
214 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); 486 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
215 return 0; 487 return 0;
@@ -853,16 +1125,16 @@ xfs_iread_extents(
853 * set according to the contents of the given cred structure. 1125 * set according to the contents of the given cred structure.
854 * 1126 *
855 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() 1127 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
856 * has a free inode available, call xfs_iget() 1128 * has a free inode available, call xfs_iget() to obtain the in-core
857 * to obtain the in-core version of the allocated inode. Finally, 1129 * version of the allocated inode. Finally, fill in the inode and
858 * fill in the inode and log its initial contents. In this case, 1130 * log its initial contents. In this case, ialloc_context would be
859 * ialloc_context would be set to NULL and call_again set to false. 1131 * set to NULL.
860 * 1132 *
861 * If xfs_dialloc() does not have an available inode, 1133 * If xfs_dialloc() does not have an available inode, it will replenish
862 * it will replenish its supply by doing an allocation. Since we can 1134 * its supply by doing an allocation. Since we can only do one
863 * only do one allocation within a transaction without deadlocks, we 1135 * allocation within a transaction without deadlocks, we must commit
864 * must commit the current transaction before returning the inode itself. 1136 * the current transaction before returning the inode itself.
865 * In this case, therefore, we will set call_again to true and return. 1137 * In this case, therefore, we will set ialloc_context and return.
866 * The caller should then commit the current transaction, start a new 1138 * The caller should then commit the current transaction, start a new
867 * transaction, and call xfs_ialloc() again to actually get the inode. 1139 * transaction, and call xfs_ialloc() again to actually get the inode.
868 * 1140 *
@@ -1509,10 +1781,23 @@ xfs_ifree_cluster(
1509 * to mark all the active inodes on the buffer stale. 1781 * to mark all the active inodes on the buffer stale.
1510 */ 1782 */
1511 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 1783 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1512 mp->m_bsize * blks_per_cluster, 0); 1784 mp->m_bsize * blks_per_cluster,
1785 XBF_UNMAPPED);
1513 1786
1514 if (!bp) 1787 if (!bp)
1515 return ENOMEM; 1788 return ENOMEM;
1789
1790 /*
1791 * This buffer may not have been correctly initialised as we
1792 * didn't read it from disk. That's not important because we are
1793 * only using it to mark the buffer as stale in the log, and to
1794 * attach stale cached inodes to it. That means it will never be
1795 * dispatched for IO. If it is, we want to know about it, and we
1796 * want it to fail. We can achieve this by adding a write
1797 * verifier to the buffer.
1798 */
1799 bp->b_ops = &xfs_inode_buf_ops;
1800
1516 /* 1801 /*
1517 * Walk the inodes already attached to the buffer and mark them 1802 * Walk the inodes already attached to the buffer and mark them
1518 * stale. These will all have the flush locks held, so an 1803 * stale. These will all have the flush locks held, so an
@@ -3660,3 +3945,40 @@ xfs_iext_irec_update_extoffs(
3660 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; 3945 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
3661 } 3946 }
3662} 3947}
3948
3949/*
3950 * Test whether it is appropriate to check an inode for, and free, post-EOF
3951 * blocks. The 'force' parameter determines whether we should also consider
3952 * regular files that are marked preallocated or append-only.
3953 */
3954bool
3955xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
3956{
3957 /* prealloc/delalloc exists only on regular files */
3958 if (!S_ISREG(ip->i_d.di_mode))
3959 return false;
3960
3961 /*
3962 * Zero sized files with no cached pages and no delalloc blocks will not
3963 * have speculative prealloc/delalloc blocks to remove.
3964 */
3965 if (VFS_I(ip)->i_size == 0 &&
3966 VN_CACHED(VFS_I(ip)) == 0 &&
3967 ip->i_delayed_blks == 0)
3968 return false;
3969
3970 /* If we haven't read in the extent list, then don't do it now. */
3971 if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
3972 return false;
3973
3974 /*
3975 * Do not free real preallocated or append-only files unless the file
3976 * has delalloc blocks and we are forced to remove them.
3977 */
3978 if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
3979 if (!force || ip->i_delayed_blks == 0)
3980 return false;
3981
3982 return true;
3983}
3984
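A hedged sketch of the expected call pattern (xfs_free_eofblocks() is the companion routine elsewhere in this series; its exact signature is assumed here):

	/*
	 * force == false: leave preallocated/append-only files alone unless
	 * the caller explicitly asks to strip their delalloc blocks.
	 */
	if (xfs_can_free_eofblocks(ip, false))
		error = xfs_free_eofblocks(mp, ip, true);	/* true: take the iolock */
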
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 94b32f906e79..22baf6ea4fac 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
496 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ 496 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
497 ((pip)->i_d.di_mode & S_ISGID)) 497 ((pip)->i_d.di_mode & S_ISGID))
498 498
499
499/* 500/*
500 * xfs_iget.c prototypes. 501 * xfs_inode.c prototypes.
501 */ 502 */
502int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
503 uint, uint, xfs_inode_t **);
504void xfs_ilock(xfs_inode_t *, uint); 503void xfs_ilock(xfs_inode_t *, uint);
505int xfs_ilock_nowait(xfs_inode_t *, uint); 504int xfs_ilock_nowait(xfs_inode_t *, uint);
506void xfs_iunlock(xfs_inode_t *, uint); 505void xfs_iunlock(xfs_inode_t *, uint);
@@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
508int xfs_isilocked(xfs_inode_t *, uint); 507int xfs_isilocked(xfs_inode_t *, uint);
509uint xfs_ilock_map_shared(xfs_inode_t *); 508uint xfs_ilock_map_shared(xfs_inode_t *);
510void xfs_iunlock_map_shared(xfs_inode_t *, uint); 509void xfs_iunlock_map_shared(xfs_inode_t *, uint);
511void xfs_inode_free(struct xfs_inode *ip);
512
513/*
514 * xfs_inode.c prototypes.
515 */
516int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, 510int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
517 xfs_nlink_t, xfs_dev_t, prid_t, int, 511 xfs_nlink_t, xfs_dev_t, prid_t, int,
518 struct xfs_buf **, xfs_inode_t **); 512 struct xfs_buf **, xfs_inode_t **);
@@ -591,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *);
591void xfs_iext_irec_compact_pages(xfs_ifork_t *); 585void xfs_iext_irec_compact_pages(xfs_ifork_t *);
592void xfs_iext_irec_compact_full(xfs_ifork_t *); 586void xfs_iext_irec_compact_full(xfs_ifork_t *);
593void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); 587void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
588bool xfs_can_free_eofblocks(struct xfs_inode *, bool);
594 589
595#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 590#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
596 591
@@ -603,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
603extern struct kmem_zone *xfs_ifork_zone; 598extern struct kmem_zone *xfs_ifork_zone;
604extern struct kmem_zone *xfs_inode_zone; 599extern struct kmem_zone *xfs_inode_zone;
605extern struct kmem_zone *xfs_ili_zone; 600extern struct kmem_zone *xfs_ili_zone;
601extern const struct xfs_buf_ops xfs_inode_buf_ops;
606 602
607#endif /* __XFS_INODE_H__ */ 603#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8305f2ac6773..c1c3ef88a260 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
42#include "xfs_inode_item.h" 42#include "xfs_inode_item.h"
43#include "xfs_export.h" 43#include "xfs_export.h"
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45#include "xfs_icache.h"
45 46
46#include <linux/capability.h> 47#include <linux/capability.h>
47#include <linux/dcache.h> 48#include <linux/dcache.h>
@@ -70,7 +71,7 @@ xfs_find_handle(
70 int hsize; 71 int hsize;
71 xfs_handle_t handle; 72 xfs_handle_t handle;
72 struct inode *inode; 73 struct inode *inode;
73 struct fd f; 74 struct fd f = {0};
74 struct path path; 75 struct path path;
75 int error; 76 int error;
76 struct xfs_inode *ip; 77 struct xfs_inode *ip;
@@ -1602,6 +1603,26 @@ xfs_file_ioctl(
1602 error = xfs_errortag_clearall(mp, 1); 1603 error = xfs_errortag_clearall(mp, 1);
1603 return -error; 1604 return -error;
1604 1605
1606 case XFS_IOC_FREE_EOFBLOCKS: {
1607 struct xfs_eofblocks eofb;
1608
1609 if (copy_from_user(&eofb, arg, sizeof(eofb)))
1610 return -XFS_ERROR(EFAULT);
1611
1612 if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
1613 return -XFS_ERROR(EINVAL);
1614
1615 if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
1616 return -XFS_ERROR(EINVAL);
1617
1618 if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
1619 memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
1620 return -XFS_ERROR(EINVAL);
1621
1622 error = xfs_icache_free_eofblocks(mp, &eofb);
1623 return -error;
1624 }
1625
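Driven from userspace, the new ioctl would look roughly like this (a sketch: the struct layout and XFS_EOFBLOCKS_VERSION come from xfs_fs.h in this series, and the pad fields must be zero per the validation above):

	struct xfs_eofblocks eofb;

	memset(&eofb, 0, sizeof(eofb));		/* pad32/pad64 must stay zero */
	eofb.eof_version = XFS_EOFBLOCKS_VERSION;
	eofb.eof_flags = 0;			/* no filters: scan all inodes */

	if (ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb) < 0)	/* fd: any fd on the fs */
		perror("XFS_IOC_FREE_EOFBLOCKS");
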
1605 default: 1626 default:
1606 return -ENOTTY; 1627 return -ENOTTY;
1607 } 1628 }
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 973dff6ad935..add06b4e9a63 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -41,6 +41,7 @@
41#include "xfs_utils.h" 41#include "xfs_utils.h"
42#include "xfs_iomap.h" 42#include "xfs_iomap.h"
43#include "xfs_trace.h" 43#include "xfs_trace.h"
44#include "xfs_icache.h"
44 45
45 46
46#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 47#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -373,7 +374,7 @@ xfs_iomap_write_delay(
373 xfs_extlen_t extsz; 374 xfs_extlen_t extsz;
374 int nimaps; 375 int nimaps;
375 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 376 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
376 int prealloc, flushed = 0; 377 int prealloc;
377 int error; 378 int error;
378 379
379 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 380 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,31 +435,29 @@ retry:
434 } 435 }
435 436
436 /* 437 /*
437 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For 438 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
438 * ENOSPC, * flush all other inodes with delalloc blocks to free up
439 * some of the excess reserved metadata space. For both cases, retry
440 * without EOF preallocation. 439 * without EOF preallocation.
441 */ 440 */
442 if (nimaps == 0) { 441 if (nimaps == 0) {
443 trace_xfs_delalloc_enospc(ip, offset, count); 442 trace_xfs_delalloc_enospc(ip, offset, count);
444 if (flushed) 443 if (prealloc) {
445 return XFS_ERROR(error ? error : ENOSPC); 444 prealloc = 0;
446 445 error = 0;
447 if (error == ENOSPC) { 446 goto retry;
448 xfs_iunlock(ip, XFS_ILOCK_EXCL);
449 xfs_flush_inodes(ip);
450 xfs_ilock(ip, XFS_ILOCK_EXCL);
451 } 447 }
452 448 return XFS_ERROR(error ? error : ENOSPC);
453 flushed = 1;
454 error = 0;
455 prealloc = 0;
456 goto retry;
457 } 449 }
458 450
459 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 451 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
460 return xfs_alert_fsblock_zero(ip, &imap[0]); 452 return xfs_alert_fsblock_zero(ip, &imap[0]);
461 453
454 /*
455 * Tag the inode as speculatively preallocated so we can reclaim this
456 * space on demand, if necessary.
457 */
458 if (prealloc)
459 xfs_inode_set_eofblocks_tag(ip);
460
462 *ret_imap = imap[0]; 461 *ret_imap = imap[0];
463 return 0; 462 return 0;
464} 463}
@@ -584,7 +583,9 @@ xfs_iomap_write_allocate(
584 * pointer that the caller gave to us. 583 * pointer that the caller gave to us.
585 */ 584 */
586 error = xfs_bmapi_write(tp, ip, map_start_fsb, 585 error = xfs_bmapi_write(tp, ip, map_start_fsb,
587 count_fsb, 0, &first_block, 1, 586 count_fsb,
587 XFS_BMAPI_STACK_SWITCH,
588 &first_block, 1,
588 imap, &nimaps, &free_list); 589 imap, &nimaps, &free_list);
589 if (error) 590 if (error)
590 goto trans_cancel; 591 goto trans_cancel;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 4e00cf091d2c..d82efaa2ac73 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
38#include "xfs_vnodeops.h" 38#include "xfs_vnodeops.h"
39#include "xfs_inode_item.h" 39#include "xfs_inode_item.h"
40#include "xfs_trace.h" 40#include "xfs_trace.h"
41#include "xfs_icache.h"
41 42
42#include <linux/capability.h> 43#include <linux/capability.h>
43#include <linux/xattr.h> 44#include <linux/xattr.h>
@@ -779,8 +780,8 @@ xfs_setattr_size(
779 * care about here. 780 * care about here.
780 */ 781 */
781 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { 782 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
782 error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, 783 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
783 FI_NONE); 784 ip->i_d.di_size, newsize);
784 if (error) 785 if (error)
785 goto out_unlock; 786 goto out_unlock;
786 } 787 }
@@ -854,6 +855,9 @@ xfs_setattr_size(
854 * and do not wait the usual (long) time for writeout. 855 * and do not wait the usual (long) time for writeout.
855 */ 856 */
856 xfs_iflags_set(ip, XFS_ITRUNCATED); 857 xfs_iflags_set(ip, XFS_ITRUNCATED);
858
859 /* A truncate down always removes post-EOF blocks. */
860 xfs_inode_clear_eofblocks_tag(ip);
857 } 861 }
858 862
859 if (mask & ATTR_CTIME) { 863 if (mask & ATTR_CTIME) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 01d10a66e302..2ea7d402188d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -34,6 +34,7 @@
34#include "xfs_error.h" 34#include "xfs_error.h"
35#include "xfs_btree.h" 35#include "xfs_btree.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_icache.h"
37 38
38STATIC int 39STATIC int
39xfs_internal_inum( 40xfs_internal_inum(
@@ -395,7 +396,8 @@ xfs_bulkstat(
395 if (xfs_inobt_maskn(chunkidx, nicluster) 396 if (xfs_inobt_maskn(chunkidx, nicluster)
396 & ~r.ir_free) 397 & ~r.ir_free)
397 xfs_btree_reada_bufs(mp, agno, 398 xfs_btree_reada_bufs(mp, agno,
398 agbno, nbcluster); 399 agbno, nbcluster,
400 &xfs_inode_buf_ops);
399 } 401 }
400 irbp->ir_startino = r.ir_startino; 402 irbp->ir_startino = r.ir_startino;
401 irbp->ir_freecount = r.ir_freecount; 403 irbp->ir_freecount = r.ir_freecount;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 828662f70d64..fe7e4df85a7b 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -44,6 +44,7 @@
44#include <linux/kernel.h> 44#include <linux/kernel.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/crc32c.h>
47#include <linux/module.h> 48#include <linux/module.h>
48#include <linux/mutex.h> 49#include <linux/mutex.h>
49#include <linux/file.h> 50#include <linux/file.h>
@@ -118,6 +119,7 @@
118#define xfs_rotorstep xfs_params.rotorstep.val 119#define xfs_rotorstep xfs_params.rotorstep.val
119#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val 120#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
120#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val 121#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
122#define xfs_eofb_secs xfs_params.eofb_timer.val
121 123
122#define current_cpu() (raw_smp_processor_id()) 124#define current_cpu() (raw_smp_processor_id())
123#define current_pid() (current->pid) 125#define current_pid() (current->pid)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7f4f9370d0e7..46bd9d52ab51 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,8 @@
34#include "xfs_dinode.h" 34#include "xfs_dinode.h"
35#include "xfs_inode.h" 35#include "xfs_inode.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
38#include "xfs_cksum.h"
37 39
38kmem_zone_t *xfs_log_ticket_zone; 40kmem_zone_t *xfs_log_ticket_zone;
39 41
@@ -458,7 +460,8 @@ xfs_log_reserve(
458 tic->t_trans_type = t_type; 460 tic->t_trans_type = t_type;
459 *ticp = tic; 461 *ticp = tic;
460 462
461 xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); 463 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
464 : tic->t_unit_res);
462 465
463 trace_xfs_log_reserve(log, tic); 466 trace_xfs_log_reserve(log, tic);
464 467
@@ -679,25 +682,29 @@ out:
679} 682}
680 683
681/* 684/*
682 * Finish the recovery of the file system. This is separate from 685 * Finish the recovery of the file system. This is separate from the
683 * the xfs_log_mount() call, because it depends on the code in 686 * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
684 * xfs_mountfs() to read in the root and real-time bitmap inodes 687 * in the root and real-time bitmap inodes between calling xfs_log_mount() and
685 * between calling xfs_log_mount() and here. 688 * here.
686 * 689 *
687 * mp - ubiquitous xfs mount point structure 690 * If we finish recovery successfully, start the background log work. If we are
691 * not doing recovery, then we have a RO filesystem and we don't need to start
692 * it.
688 */ 693 */
689int 694int
690xfs_log_mount_finish(xfs_mount_t *mp) 695xfs_log_mount_finish(xfs_mount_t *mp)
691{ 696{
692 int error; 697 int error = 0;
693 698
694 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 699 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
695 error = xlog_recover_finish(mp->m_log); 700 error = xlog_recover_finish(mp->m_log);
696 else { 701 if (!error)
697 error = 0; 702 xfs_log_work_queue(mp);
703 } else {
698 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 704 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
699 } 705 }
700 706
707
701 return error; 708 return error;
702} 709}
703 710
@@ -850,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp)
850} /* xfs_log_unmount_write */ 857} /* xfs_log_unmount_write */
851 858
852/* 859/*
853 * Deallocate log structures for unmount/relocation. 860 * Empty the log for unmount/freeze.
861 *
862 * To do this, we first need to shut down the background log work so it is not
863 * trying to cover the log as we clean up. We then need to unpin all objects in
864 * the log so we can then flush them out. Once they have completed their IO and
865 * run the callbacks removing themselves from the AIL, we can write the unmount
866 * record.
867 */
868void
869xfs_log_quiesce(
870 struct xfs_mount *mp)
871{
872 cancel_delayed_work_sync(&mp->m_log->l_work);
873 xfs_log_force(mp, XFS_LOG_SYNC);
874
875 /*
876 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
877 * will push it, xfs_wait_buftarg() will not wait for it. Further,
878 * xfs_buf_iowait() cannot be used because it was pushed with the
879 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
880 * the IO to complete.
881 */
882 xfs_ail_push_all_sync(mp->m_ail);
883 xfs_wait_buftarg(mp->m_ddev_targp);
884 xfs_buf_lock(mp->m_sb_bp);
885 xfs_buf_unlock(mp->m_sb_bp);
886
887 xfs_log_unmount_write(mp);
888}
889
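xfs_log_unmount() below is one consumer of xfs_log_quiesce(); the freeze path (xfs_quiesce_attr() in this series) is the other expected caller, which is why the unmount-record write is factored out here rather than left inline in the unmount path.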
890/*
891 * Shut down and release the AIL and Log.
854 * 892 *
855 * We need to stop the aild from running before we destroy 893 * During unmount, we need to ensure we flush all the dirty metadata objects
856 * and deallocate the log as the aild references the log. 894 * from the AIL so that the log is empty before we write the unmount record to
895 * the log. Once this is done, we can tear down the AIL and the log.
857 */ 896 */
858void 897void
859xfs_log_unmount(xfs_mount_t *mp) 898xfs_log_unmount(
899 struct xfs_mount *mp)
860{ 900{
861 cancel_delayed_work_sync(&mp->m_sync_work); 901 xfs_log_quiesce(mp);
902
862 xfs_trans_ail_destroy(mp); 903 xfs_trans_ail_destroy(mp);
863 xlog_dealloc_log(mp->m_log); 904 xlog_dealloc_log(mp->m_log);
864} 905}
@@ -1090,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp)
1090 * with it being freed after writing the unmount record to the 1131 * with it being freed after writing the unmount record to the
1091 * log. 1132 * log.
1092 */ 1133 */
1093 1134}
1094} /* xlog_iodone */
1095 1135
1096/* 1136/*
1097 * Return size of each in-core log record buffer. 1137 * Return size of each in-core log record buffer.
@@ -1161,6 +1201,40 @@ done:
1161} /* xlog_get_iclog_buffer_size */ 1201} /* xlog_get_iclog_buffer_size */
1162 1202
1163 1203
1204void
1205xfs_log_work_queue(
1206 struct xfs_mount *mp)
1207{
1208 queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
1209 msecs_to_jiffies(xfs_syncd_centisecs * 10));
1210}
1211
1212/*
1213 * Every sync period we need to unpin all items in the AIL and push them to
1214 * disk. If there is nothing dirty, then we might need to cover the log to
1215 * indicate that the filesystem is idle.
1216 */
1217void
1218xfs_log_worker(
1219 struct work_struct *work)
1220{
1221 struct xlog *log = container_of(to_delayed_work(work),
1222 struct xlog, l_work);
1223 struct xfs_mount *mp = log->l_mp;
1224
1225 /* dgc: errors ignored - not fatal and nowhere to report them */
1226 if (xfs_log_need_covered(mp))
1227 xfs_fs_log_dummy(mp);
1228 else
1229 xfs_log_force(mp, 0);
1230
1231 /* start pushing all the metadata that is currently dirty */
1232 xfs_ail_push_all(mp->m_ail);
1233
1234 /* queue us up again */
1235 xfs_log_work_queue(mp);
1236}
1237
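As a worked example of the rearm interval: xfs_syncd_centisecs defaults to 3000 (centiseconds), so xfs_log_work_queue() requeues the worker every 3000 * 10 ms = 30 seconds, matching the old xfssyncd cadence.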
1164/* 1238/*
1165 * This routine initializes some of the log structure for a given mount point. 1239 * This routine initializes some of the log structure for a given mount point.
1166 * Its primary purpose is to fill in enough, so recovery can occur. However, 1240 * Its primary purpose is to fill in enough, so recovery can occur. However,
@@ -1195,6 +1269,7 @@ xlog_alloc_log(
1195 log->l_logBBsize = num_bblks; 1269 log->l_logBBsize = num_bblks;
1196 log->l_covered_state = XLOG_STATE_COVER_IDLE; 1270 log->l_covered_state = XLOG_STATE_COVER_IDLE;
1197 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1271 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1272 INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
1198 1273
1199 log->l_prev_block = -1; 1274 log->l_prev_block = -1;
1200 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1275 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -1417,6 +1492,84 @@ xlog_grant_push_ail(
1417} 1492}
1418 1493
1419/* 1494/*
1495 * Stamp cycle number in every block
1496 */
1497STATIC void
1498xlog_pack_data(
1499 struct xlog *log,
1500 struct xlog_in_core *iclog,
1501 int roundoff)
1502{
1503 int i, j, k;
1504 int size = iclog->ic_offset + roundoff;
1505 __be32 cycle_lsn;
1506 xfs_caddr_t dp;
1507
1508 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
1509
1510 dp = iclog->ic_datap;
1511 for (i = 0; i < BTOBB(size); i++) {
1512 if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
1513 break;
1514 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
1515 *(__be32 *)dp = cycle_lsn;
1516 dp += BBSIZE;
1517 }
1518
1519 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1520 xlog_in_core_2_t *xhdr = iclog->ic_data;
1521
1522 for ( ; i < BTOBB(size); i++) {
1523 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1524 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1525 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
1526 *(__be32 *)dp = cycle_lsn;
1527 dp += BBSIZE;
1528 }
1529
1530 for (i = 1; i < log->l_iclog_heads; i++)
1531 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
1532 }
1533}
1534
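The stamping scheme above: the first __be32 of every 512-byte basic block in the record is overwritten with the record's cycle number, so recovery can locate the log head by scanning for a cycle discontinuity. The displaced words are saved in h_cycle_data[] (or in the extended headers for v2 logs) and restored by xlog_unpack_data() at recovery time.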
1535/*
1536 * Calculate the checksum for a log buffer.
1537 *
1538 * This is a little more complicated than it should be because the various
1539 * headers and the actual data are non-contiguous.
1540 */
1541__le32
1542xlog_cksum(
1543 struct xlog *log,
1544 struct xlog_rec_header *rhead,
1545 char *dp,
1546 int size)
1547{
1548 __uint32_t crc;
1549
1550 /* first generate the crc for the record header ... */
1551 crc = xfs_start_cksum((char *)rhead,
1552 sizeof(struct xlog_rec_header),
1553 offsetof(struct xlog_rec_header, h_crc));
1554
1555 /* ... then for additional cycle data for v2 logs ... */
1556 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1557 union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
1558 int i;
1559
1560 for (i = 1; i < log->l_iclog_heads; i++) {
1561 crc = crc32c(crc, &xhdr[i].hic_xheader,
1562 sizeof(struct xlog_rec_ext_header));
1563 }
1564 }
1565
1566 /* ... and finally for the payload */
1567 crc = crc32c(crc, dp, size);
1568
1569 return xfs_end_cksum(crc);
1570}
1571
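The xfs_start_cksum()/xfs_end_cksum() helpers used above come from the new xfs_cksum.h added in this series. In rough paraphrase (hedged: not the verbatim header, but the intended shape -- CRC the region with the CRC field itself treated as zero, then fold to on-disk form):

	__uint32_t zero = 0, crc;

	crc = crc32c(XFS_CRC_SEED, buffer, crc_offset);	    /* bytes before h_crc */
	crc = crc32c(crc, &zero, sizeof(zero));		    /* h_crc counted as 0 */
	crc = crc32c(crc, buffer + crc_offset + sizeof(zero),
		     length - crc_offset - sizeof(zero));   /* rest of the buffer */
	return xfs_end_cksum(crc);			    /* fold to __le32 */
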
1572/*
1420 * The bdstrat callback function for log bufs. This gives us a central 1573 * The bdstrat callback function for log bufs. This gives us a central
1421 * place to trap bufs in case we get hit by a log I/O error and need to 1574 * place to trap bufs in case we get hit by a log I/O error and need to
1422 * shutdown. Actually, in practice, even when we didn't get a log error, 1575 * shutdown. Actually, in practice, even when we didn't get a log error,
@@ -1476,7 +1629,6 @@ xlog_sync(
1476 struct xlog *log, 1629 struct xlog *log,
1477 struct xlog_in_core *iclog) 1630 struct xlog_in_core *iclog)
1478{ 1631{
1479 xfs_caddr_t dptr; /* pointer to byte sized element */
1480 xfs_buf_t *bp; 1632 xfs_buf_t *bp;
1481 int i; 1633 int i;
1482 uint count; /* byte count of bwrite */ 1634 uint count; /* byte count of bwrite */
@@ -1485,6 +1637,7 @@ xlog_sync(
1485 int split = 0; /* split write into two regions */ 1637 int split = 0; /* split write into two regions */
1486 int error; 1638 int error;
1487 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); 1639 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
1640 int size;
1488 1641
1489 XFS_STATS_INC(xs_log_writes); 1642 XFS_STATS_INC(xs_log_writes);
1490 ASSERT(atomic_read(&iclog->ic_refcnt) == 0); 1643 ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1515,13 +1668,10 @@ xlog_sync(
1515 xlog_pack_data(log, iclog, roundoff); 1668 xlog_pack_data(log, iclog, roundoff);
1516 1669
1517 /* real byte length */ 1670 /* real byte length */
1518 if (v2) { 1671 size = iclog->ic_offset;
1519 iclog->ic_header.h_len = 1672 if (v2)
1520 cpu_to_be32(iclog->ic_offset + roundoff); 1673 size += roundoff;
1521 } else { 1674 iclog->ic_header.h_len = cpu_to_be32(size);
1522 iclog->ic_header.h_len =
1523 cpu_to_be32(iclog->ic_offset);
1524 }
1525 1675
1526 bp = iclog->ic_bp; 1676 bp = iclog->ic_bp;
1527 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); 1677 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
@@ -1530,12 +1680,36 @@ xlog_sync(
1530 1680
1531 /* Do we need to split this write into 2 parts? */ 1681 /* Do we need to split this write into 2 parts? */
1532 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { 1682 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
1683 char *dptr;
1684
1533 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); 1685 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
1534 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); 1686 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
1535 iclog->ic_bwritecnt = 2; /* split into 2 writes */ 1687 iclog->ic_bwritecnt = 2;
1688
1689 /*
1690 * Bump the cycle numbers at the start of each block in the
1691 * part of the iclog that ends up in the buffer that gets
1692 * written to the start of the log.
1693 *
1694 * Watch out for the header magic number case, though.
1695 */
1696 dptr = (char *)&iclog->ic_header + count;
1697 for (i = 0; i < split; i += BBSIZE) {
1698 __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
1699 if (++cycle == XLOG_HEADER_MAGIC_NUM)
1700 cycle++;
1701 *(__be32 *)dptr = cpu_to_be32(cycle);
1702
1703 dptr += BBSIZE;
1704 }
1536 } else { 1705 } else {
1537 iclog->ic_bwritecnt = 1; 1706 iclog->ic_bwritecnt = 1;
1538 } 1707 }
1708
1709 /* calculate the checksum */
1710 iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
1711 iclog->ic_datap, size);
1712
1539 bp->b_io_length = BTOBB(count); 1713 bp->b_io_length = BTOBB(count);
1540 bp->b_fspriv = iclog; 1714 bp->b_fspriv = iclog;
1541 XFS_BUF_ZEROFLAGS(bp); 1715 XFS_BUF_ZEROFLAGS(bp);
@@ -1589,19 +1763,6 @@ xlog_sync(
1589 bp->b_flags |= XBF_SYNCIO; 1763 bp->b_flags |= XBF_SYNCIO;
1590 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1764 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1591 bp->b_flags |= XBF_FUA; 1765 bp->b_flags |= XBF_FUA;
1592 dptr = bp->b_addr;
1593 /*
1594 * Bump the cycle numbers at the start of each block
1595 * since this part of the buffer is at the start of
1596 * a new cycle. Watch out for the header magic number
1597 * case, though.
1598 */
1599 for (i = 0; i < split; i += BBSIZE) {
1600 be32_add_cpu((__be32 *)dptr, 1);
1601 if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
1602 be32_add_cpu((__be32 *)dptr, 1);
1603 dptr += BBSIZE;
1604 }
1605 1766
1606 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1767 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1607 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1768 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1618,7 +1779,6 @@ xlog_sync(
1618 return 0; 1779 return 0;
1619} /* xlog_sync */ 1780} /* xlog_sync */
1620 1781
1621
1622/* 1782/*
1623 * Deallocate a log structure 1783 * Deallocate a log structure
1624 */ 1784 */
@@ -2387,14 +2547,27 @@ xlog_state_do_callback(
2387 2547
2388 2548
2389 /* 2549 /*
2390 * update the last_sync_lsn before we drop the 2550 * Completion of an iclog IO does not imply that
2551 * a transaction has completed, as transactions
2552 * can be large enough to span many iclogs. We
2553 * cannot change the tail of the log half way
2554 * through a transaction as this may be the only
2555 * transaction in the log and moving the tail to
2556 * point to the middle of it will prevent
2557 * recovery from finding the start of the
2558 * transaction. Hence we should only update the
2559 * last_sync_lsn if this iclog contains
2560 * transaction completion callbacks on it.
2561 *
2562 * We have to do this before we drop the
2391 * icloglock to ensure we are the only one that 2563 * icloglock to ensure we are the only one that
2392 * can update it. 2564 * can update it.
2393 */ 2565 */
2394 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), 2566 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2395 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2567 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2396 atomic64_set(&log->l_last_sync_lsn, 2568 if (iclog->ic_callback)
2397 be64_to_cpu(iclog->ic_header.h_lsn)); 2569 atomic64_set(&log->l_last_sync_lsn,
2570 be64_to_cpu(iclog->ic_header.h_lsn));
2398 2571
2399 } else 2572 } else
2400 ioerrors++; 2573 ioerrors++;
@@ -3700,3 +3873,4 @@ xlog_iclogs_empty(
3700 } while (iclog != log->l_iclog); 3873 } while (iclog != log->l_iclog);
3701 return 1; 3874 return 1;
3702} 3875}
3876
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 748d312850e2..5caee96059df 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
181 xfs_lsn_t *commit_lsn, int flags); 181 xfs_lsn_t *commit_lsn, int flags);
182bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 182bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
183 183
184void xfs_log_work_queue(struct xfs_mount *mp);
185void xfs_log_worker(struct work_struct *work);
186void xfs_log_quiesce(struct xfs_mount *mp);
187
184#endif 188#endif
185#endif /* __XFS_LOG_H__ */ 189#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 18a801d76a42..16d8d12ea3b4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i)
139/* 139/*
140 * Flags for log structure 140 * Flags for log structure
141 */ 141 */
142#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */
143#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ 142#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 143#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 144#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
@@ -291,7 +290,7 @@ typedef struct xlog_rec_header {
291 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ 290 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
292 __be64 h_lsn; /* lsn of this LR : 8 */ 291 __be64 h_lsn; /* lsn of this LR : 8 */
293 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ 292 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
294 __be32 h_chksum; /* may not be used; non-zero if used : 4 */ 293 __le32 h_crc; /* crc of log record : 4 */
295 __be32 h_prev_block; /* block number to previous LR : 4 */ 294 __be32 h_prev_block; /* block number to previous LR : 4 */
296 __be32 h_num_logops; /* number of log operations in this LR : 4 */ 295 __be32 h_num_logops; /* number of log operations in this LR : 4 */
297 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; 296 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
@@ -495,6 +494,7 @@ struct xlog {
495 struct xfs_buf *l_xbuf; /* extra buffer for log 494 struct xfs_buf *l_xbuf; /* extra buffer for log
496 * wrapping */ 495 * wrapping */
497 struct xfs_buftarg *l_targ; /* buftarg of log */ 496 struct xfs_buftarg *l_targ; /* buftarg of log */
497 struct delayed_work l_work; /* background flush work */
498 uint l_flags; 498 uint l_flags;
499 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 499 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
500 struct list_head *l_buf_cancel_table; 500 struct list_head *l_buf_cancel_table;
@@ -554,11 +554,9 @@ xlog_recover(
554extern int 554extern int
555xlog_recover_finish( 555xlog_recover_finish(
556 struct xlog *log); 556 struct xlog *log);
557extern void 557
558xlog_pack_data( 558extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
559 struct xlog *log, 559 char *dp, int size);
560 struct xlog_in_core *iclog,
561 int);
562 560
563extern kmem_zone_t *xfs_log_ticket_zone; 561extern kmem_zone_t *xfs_log_ticket_zone;
564struct xlog_ticket * 562struct xlog_ticket *
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5da3ace352bf..96fcbb85ff83 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -41,7 +41,9 @@
41#include "xfs_trans_priv.h" 41#include "xfs_trans_priv.h"
42#include "xfs_quota.h" 42#include "xfs_quota.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
44#include "xfs_cksum.h"
44#include "xfs_trace.h" 45#include "xfs_trace.h"
46#include "xfs_icache.h"
45 47
46STATIC int 48STATIC int
47xlog_find_zeroed( 49xlog_find_zeroed(
@@ -2143,7 +2145,7 @@ xlog_recover_buffer_pass2(
2143 buf_flags |= XBF_UNMAPPED; 2145 buf_flags |= XBF_UNMAPPED;
2144 2146
2145 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2147 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2146 buf_flags); 2148 buf_flags, NULL);
2147 if (!bp) 2149 if (!bp)
2148 return XFS_ERROR(ENOMEM); 2150 return XFS_ERROR(ENOMEM);
2149 error = bp->b_error; 2151 error = bp->b_error;
@@ -2236,7 +2238,8 @@ xlog_recover_inode_pass2(
2236 } 2238 }
2237 trace_xfs_log_recover_inode_recover(log, in_f); 2239 trace_xfs_log_recover_inode_recover(log, in_f);
2238 2240
2239 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); 2241 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2242 NULL);
2240 if (!bp) { 2243 if (!bp) {
2241 error = ENOMEM; 2244 error = ENOMEM;
2242 goto error; 2245 goto error;
@@ -2547,7 +2550,8 @@ xlog_recover_dquot_pass2(
2547 ASSERT(dq_f->qlf_len == 1); 2550 ASSERT(dq_f->qlf_len == 1);
2548 2551
2549 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, 2552 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2550 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); 2553 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
2554 NULL);
2551 if (error) 2555 if (error)
2552 return error; 2556 return error;
2553 2557
@@ -3213,80 +3217,58 @@ xlog_recover_process_iunlinks(
3213 mp->m_dmevmask = mp_dmevmask; 3217 mp->m_dmevmask = mp_dmevmask;
3214} 3218}
3215 3219
3216
3217#ifdef DEBUG
3218STATIC void
3219xlog_pack_data_checksum(
3220 struct xlog *log,
3221 struct xlog_in_core *iclog,
3222 int size)
3223{
3224 int i;
3225 __be32 *up;
3226 uint chksum = 0;
3227
3228 up = (__be32 *)iclog->ic_datap;
3229 /* divide length by 4 to get # words */
3230 for (i = 0; i < (size >> 2); i++) {
3231 chksum ^= be32_to_cpu(*up);
3232 up++;
3233 }
3234 iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3235}
3236#else
3237#define xlog_pack_data_checksum(log, iclog, size)
3238#endif
3239
3240/* 3220/*
3241 * Stamp cycle number in every block 3221 * Upack the log buffer data and crc check it. If the check fails, issue a
3222 * warning if and only if the CRC in the header is non-zero. This makes the
3223 * check an advisory warning, and the zero CRC check will prevent failure
3224 * warnings from being emitted when upgrading the kernel from one that does not
3225 * add CRCs by default.
3226 *
3227 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
3228 * corruption failure.
3242 */ 3229 */
3243void 3230STATIC int
3244xlog_pack_data( 3231xlog_unpack_data_crc(
3245 struct xlog *log, 3232 struct xlog_rec_header *rhead,
3246 struct xlog_in_core *iclog, 3233 xfs_caddr_t dp,
3247 int roundoff) 3234 struct xlog *log)
3248{ 3235{
3249 int i, j, k; 3236 __le32 crc;
3250 int size = iclog->ic_offset + roundoff; 3237
3251 __be32 cycle_lsn; 3238 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
3252 xfs_caddr_t dp; 3239 if (crc != rhead->h_crc) {
3253 3240 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
3254 xlog_pack_data_checksum(log, iclog, size); 3241 xfs_alert(log->l_mp,
3255 3242 "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
3256 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 3243 le32_to_cpu(rhead->h_crc),
3257 3244 le32_to_cpu(crc));
3258 dp = iclog->ic_datap; 3245 xfs_hex_dump(dp, 32);
3259 for (i = 0; i < BTOBB(size) &&
3260 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3261 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3262 *(__be32 *)dp = cycle_lsn;
3263 dp += BBSIZE;
3264 }
3265
3266 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3267 xlog_in_core_2_t *xhdr = iclog->ic_data;
3268
3269 for ( ; i < BTOBB(size); i++) {
3270 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3271 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3272 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3273 *(__be32 *)dp = cycle_lsn;
3274 dp += BBSIZE;
3275 } 3246 }
3276 3247
3277 for (i = 1; i < log->l_iclog_heads; i++) { 3248 /*
3278 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 3249 * If we've detected a log record corruption, then we can't
3279 } 3250 * recover past this point. Abort recovery if we are enforcing
3251 * CRC protection by punting an error back up the stack.
3252 */
3253 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
3254 return EFSCORRUPTED;
3280 } 3255 }
3256
3257 return 0;
3281} 3258}
3282 3259
3283STATIC void 3260STATIC int
3284xlog_unpack_data( 3261xlog_unpack_data(
3285 struct xlog_rec_header *rhead, 3262 struct xlog_rec_header *rhead,
3286 xfs_caddr_t dp, 3263 xfs_caddr_t dp,
3287 struct xlog *log) 3264 struct xlog *log)
3288{ 3265{
3289 int i, j, k; 3266 int i, j, k;
3267 int error;
3268
3269 error = xlog_unpack_data_crc(rhead, dp, log);
3270 if (error)
3271 return error;
3290 3272
3291 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3273 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3292 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3274 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3303,6 +3285,8 @@ xlog_unpack_data(
3303 dp += BBSIZE; 3285 dp += BBSIZE;
3304 } 3286 }
3305 } 3287 }
3288
3289 return 0;
3306} 3290}
3307 3291
3308STATIC int 3292STATIC int
@@ -3434,9 +3418,13 @@ xlog_do_recovery_pass(
3434 if (error) 3418 if (error)
3435 goto bread_err2; 3419 goto bread_err2;
3436 3420
3437 xlog_unpack_data(rhead, offset, log); 3421 error = xlog_unpack_data(rhead, offset, log);
3438 if ((error = xlog_recover_process_data(log, 3422 if (error)
3439 rhash, rhead, offset, pass))) 3423 goto bread_err2;
3424
3425 error = xlog_recover_process_data(log,
3426 rhash, rhead, offset, pass);
3427 if (error)
3440 goto bread_err2; 3428 goto bread_err2;
3441 blk_no += bblks + hblks; 3429 blk_no += bblks + hblks;
3442 } 3430 }
@@ -3541,14 +3529,19 @@ xlog_do_recovery_pass(
3541 * - order is important. 3529 * - order is important.
3542 */ 3530 */
3543 error = xlog_bread_offset(log, 0, 3531 error = xlog_bread_offset(log, 0,
3544 bblks - split_bblks, hbp, 3532 bblks - split_bblks, dbp,
3545 offset + BBTOB(split_bblks)); 3533 offset + BBTOB(split_bblks));
3546 if (error) 3534 if (error)
3547 goto bread_err2; 3535 goto bread_err2;
3548 } 3536 }
3549 xlog_unpack_data(rhead, offset, log); 3537
3550 if ((error = xlog_recover_process_data(log, rhash, 3538 error = xlog_unpack_data(rhead, offset, log);
3551 rhead, offset, pass))) 3539 if (error)
3540 goto bread_err2;
3541
3542 error = xlog_recover_process_data(log, rhash,
3543 rhead, offset, pass);
3544 if (error)
3552 goto bread_err2; 3545 goto bread_err2;
3553 blk_no += bblks; 3546 blk_no += bblks;
3554 } 3547 }
@@ -3573,9 +3566,13 @@ xlog_do_recovery_pass(
3573 if (error) 3566 if (error)
3574 goto bread_err2; 3567 goto bread_err2;
3575 3568
3576 xlog_unpack_data(rhead, offset, log); 3569 error = xlog_unpack_data(rhead, offset, log);
3577 if ((error = xlog_recover_process_data(log, rhash, 3570 if (error)
3578 rhead, offset, pass))) 3571 goto bread_err2;
3572
3573 error = xlog_recover_process_data(log, rhash,
3574 rhead, offset, pass);
3575 if (error)
3579 goto bread_err2; 3576 goto bread_err2;
3580 blk_no += bblks + hblks; 3577 blk_no += bblks + hblks;
3581 } 3578 }
@@ -3689,13 +3686,14 @@ xlog_do_recover(
3689 3686
3690 /* 3687 /*
3691 * Now that we've finished replaying all buffer and inode 3688 * Now that we've finished replaying all buffer and inode
3692 * updates, re-read in the superblock. 3689 * updates, re-read in the superblock and reverify it.
3693 */ 3690 */
3694 bp = xfs_getsb(log->l_mp, 0); 3691 bp = xfs_getsb(log->l_mp, 0);
3695 XFS_BUF_UNDONE(bp); 3692 XFS_BUF_UNDONE(bp);
3696 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3693 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3697 XFS_BUF_READ(bp); 3694 XFS_BUF_READ(bp);
3698 XFS_BUF_UNASYNC(bp); 3695 XFS_BUF_UNASYNC(bp);
3696 bp->b_ops = &xfs_sb_buf_ops;
3699 xfsbdstrat(log->l_mp, bp); 3697 xfsbdstrat(log->l_mp, bp);
3700 error = xfs_buf_iowait(bp); 3698 error = xfs_buf_iowait(bp);
3701 if (error) { 3699 if (error) {
@@ -3707,7 +3705,7 @@ xlog_do_recover(
3707 3705
3708 /* Convert superblock from on-disk format */ 3706 /* Convert superblock from on-disk format */
3709 sbp = &log->l_mp->m_sb; 3707 sbp = &log->l_mp->m_sb;
3710 xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); 3708 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3711 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3709 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3712 ASSERT(xfs_sb_good_version(sbp)); 3710 ASSERT(xfs_sb_good_version(sbp));
3713 xfs_buf_relse(bp); 3711 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b2bd3a0e6376..da508463ff10 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
42#include "xfs_fsops.h" 42#include "xfs_fsops.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45#include "xfs_icache.h"
45 46
46 47
47#ifdef HAVE_PERCPU_SB 48#ifdef HAVE_PERCPU_SB
@@ -303,9 +304,8 @@ STATIC int
303xfs_mount_validate_sb( 304xfs_mount_validate_sb(
304 xfs_mount_t *mp, 305 xfs_mount_t *mp,
305 xfs_sb_t *sbp, 306 xfs_sb_t *sbp,
306 int flags) 307 bool check_inprogress)
307{ 308{
308 int loud = !(flags & XFS_MFSI_QUIET);
309 309
310 /* 310 /*
311 * If the log device and data device have the 311 * If the log device and data device have the
@@ -315,21 +315,18 @@ xfs_mount_validate_sb(
315 * a volume filesystem in a non-volume manner. 315 * a volume filesystem in a non-volume manner.
316 */ 316 */
317 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 317 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
318 if (loud) 318 xfs_warn(mp, "bad magic number");
319 xfs_warn(mp, "bad magic number");
320 return XFS_ERROR(EWRONGFS); 319 return XFS_ERROR(EWRONGFS);
321 } 320 }
322 321
323 if (!xfs_sb_good_version(sbp)) { 322 if (!xfs_sb_good_version(sbp)) {
324 if (loud) 323 xfs_warn(mp, "bad version");
325 xfs_warn(mp, "bad version");
326 return XFS_ERROR(EWRONGFS); 324 return XFS_ERROR(EWRONGFS);
327 } 325 }
328 326
329 if (unlikely( 327 if (unlikely(
330 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 328 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
331 if (loud) 329 xfs_warn(mp,
332 xfs_warn(mp,
333 "filesystem is marked as having an external log; " 330 "filesystem is marked as having an external log; "
334 "specify logdev on the mount command line."); 331 "specify logdev on the mount command line.");
335 return XFS_ERROR(EINVAL); 332 return XFS_ERROR(EINVAL);
@@ -337,8 +334,7 @@ xfs_mount_validate_sb(
337 334
338 if (unlikely( 335 if (unlikely(
339 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 336 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
340 if (loud) 337 xfs_warn(mp,
341 xfs_warn(mp,
342 "filesystem is marked as having an internal log; " 338 "filesystem is marked as having an internal log; "
343 "do not specify logdev on the mount command line."); 339 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 340 return XFS_ERROR(EINVAL);
@@ -372,8 +368,7 @@ xfs_mount_validate_sb(
372 sbp->sb_dblocks == 0 || 368 sbp->sb_dblocks == 0 ||
373 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || 369 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
374 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { 370 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
375 if (loud) 371 XFS_CORRUPTION_ERROR("SB sanity check failed",
376 XFS_CORRUPTION_ERROR("SB sanity check failed",
377 XFS_ERRLEVEL_LOW, mp, sbp); 372 XFS_ERRLEVEL_LOW, mp, sbp);
378 return XFS_ERROR(EFSCORRUPTED); 373 return XFS_ERROR(EFSCORRUPTED);
379 } 374 }
@@ -382,12 +377,10 @@ xfs_mount_validate_sb(
382 * Until this is fixed only page-sized or smaller data blocks work. 377 * Until this is fixed only page-sized or smaller data blocks work.
383 */ 378 */
384 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 379 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
385 if (loud) { 380 xfs_warn(mp,
386 xfs_warn(mp,
387 "File system with blocksize %d bytes. " 381 "File system with blocksize %d bytes. "
388 "Only pagesize (%ld) or less will currently work.", 382 "Only pagesize (%ld) or less will currently work.",
389 sbp->sb_blocksize, PAGE_SIZE); 383 sbp->sb_blocksize, PAGE_SIZE);
390 }
391 return XFS_ERROR(ENOSYS); 384 return XFS_ERROR(ENOSYS);
392 } 385 }
393 386
@@ -401,23 +394,20 @@ xfs_mount_validate_sb(
401 case 2048: 394 case 2048:
402 break; 395 break;
403 default: 396 default:
404 if (loud) 397 xfs_warn(mp, "inode size of %d bytes not supported",
405 xfs_warn(mp, "inode size of %d bytes not supported",
406 sbp->sb_inodesize); 398 sbp->sb_inodesize);
407 return XFS_ERROR(ENOSYS); 399 return XFS_ERROR(ENOSYS);
408 } 400 }
409 401
410 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 402 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
411 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 403 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
412 if (loud) 404 xfs_warn(mp,
413 xfs_warn(mp,
414 "file system too large to be mounted on this system."); 405 "file system too large to be mounted on this system.");
415 return XFS_ERROR(EFBIG); 406 return XFS_ERROR(EFBIG);
416 } 407 }
417 408
418 if (unlikely(sbp->sb_inprogress)) { 409 if (check_inprogress && sbp->sb_inprogress) {
419 if (loud) 410 xfs_warn(mp, "Offline file system operation in progress!");
420 xfs_warn(mp, "file system busy");
421 return XFS_ERROR(EFSCORRUPTED); 411 return XFS_ERROR(EFSCORRUPTED);
422 } 412 }
423 413
@@ -425,9 +415,7 @@ xfs_mount_validate_sb(
425 * Version 1 directory format has never worked on Linux. 415 * Version 1 directory format has never worked on Linux.
426 */ 416 */
427 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 417 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
428 if (loud) 418 xfs_warn(mp, "file system using version 1 directory format");
429 xfs_warn(mp,
430 "file system using version 1 directory format");
431 return XFS_ERROR(ENOSYS); 419 return XFS_ERROR(ENOSYS);
432 } 420 }
433 421
@@ -520,11 +508,9 @@ out_unwind:
520 508
521void 509void
522xfs_sb_from_disk( 510xfs_sb_from_disk(
523 struct xfs_mount *mp, 511 struct xfs_sb *to,
524 xfs_dsb_t *from) 512 xfs_dsb_t *from)
525{ 513{
526 struct xfs_sb *to = &mp->m_sb;
527
528 to->sb_magicnum = be32_to_cpu(from->sb_magicnum); 514 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
529 to->sb_blocksize = be32_to_cpu(from->sb_blocksize); 515 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
530 to->sb_dblocks = be64_to_cpu(from->sb_dblocks); 516 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -626,6 +612,72 @@ xfs_sb_to_disk(
626 } 612 }
627} 613}
628 614
615static void
616xfs_sb_verify(
617 struct xfs_buf *bp)
618{
619 struct xfs_mount *mp = bp->b_target->bt_mount;
620 struct xfs_sb sb;
621 int error;
622
623 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
624
625 /*
626 * Only check the in progress field for the primary superblock as
627 * mkfs.xfs doesn't clear it from secondary superblocks.
628 */
629 error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
630 if (error)
631 xfs_buf_ioerror(bp, error);
632}
633
634static void
635xfs_sb_read_verify(
636 struct xfs_buf *bp)
637{
638 xfs_sb_verify(bp);
639}
640
641/*
642 * We may be probed for a filesystem match, so we may not want to emit
643 * messages when the superblock buffer is not actually an XFS superblock.
 644 * If we find an XFS superblock, then run a normal, noisy mount because we are
645 * really going to mount it and want to know about errors.
646 */
647static void
648xfs_sb_quiet_read_verify(
649 struct xfs_buf *bp)
650{
651 struct xfs_sb sb;
652
653 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
654
655 if (sb.sb_magicnum == XFS_SB_MAGIC) {
656 /* XFS filesystem, verify noisily! */
657 xfs_sb_read_verify(bp);
658 return;
659 }
660 /* quietly fail */
661 xfs_buf_ioerror(bp, EFSCORRUPTED);
662}
663
664static void
665xfs_sb_write_verify(
666 struct xfs_buf *bp)
667{
668 xfs_sb_verify(bp);
669}
670
671const struct xfs_buf_ops xfs_sb_buf_ops = {
672 .verify_read = xfs_sb_read_verify,
673 .verify_write = xfs_sb_write_verify,
674};
675
676static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
677 .verify_read = xfs_sb_quiet_read_verify,
678 .verify_write = xfs_sb_write_verify,
679};
680
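Taken together, the verifier hooks above give every buffer a per-type validation callback: verify_read runs after read I/O completes, verify_write runs before submission, and failures are signalled by stamping the buffer with xfs_buf_ioerror() rather than by a return value. A minimal sketch of the same pattern for some other metadata type might look like this (the my_* names are hypothetical, not part of this patch):

    static void
    my_read_verify(
    	struct xfs_buf	*bp)
    {
    	/* decode the on-disk magic and sanity check it */
    	if (be32_to_cpu(*(__be32 *)bp->b_addr) != MY_MAGIC)
    		xfs_buf_ioerror(bp, EFSCORRUPTED);	/* flag it, don't return */
    }

    static void
    my_write_verify(
    	struct xfs_buf	*bp)
    {
    	/* the same checks on the way out catch in-memory corruption */
    	my_read_verify(bp);
    }

    const struct xfs_buf_ops my_buf_ops = {
    	.verify_read	= my_read_verify,
    	.verify_write	= my_write_verify,
    };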
629/* 681/*
630 * xfs_readsb 682 * xfs_readsb
631 * 683 *
@@ -651,26 +703,27 @@ xfs_readsb(xfs_mount_t *mp, int flags)
651 703
652reread: 704reread:
653 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 705 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
654 BTOBB(sector_size), 0); 706 BTOBB(sector_size), 0,
707 loud ? &xfs_sb_buf_ops
708 : &xfs_sb_quiet_buf_ops);
655 if (!bp) { 709 if (!bp) {
656 if (loud) 710 if (loud)
657 xfs_warn(mp, "SB buffer read failed"); 711 xfs_warn(mp, "SB buffer read failed");
658 return EIO; 712 return EIO;
659 } 713 }
660 714 if (bp->b_error) {
661 /* 715 error = bp->b_error;
662 * Initialize the mount structure from the superblock.
663 * But first do some basic consistency checking.
664 */
665 xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
666 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
667 if (error) {
668 if (loud) 716 if (loud)
669 xfs_warn(mp, "SB validate failed"); 717 xfs_warn(mp, "SB validate failed");
670 goto release_buf; 718 goto release_buf;
671 } 719 }
672 720
673 /* 721 /*
722 * Initialize the mount structure from the superblock.
723 */
724 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
725
726 /*
674 * We must be able to do sector-sized and sector-aligned IO. 727 * We must be able to do sector-sized and sector-aligned IO.
675 */ 728 */
676 if (sector_size > mp->m_sb.sb_sectsize) { 729 if (sector_size > mp->m_sb.sb_sectsize) {
@@ -1001,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1001 } 1054 }
1002 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 1055 bp = xfs_buf_read_uncached(mp->m_ddev_targp,
1003 d - XFS_FSS_TO_BB(mp, 1), 1056 d - XFS_FSS_TO_BB(mp, 1),
1004 XFS_FSS_TO_BB(mp, 1), 0); 1057 XFS_FSS_TO_BB(mp, 1), 0, NULL);
1005 if (!bp) { 1058 if (!bp) {
1006 xfs_warn(mp, "last sector read failed"); 1059 xfs_warn(mp, "last sector read failed");
1007 return EIO; 1060 return EIO;
@@ -1016,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1016 } 1069 }
1017 bp = xfs_buf_read_uncached(mp->m_logdev_targp, 1070 bp = xfs_buf_read_uncached(mp->m_logdev_targp,
1018 d - XFS_FSB_TO_BB(mp, 1), 1071 d - XFS_FSB_TO_BB(mp, 1),
1019 XFS_FSB_TO_BB(mp, 1), 0); 1072 XFS_FSB_TO_BB(mp, 1), 0, NULL);
1020 if (!bp) { 1073 if (!bp) {
1021 xfs_warn(mp, "log device read failed"); 1074 xfs_warn(mp, "log device read failed");
1022 return EIO; 1075 return EIO;
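Note the new final argument to xfs_buf_read_uncached(): callers pass a verifier ops table, or NULL to opt out and check bp->b_error by hand. The size-check hunks above and the realtime hunks below all follow the same shape (daddr and bblen stand in for the computed disk address and length):

    bp = xfs_buf_read_uncached(mp->m_ddev_targp, daddr, bblen, 0, NULL);
    if (!bp)
    	return EIO;		/* buffer could not be set up at all */
    if (bp->b_error) {		/* I/O or verifier failure */
    	error = bp->b_error;
    	xfs_buf_relse(bp);
    	return error;
    }
    xfs_buf_relse(bp);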
@@ -1427,6 +1480,8 @@ xfs_unmountfs(
1427 __uint64_t resblks; 1480 __uint64_t resblks;
1428 int error; 1481 int error;
1429 1482
1483 cancel_delayed_work_sync(&mp->m_eofblocks_work);
1484
1430 xfs_qm_unmount_quotas(mp); 1485 xfs_qm_unmount_quotas(mp);
1431 xfs_rtunmount_inodes(mp); 1486 xfs_rtunmount_inodes(mp);
1432 IRELE(mp->m_rootip); 1487 IRELE(mp->m_rootip);
@@ -1450,21 +1505,16 @@ xfs_unmountfs(
1450 1505
1451 /* 1506 /*
1452 * And reclaim all inodes. At this point there should be no dirty 1507 * And reclaim all inodes. At this point there should be no dirty
1453 * inode, and none should be pinned or locked, but use synchronous 1508 * inodes and none should be pinned or locked, but use synchronous
1454 * reclaim just to be sure. 1509 * reclaim just to be sure. We can stop background inode reclaim
1510 * here as well if it is still running.
1455 */ 1511 */
1512 cancel_delayed_work_sync(&mp->m_reclaim_work);
1456 xfs_reclaim_inodes(mp, SYNC_WAIT); 1513 xfs_reclaim_inodes(mp, SYNC_WAIT);
1457 1514
1458 xfs_qm_unmount(mp); 1515 xfs_qm_unmount(mp);
1459 1516
1460 /* 1517 /*
1461 * Flush out the log synchronously so that we know for sure
1462 * that nothing is pinned. This is important because bflush()
1463 * will skip pinned buffers.
1464 */
1465 xfs_log_force(mp, XFS_LOG_SYNC);
1466
1467 /*
1468 * Unreserve any blocks we have so that when we unmount we don't account 1518 * Unreserve any blocks we have so that when we unmount we don't account
1469 * the reserved free space as used. This is really only necessary for 1519 * the reserved free space as used. This is really only necessary for
1470 * lazy superblock counting because it trusts the incore superblock 1520 * lazy superblock counting because it trusts the incore superblock
@@ -1489,23 +1539,6 @@ xfs_unmountfs(
1489 xfs_warn(mp, "Unable to update superblock counters. " 1539 xfs_warn(mp, "Unable to update superblock counters. "
1490 "Freespace may not be correct on next mount."); 1540 "Freespace may not be correct on next mount.");
1491 1541
1492 /*
1493 * At this point we might have modified the superblock again and thus
1494 * added an item to the AIL, thus flush it again.
1495 */
1496 xfs_ail_push_all_sync(mp->m_ail);
1497 xfs_wait_buftarg(mp->m_ddev_targp);
1498
1499 /*
1500 * The superblock buffer is uncached and xfsaild_push() will lock and
1501 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
1502 * here but a lock on the superblock buffer will block until iodone()
1503 * has completed.
1504 */
1505 xfs_buf_lock(mp->m_sb_bp);
1506 xfs_buf_unlock(mp->m_sb_bp);
1507
1508 xfs_log_unmount_write(mp);
1509 xfs_log_unmount(mp); 1542 xfs_log_unmount(mp);
1510 xfs_uuid_unmount(mp); 1543 xfs_uuid_unmount(mp);
1511 1544
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e534dc..bab8314507e4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations {
51 51
52#else /* __KERNEL__ */ 52#else /* __KERNEL__ */
53 53
54#include "xfs_sync.h"
55
56struct xlog; 54struct xlog;
57struct xfs_inode; 55struct xfs_inode;
58struct xfs_mru_cache; 56struct xfs_mru_cache;
@@ -197,9 +195,9 @@ typedef struct xfs_mount {
197 struct mutex m_icsb_mutex; /* balancer sync lock */ 195 struct mutex m_icsb_mutex; /* balancer sync lock */
198#endif 196#endif
199 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 197 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
200 struct delayed_work m_sync_work; /* background sync work */
201 struct delayed_work m_reclaim_work; /* background inode reclaim */ 198 struct delayed_work m_reclaim_work; /* background inode reclaim */
202 struct work_struct m_flush_work; /* background inode flush */ 199 struct delayed_work m_eofblocks_work; /* background eof blocks
200 trimming */
203 __int64_t m_update_flags; /* sb flags we need to update 201 __int64_t m_update_flags; /* sb flags we need to update
204 on the next remount,rw */ 202 on the next remount,rw */
205 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 203 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
@@ -209,6 +207,9 @@ typedef struct xfs_mount {
209 struct workqueue_struct *m_data_workqueue; 207 struct workqueue_struct *m_data_workqueue;
210 struct workqueue_struct *m_unwritten_workqueue; 208 struct workqueue_struct *m_unwritten_workqueue;
211 struct workqueue_struct *m_cil_workqueue; 209 struct workqueue_struct *m_cil_workqueue;
210 struct workqueue_struct *m_reclaim_workqueue;
211 struct workqueue_struct *m_log_workqueue;
212 struct workqueue_struct *m_eofblocks_workqueue;
212} xfs_mount_t; 213} xfs_mount_t;
213 214
214/* 215/*
@@ -387,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
387extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 388extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
388extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, 389extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
389 xfs_agnumber_t *); 390 xfs_agnumber_t *);
390extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); 391extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
391extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 392extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
392 393
394extern const struct xfs_buf_ops xfs_sb_buf_ops;
395
393#endif /* __XFS_MOUNT_H__ */ 396#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2e86fa0cfc0d..60eff4763156 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -40,6 +40,7 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h"
43 44
44/* 45/*
45 * The global quota manager. There is only one of these for the entire 46 * The global quota manager. There is only one of these for the entire
@@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs(
891 while (blkcnt--) { 892 while (blkcnt--) {
892 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 893 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
893 XFS_FSB_TO_DADDR(mp, bno), 894 XFS_FSB_TO_DADDR(mp, bno),
894 mp->m_quotainfo->qi_dqchunklen, 0, &bp); 895 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
896 &xfs_dquot_buf_ops);
895 if (error) 897 if (error)
896 break; 898 break;
897 899
@@ -978,7 +980,8 @@ xfs_qm_dqiterate(
978 while (rablkcnt--) { 980 while (rablkcnt--) {
979 xfs_buf_readahead(mp->m_ddev_targp, 981 xfs_buf_readahead(mp->m_ddev_targp,
980 XFS_FSB_TO_DADDR(mp, rablkno), 982 XFS_FSB_TO_DADDR(mp, rablkno),
981 mp->m_quotainfo->qi_dqchunklen); 983 mp->m_quotainfo->qi_dqchunklen,
984 NULL);
982 rablkno++; 985 rablkno++;
983 } 986 }
984 } 987 }
@@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one(
1453 int error; 1456 int error;
1454 1457
1455 if (!xfs_dqlock_nowait(dqp)) 1458 if (!xfs_dqlock_nowait(dqp))
1456 goto out_busy; 1459 goto out_move_tail;
1457 1460
1458 /* 1461 /*
1459 * This dquot has acquired a reference in the meantime; remove it from 1462
@@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one(
1476 * getting flushed to disk, we don't want to reclaim it. 1479 * getting flushed to disk, we don't want to reclaim it.
1477 */ 1480 */
1478 if (!xfs_dqflock_nowait(dqp)) 1481 if (!xfs_dqflock_nowait(dqp))
1479 goto out_busy; 1482 goto out_unlock_move_tail;
1480 1483
1481 if (XFS_DQ_IS_DIRTY(dqp)) { 1484 if (XFS_DQ_IS_DIRTY(dqp)) {
1482 struct xfs_buf *bp = NULL; 1485 struct xfs_buf *bp = NULL;
@@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one(
1487 if (error) { 1490 if (error) {
1488 xfs_warn(mp, "%s: dquot %p flush failed", 1491 xfs_warn(mp, "%s: dquot %p flush failed",
1489 __func__, dqp); 1492 __func__, dqp);
1490 goto out_busy; 1493 goto out_unlock_move_tail;
1491 } 1494 }
1492 1495
1493 xfs_buf_delwri_queue(bp, buffer_list); 1496 xfs_buf_delwri_queue(bp, buffer_list);
@@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one(
1496 * Give the dquot another try on the freelist, as the 1499 * Give the dquot another try on the freelist, as the
1497 * flushing will take some time. 1500 * flushing will take some time.
1498 */ 1501 */
1499 goto out_busy; 1502 goto out_unlock_move_tail;
1500 } 1503 }
1501 xfs_dqfunlock(dqp); 1504 xfs_dqfunlock(dqp);
1502 1505
@@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one(
1515 XFS_STATS_INC(xs_qm_dqreclaims); 1518 XFS_STATS_INC(xs_qm_dqreclaims);
1516 return; 1519 return;
1517 1520
1518out_busy:
1519 xfs_dqunlock(dqp);
1520
1521 /* 1521 /*
1522 * Move the dquot to the tail of the list so that we don't spin on it. 1522 * Move the dquot to the tail of the list so that we don't spin on it.
1523 */ 1523 */
1524out_unlock_move_tail:
1525 xfs_dqunlock(dqp);
1526out_move_tail:
1524 list_move_tail(&dqp->q_lru, &qi->qi_lru_list); 1527 list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
1525
1526 trace_xfs_dqreclaim_busy(dqp); 1528 trace_xfs_dqreclaim_busy(dqp);
1527 XFS_STATS_INC(xs_qm_dqreclaim_misses); 1529 XFS_STATS_INC(xs_qm_dqreclaim_misses);
1528} 1530}
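The relabelled exits make the lock state explicit: out_unlock_move_tail is for failures that still hold the dquot lock, out_move_tail for the one path where the trylock never succeeded. Stripped of the dquot details, the idiom looks like this (generic names, not code from this patch):

    	if (!spin_trylock(&obj->lock))
    		goto out_move_tail;		/* lock was never taken */
    	if (!can_make_progress(obj))
    		goto out_unlock_move_tail;	/* lock held, drop it first */
    	/* ... do the real work, return on success ... */
    	return;

    out_unlock_move_tail:
    	spin_unlock(&obj->lock);
    out_move_tail:
    	list_move_tail(&obj->lru, &lru_list);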
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 858a3b186110..5f53e75409b8 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -40,6 +40,7 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h"
43 44
44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 45STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 46STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
@@ -845,7 +846,8 @@ STATIC int
845xfs_dqrele_inode( 846xfs_dqrele_inode(
846 struct xfs_inode *ip, 847 struct xfs_inode *ip,
847 struct xfs_perag *pag, 848 struct xfs_perag *pag,
848 int flags) 849 int flags,
850 void *args)
849{ 851{
850 /* skip quota inodes */ 852 /* skip quota inodes */
851 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 853 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
@@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes(
881 uint flags) 883 uint flags)
882{ 884{
883 ASSERT(mp->m_quotainfo); 885 ASSERT(mp->m_quotainfo);
884 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); 886 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
885} 887}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca28a4ba4b54..98dc670d3ee0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -38,6 +38,7 @@
38#include "xfs_utils.h" 38#include "xfs_utils.h"
39#include "xfs_trace.h" 39#include "xfs_trace.h"
40#include "xfs_buf.h" 40#include "xfs_buf.h"
41#include "xfs_icache.h"
41 42
42 43
43/* 44/*
@@ -869,7 +870,7 @@ xfs_rtbuf_get(
869 ASSERT(map.br_startblock != NULLFSBLOCK); 870 ASSERT(map.br_startblock != NULLFSBLOCK);
870 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 871 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
871 XFS_FSB_TO_DADDR(mp, map.br_startblock), 872 XFS_FSB_TO_DADDR(mp, map.br_startblock),
872 mp->m_bsize, 0, &bp); 873 mp->m_bsize, 0, &bp, NULL);
873 if (error) 874 if (error)
874 return error; 875 return error;
875 ASSERT(!xfs_buf_geterror(bp)); 876 ASSERT(!xfs_buf_geterror(bp));
@@ -1872,9 +1873,14 @@ xfs_growfs_rt(
1872 */ 1873 */
1873 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 1874 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
1874 XFS_FSB_TO_BB(mp, nrblocks - 1), 1875 XFS_FSB_TO_BB(mp, nrblocks - 1),
1875 XFS_FSB_TO_BB(mp, 1), 0); 1876 XFS_FSB_TO_BB(mp, 1), 0, NULL);
1876 if (!bp) 1877 if (!bp)
1877 return EIO; 1878 return EIO;
1879 if (bp->b_error) {
1880 error = bp->b_error;
1881 xfs_buf_relse(bp);
1882 return error;
1883 }
1878 xfs_buf_relse(bp); 1884 xfs_buf_relse(bp);
1879 1885
1880 /* 1886 /*
@@ -2219,9 +2225,11 @@ xfs_rtmount_init(
2219 } 2225 }
2220 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 2226 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
2221 d - XFS_FSB_TO_BB(mp, 1), 2227 d - XFS_FSB_TO_BB(mp, 1),
2222 XFS_FSB_TO_BB(mp, 1), 0); 2228 XFS_FSB_TO_BB(mp, 1), 0, NULL);
2223 if (!bp) { 2229 if (!bp || bp->b_error) {
2224 xfs_warn(mp, "realtime device size check failed"); 2230 xfs_warn(mp, "realtime device size check failed");
2231 if (bp)
2232 xfs_buf_relse(bp);
2225 return EIO; 2233 return EIO;
2226 } 2234 }
2227 xfs_buf_relse(bp); 2235 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index f429d9d5d325..a05b45175fb0 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -81,6 +81,7 @@ struct xfs_mount;
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ 83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
84#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
84 85
85#define XFS_SB_VERSION2_OKREALFBITS \ 86#define XFS_SB_VERSION2_OKREALFBITS \
86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 87 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); 504 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504} 505}
505 506
507static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
508{
509 return (xfs_sb_version_hasmorebits(sbp) &&
510 (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
511}
512
506/* 513/*
507 * end of superblock version macros 514 * end of superblock version macros
508 */ 515 */
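xfs_sb_version_hascrc() follows the established features2 pattern: the bit only means anything when the MOREBITSBIT escape is set in sb_versionnum, hence the hasmorebits() guard. A caller would gate CRC-specific work on it, roughly like so (verify_metadata_crc is a made-up stand-in):

    if (xfs_sb_version_hascrc(&mp->m_sb))
    	error = verify_metadata_crc(mp, bp);	/* hypothetical helper */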
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26a09bd7f975..ab8839b26272 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
49#include "xfs_extfree_item.h" 49#include "xfs_extfree_item.h"
50#include "xfs_mru_cache.h" 50#include "xfs_mru_cache.h"
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
52#include "xfs_sync.h" 52#include "xfs_icache.h"
53#include "xfs_trace.h" 53#include "xfs_trace.h"
54 54
55#include <linux/namei.h> 55#include <linux/namei.h>
@@ -863,8 +863,30 @@ xfs_init_mount_workqueues(
863 WQ_MEM_RECLAIM, 0, mp->m_fsname); 863 WQ_MEM_RECLAIM, 0, mp->m_fsname);
864 if (!mp->m_cil_workqueue) 864 if (!mp->m_cil_workqueue)
865 goto out_destroy_unwritten; 865 goto out_destroy_unwritten;
866
867 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
868 WQ_NON_REENTRANT, 0, mp->m_fsname);
869 if (!mp->m_reclaim_workqueue)
870 goto out_destroy_cil;
871
872 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
873 WQ_NON_REENTRANT, 0, mp->m_fsname);
874 if (!mp->m_log_workqueue)
875 goto out_destroy_reclaim;
876
877 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
878 WQ_NON_REENTRANT, 0, mp->m_fsname);
879 if (!mp->m_eofblocks_workqueue)
880 goto out_destroy_log;
881
866 return 0; 882 return 0;
867 883
884out_destroy_log:
885 destroy_workqueue(mp->m_log_workqueue);
886out_destroy_reclaim:
887 destroy_workqueue(mp->m_reclaim_workqueue);
888out_destroy_cil:
889 destroy_workqueue(mp->m_cil_workqueue);
868out_destroy_unwritten: 890out_destroy_unwritten:
869 destroy_workqueue(mp->m_unwritten_workqueue); 891 destroy_workqueue(mp->m_unwritten_workqueue);
870out_destroy_data_iodone_queue: 892out_destroy_data_iodone_queue:
@@ -877,11 +899,32 @@ STATIC void
877xfs_destroy_mount_workqueues( 899xfs_destroy_mount_workqueues(
878 struct xfs_mount *mp) 900 struct xfs_mount *mp)
879{ 901{
902 destroy_workqueue(mp->m_eofblocks_workqueue);
903 destroy_workqueue(mp->m_log_workqueue);
904 destroy_workqueue(mp->m_reclaim_workqueue);
880 destroy_workqueue(mp->m_cil_workqueue); 905 destroy_workqueue(mp->m_cil_workqueue);
881 destroy_workqueue(mp->m_data_workqueue); 906 destroy_workqueue(mp->m_data_workqueue);
882 destroy_workqueue(mp->m_unwritten_workqueue); 907 destroy_workqueue(mp->m_unwritten_workqueue);
883} 908}
884 909
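The three new workqueues extend the existing unwind chain: every allocation failure jumps to a label that destroys everything allocated before it, in reverse order, so there is exactly one destroy call per successful alloc on every path. Reduced to a skeleton (names generic):

    a = alloc_workqueue("a", 0, 0);
    if (!a)
    	return -ENOMEM;
    b = alloc_workqueue("b", 0, 0);
    if (!b)
    	goto out_destroy_a;
    c = alloc_workqueue("c", 0, 0);
    if (!c)
    	goto out_destroy_b;
    return 0;

    out_destroy_b:
    	destroy_workqueue(b);
    out_destroy_a:
    	destroy_workqueue(a);
    	return -ENOMEM;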
910/*
911 * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
912 * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
913 * for IO to complete so that we effectively throttle multiple callers to the
914 * rate at which IO is completing.
915 */
916void
917xfs_flush_inodes(
918 struct xfs_mount *mp)
919{
920 struct super_block *sb = mp->m_super;
921
922 if (down_read_trylock(&sb->s_umount)) {
923 sync_inodes_sb(sb);
924 up_read(&sb->s_umount);
925 }
926}
927
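xfs_flush_inodes() takes the superblock umount lock with down_read_trylock() so that it quietly does nothing if an unmount is already underway. Its caller pattern shows up in the xfs_create() hunk further down: on ENOSPC, flush dirty data so outstanding delalloc reservations get written back, then retry the transaction reservation once:

    error = xfs_trans_reserve(tp, resblks, log_res, 0,
    			  XFS_TRANS_PERM_LOG_RES, log_count);
    if (error == ENOSPC) {
    	/* flush outstanding delalloc blocks and retry */
    	xfs_flush_inodes(mp);
    	error = xfs_trans_reserve(tp, resblks, log_res, 0,
    				  XFS_TRANS_PERM_LOG_RES, log_count);
    }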
885/* Catch misguided souls that try to use this interface on XFS */ 928/* Catch misguided souls that try to use this interface on XFS */
886STATIC struct inode * 929STATIC struct inode *
887xfs_fs_alloc_inode( 930xfs_fs_alloc_inode(
@@ -1006,9 +1049,8 @@ xfs_fs_put_super(
1006 struct xfs_mount *mp = XFS_M(sb); 1049 struct xfs_mount *mp = XFS_M(sb);
1007 1050
1008 xfs_filestream_unmount(mp); 1051 xfs_filestream_unmount(mp);
1009 cancel_delayed_work_sync(&mp->m_sync_work);
1010 xfs_unmountfs(mp); 1052 xfs_unmountfs(mp);
1011 xfs_syncd_stop(mp); 1053
1012 xfs_freesb(mp); 1054 xfs_freesb(mp);
1013 xfs_icsb_destroy_counters(mp); 1055 xfs_icsb_destroy_counters(mp);
1014 xfs_destroy_mount_workqueues(mp); 1056 xfs_destroy_mount_workqueues(mp);
@@ -1023,7 +1065,6 @@ xfs_fs_sync_fs(
1023 int wait) 1065 int wait)
1024{ 1066{
1025 struct xfs_mount *mp = XFS_M(sb); 1067 struct xfs_mount *mp = XFS_M(sb);
1026 int error;
1027 1068
1028 /* 1069 /*
1029 * Doing anything during the async pass would be counterproductive. 1070 * Doing anything during the async pass would be counterproductive.
@@ -1031,17 +1072,14 @@ xfs_fs_sync_fs(
1031 if (!wait) 1072 if (!wait)
1032 return 0; 1073 return 0;
1033 1074
1034 error = xfs_quiesce_data(mp); 1075 xfs_log_force(mp, XFS_LOG_SYNC);
1035 if (error)
1036 return -error;
1037
1038 if (laptop_mode) { 1076 if (laptop_mode) {
1039 /* 1077 /*
1040 * The disk must be active because we're syncing. 1078 * The disk must be active because we're syncing.
1041 * We schedule xfssyncd now (now that the disk is 1079 * We schedule log work now (now that the disk is
1042 * active) instead of later (when it might not be). 1080 * active) instead of later (when it might not be).
1043 */ 1081 */
1044 flush_delayed_work(&mp->m_sync_work); 1082 flush_delayed_work(&mp->m_log->l_work);
1045 } 1083 }
1046 1084
1047 return 0; 1085 return 0;
@@ -1118,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp)
1118 xfs_reserve_blocks(mp, &resblks, NULL); 1156 xfs_reserve_blocks(mp, &resblks, NULL);
1119} 1157}
1120 1158
1159/*
1160 * Trigger writeback of all the dirty metadata in the file system.
1161 *
1162 * This ensures that the metadata is written to its location on disk rather
1163 * than just existing in transactions in the log. This means after a quiesce
1164 * there is no log replay required to write the inodes to disk - this is the
1165 * primary difference between a sync and a quiesce.
1166 *
1167 * Note: xfs_log_quiesce() stops background log work - the callers must ensure
1168 * it is started again when appropriate.
1169 */
1170void
1171xfs_quiesce_attr(
1172 struct xfs_mount *mp)
1173{
1174 int error = 0;
1175
1176 /* wait for all modifications to complete */
1177 while (atomic_read(&mp->m_active_trans) > 0)
1178 delay(100);
1179
1180 /* force the log to unpin objects from the now complete transactions */
1181 xfs_log_force(mp, XFS_LOG_SYNC);
1182
1183 /* reclaim inodes to do any IO before the freeze completes */
1184 xfs_reclaim_inodes(mp, 0);
1185 xfs_reclaim_inodes(mp, SYNC_WAIT);
1186
1187 /* Push the superblock and write an unmount record */
1188 error = xfs_log_sbcount(mp);
1189 if (error)
1190 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
1191 "Frozen image may not be consistent.");
1192 /*
1193 * Just warn here till VFS can correctly support
1194 * read-only remount without racing.
1195 */
1196 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
1197
1198 xfs_log_quiesce(mp);
1199}
1200
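xfs_quiesce_attr() is now the single quiesce entry point for both freeze and the rw->ro remount below, and since xfs_log_quiesce() stops background log work, the unfreeze and ro->rw paths restart it with xfs_log_work_queue(). A sketch of the presumed freeze-side caller (the exact body may differ from this series):

    STATIC int
    xfs_fs_freeze(
    	struct super_block	*sb)
    {
    	struct xfs_mount	*mp = XFS_M(sb);

    	xfs_save_resvblks(mp);		/* as in the rw->ro remount path */
    	xfs_quiesce_attr(mp);
    	return -xfs_fs_log_dummy(mp);
    }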
1121STATIC int 1201STATIC int
1122xfs_fs_remount( 1202xfs_fs_remount(
1123 struct super_block *sb, 1203 struct super_block *sb,
@@ -1198,20 +1278,18 @@ xfs_fs_remount(
1198 * value if it is non-zero, otherwise go with the default. 1278 * value if it is non-zero, otherwise go with the default.
1199 */ 1279 */
1200 xfs_restore_resvblks(mp); 1280 xfs_restore_resvblks(mp);
1281 xfs_log_work_queue(mp);
1201 } 1282 }
1202 1283
1203 /* rw -> ro */ 1284 /* rw -> ro */
1204 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1285 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1205 /* 1286 /*
1206 * After we have synced the data but before we sync the 1287 * Before we sync the metadata, we need to free up the reserve
1207 * metadata, we need to free up the reserve block pool so that 1288 * block pool so that the used block count in the superblock on
1208 * the used block count in the superblock on disk is correct at 1289 * disk is correct at the end of the remount. Stash the current
1209 * the end of the remount. Stash the current reserve pool size 1290 * reserve pool size so that if we get remounted rw, we can
1210 * so that if we get remounted rw, we can return it to the same 1291 * return it to the same size.
1211 * size.
1212 */ 1292 */
1213
1214 xfs_quiesce_data(mp);
1215 xfs_save_resvblks(mp); 1293 xfs_save_resvblks(mp);
1216 xfs_quiesce_attr(mp); 1294 xfs_quiesce_attr(mp);
1217 mp->m_flags |= XFS_MOUNT_RDONLY; 1295 mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -1243,6 +1321,7 @@ xfs_fs_unfreeze(
1243 struct xfs_mount *mp = XFS_M(sb); 1321 struct xfs_mount *mp = XFS_M(sb);
1244 1322
1245 xfs_restore_resvblks(mp); 1323 xfs_restore_resvblks(mp);
1324 xfs_log_work_queue(mp);
1246 return 0; 1325 return 0;
1247} 1326}
1248 1327
@@ -1321,6 +1400,8 @@ xfs_fs_fill_super(
1321 spin_lock_init(&mp->m_sb_lock); 1400 spin_lock_init(&mp->m_sb_lock);
1322 mutex_init(&mp->m_growlock); 1401 mutex_init(&mp->m_growlock);
1323 atomic_set(&mp->m_active_trans, 0); 1402 atomic_set(&mp->m_active_trans, 0);
1403 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
1404 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
1324 1405
1325 mp->m_super = sb; 1406 mp->m_super = sb;
1326 sb->s_fs_info = mp; 1407 sb->s_fs_info = mp;
@@ -1371,10 +1452,6 @@ xfs_fs_fill_super(
1371 /* 1452 /*
1372 * we must configure the block size in the superblock before we run the 1453 * we must configure the block size in the superblock before we run the
1373 * full mount process as the mount process can lookup and cache inodes. 1454 * full mount process as the mount process can lookup and cache inodes.
1374 * For the same reason we must also initialise the syncd and register
1375 * the inode cache shrinker so that inodes can be reclaimed during
1376 * operations like a quotacheck that iterate all inodes in the
1377 * filesystem.
1378 */ 1455 */
1379 sb->s_magic = XFS_SB_MAGIC; 1456 sb->s_magic = XFS_SB_MAGIC;
1380 sb->s_blocksize = mp->m_sb.sb_blocksize; 1457 sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1384,13 +1461,9 @@ xfs_fs_fill_super(
1384 sb->s_time_gran = 1; 1461 sb->s_time_gran = 1;
1385 set_posix_acl_flag(sb); 1462 set_posix_acl_flag(sb);
1386 1463
1387 error = xfs_syncd_init(mp);
1388 if (error)
1389 goto out_filestream_unmount;
1390
1391 error = xfs_mountfs(mp); 1464 error = xfs_mountfs(mp);
1392 if (error) 1465 if (error)
1393 goto out_syncd_stop; 1466 goto out_filestream_unmount;
1394 1467
1395 root = igrab(VFS_I(mp->m_rootip)); 1468 root = igrab(VFS_I(mp->m_rootip));
1396 if (!root) { 1469 if (!root) {
@@ -1408,8 +1481,7 @@ xfs_fs_fill_super(
1408 } 1481 }
1409 1482
1410 return 0; 1483 return 0;
1411 out_syncd_stop: 1484
1412 xfs_syncd_stop(mp);
1413 out_filestream_unmount: 1485 out_filestream_unmount:
1414 xfs_filestream_unmount(mp); 1486 xfs_filestream_unmount(mp);
1415 out_free_sb: 1487 out_free_sb:
@@ -1429,7 +1501,6 @@ out_destroy_workqueues:
1429 out_unmount: 1501 out_unmount:
1430 xfs_filestream_unmount(mp); 1502 xfs_filestream_unmount(mp);
1431 xfs_unmountfs(mp); 1503 xfs_unmountfs(mp);
1432 xfs_syncd_stop(mp);
1433 goto out_free_sb; 1504 goto out_free_sb;
1434} 1505}
1435 1506
@@ -1625,16 +1696,6 @@ STATIC int __init
1625xfs_init_workqueues(void) 1696xfs_init_workqueues(void)
1626{ 1697{
1627 /* 1698 /*
1628 * We never want to the same work item to run twice, reclaiming inodes
1629 * or idling the log is not going to get any faster by multiple CPUs
1630 * competing for ressources. Use the default large max_active value
1631 * so that even lots of filesystems can perform these task in parallel.
1632 */
1633 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
1634 if (!xfs_syncd_wq)
1635 return -ENOMEM;
1636
1637 /*
1638 * The allocation workqueue can be used in memory reclaim situations 1699 * The allocation workqueue can be used in memory reclaim situations
1639 * (writepage path), and parallelism is only limited by the number of 1700 * (writepage path), and parallelism is only limited by the number of
1640 * AGs in all the filesystems mounted. Hence use the default large 1701 * AGs in all the filesystems mounted. Hence use the default large
@@ -1642,20 +1703,15 @@ xfs_init_workqueues(void)
1642 */ 1703 */
1643 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); 1704 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
1644 if (!xfs_alloc_wq) 1705 if (!xfs_alloc_wq)
1645 goto out_destroy_syncd; 1706 return -ENOMEM;
1646 1707
1647 return 0; 1708 return 0;
1648
1649out_destroy_syncd:
1650 destroy_workqueue(xfs_syncd_wq);
1651 return -ENOMEM;
1652} 1709}
1653 1710
1654STATIC void 1711STATIC void
1655xfs_destroy_workqueues(void) 1712xfs_destroy_workqueues(void)
1656{ 1713{
1657 destroy_workqueue(xfs_alloc_wq); 1714 destroy_workqueue(xfs_alloc_wq);
1658 destroy_workqueue(xfs_syncd_wq);
1659} 1715}
1660 1716
1661STATIC int __init 1717STATIC int __init
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 9de4a920ba05..bbe3d15a7904 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -74,6 +74,7 @@ struct block_device;
74 74
75extern __uint64_t xfs_max_file_offset(unsigned int); 75extern __uint64_t xfs_max_file_offset(unsigned int);
76 76
77extern void xfs_flush_inodes(struct xfs_mount *mp);
77extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 78extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
78extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); 79extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
79extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); 80extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa438..2801b5ce6cdb 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -202,6 +202,15 @@ static ctl_table xfs_table[] = {
202 .extra1 = &xfs_params.fstrm_timer.min, 202 .extra1 = &xfs_params.fstrm_timer.min,
203 .extra2 = &xfs_params.fstrm_timer.max, 203 .extra2 = &xfs_params.fstrm_timer.max,
204 }, 204 },
205 {
206 .procname = "speculative_prealloc_lifetime",
207 .data = &xfs_params.eofb_timer.val,
208 .maxlen = sizeof(int),
209 .mode = 0644,
210 .proc_handler = proc_dointvec_minmax,
211 .extra1 = &xfs_params.eofb_timer.min,
212 .extra2 = &xfs_params.eofb_timer.max,
213 },
205 /* please keep this the last entry */ 214 /* please keep this the last entry */
206#ifdef CONFIG_PROC_FS 215#ifdef CONFIG_PROC_FS
207 { 216 {
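Entries in this table surface under /proc/sys/fs/xfs/, so the new knob should appear as /proc/sys/fs/xfs/speculative_prealloc_lifetime. A userspace check might look like this (path and units inferred from the table above; the value is believed to be in seconds):

    #include <stdio.h>

    int main(void)
    {
    	FILE *f = fopen("/proc/sys/fs/xfs/speculative_prealloc_lifetime", "r");
    	int secs;

    	if (f && fscanf(f, "%d", &secs) == 1)
    		printf("eofblocks scan interval: %d\n", secs);
    	if (f)
    		fclose(f);
    	return 0;
    }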
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index b9937d450f8e..bd8e157c20ef 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -47,6 +47,7 @@ typedef struct xfs_param {
47 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ 47 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
48 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ 48 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
49 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ 49 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
50 xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */
50} xfs_param_t; 51} xfs_param_t;
51 52
52/* 53/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7d36ccf57f93..2e137d4a85ae 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); 96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
97DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); 97DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
98DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); 98DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
99DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
100DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
99 101
100DECLARE_EVENT_CLASS(xfs_perag_class, 102DECLARE_EVENT_CLASS(xfs_perag_class,
101 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, 103 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
@@ -130,6 +132,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
130DEFINE_PERAG_REF_EVENT(xfs_perag_put); 132DEFINE_PERAG_REF_EVENT(xfs_perag_put);
131DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 133DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
132DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 134DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
135DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
136DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
133 137
134TRACE_EVENT(xfs_attr_list_node_descend, 138TRACE_EVENT(xfs_attr_list_node_descend,
135 TP_PROTO(struct xfs_attr_list_context *ctx, 139 TP_PROTO(struct xfs_attr_list_context *ctx,
@@ -585,6 +589,10 @@ DEFINE_INODE_EVENT(xfs_update_time);
585DEFINE_INODE_EVENT(xfs_dquot_dqalloc); 589DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
586DEFINE_INODE_EVENT(xfs_dquot_dqdetach); 590DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
587 591
592DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
593DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
594DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
595
588DECLARE_EVENT_CLASS(xfs_iref_class, 596DECLARE_EVENT_CLASS(xfs_iref_class,
589 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), 597 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
590 TP_ARGS(ip, caller_ip), 598 TP_ARGS(ip, caller_ip),
@@ -1496,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
1496DEFINE_DIR2_EVENT(xfs_dir2_node_removename); 1504DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
1497DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); 1505DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
1498 1506
1507DECLARE_EVENT_CLASS(xfs_attr_class,
1508 TP_PROTO(struct xfs_da_args *args),
1509 TP_ARGS(args),
1510 TP_STRUCT__entry(
1511 __field(dev_t, dev)
1512 __field(xfs_ino_t, ino)
1513 __dynamic_array(char, name, args->namelen)
1514 __field(int, namelen)
1515 __field(int, valuelen)
1516 __field(xfs_dahash_t, hashval)
1517 __field(int, op_flags)
1518 ),
1519 TP_fast_assign(
1520 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1521 __entry->ino = args->dp->i_ino;
1522 if (args->namelen)
1523 memcpy(__get_str(name), args->name, args->namelen);
1524 __entry->namelen = args->namelen;
1525 __entry->valuelen = args->valuelen;
1526 __entry->hashval = args->hashval;
1527 __entry->op_flags = args->op_flags;
1528 ),
1529 TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
1530 "hashval 0x%x op_flags %s",
1531 MAJOR(__entry->dev), MINOR(__entry->dev),
1532 __entry->ino,
1533 __entry->namelen,
1534 __entry->namelen ? __get_str(name) : NULL,
1535 __entry->namelen,
1536 __entry->valuelen,
1537 __entry->hashval,
1538 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
1539)
1540
1499#define DEFINE_ATTR_EVENT(name) \ 1541#define DEFINE_ATTR_EVENT(name) \
1500DEFINE_EVENT(xfs_da_class, name, \ 1542DEFINE_EVENT(xfs_attr_class, name, \
1501 TP_PROTO(struct xfs_da_args *args), \ 1543 TP_PROTO(struct xfs_da_args *args), \
1502 TP_ARGS(args)) 1544 TP_ARGS(args))
1503DEFINE_ATTR_EVENT(xfs_attr_sf_add); 1545DEFINE_ATTR_EVENT(xfs_attr_sf_add);
@@ -1511,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
1511DEFINE_ATTR_EVENT(xfs_attr_leaf_add); 1553DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
1512DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); 1554DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
1513DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); 1555DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
1556DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
1514DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); 1557DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
1515DEFINE_ATTR_EVENT(xfs_attr_leaf_create); 1558DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
1559DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
1560DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
1516DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); 1561DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
1517DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); 1562DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
1563DEFINE_ATTR_EVENT(xfs_attr_leaf_remove);
1518DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); 1564DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
1519DEFINE_ATTR_EVENT(xfs_attr_leaf_split); 1565DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
1520DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); 1566DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
@@ -1526,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
1526DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); 1572DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
1527DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); 1573DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
1528DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); 1574DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
1575DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
1529 1576
1530DEFINE_ATTR_EVENT(xfs_attr_node_addname); 1577DEFINE_ATTR_EVENT(xfs_attr_node_addname);
1578DEFINE_ATTR_EVENT(xfs_attr_node_get);
1531DEFINE_ATTR_EVENT(xfs_attr_node_lookup); 1579DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
1532DEFINE_ATTR_EVENT(xfs_attr_node_replace); 1580DEFINE_ATTR_EVENT(xfs_attr_node_replace);
1533DEFINE_ATTR_EVENT(xfs_attr_node_removename); 1581DEFINE_ATTR_EVENT(xfs_attr_node_removename);
1534 1582
1583DEFINE_ATTR_EVENT(xfs_attr_fillstate);
1584DEFINE_ATTR_EVENT(xfs_attr_refillstate);
1585
1586DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
1587DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
1588DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
1589
1535#define DEFINE_DA_EVENT(name) \ 1590#define DEFINE_DA_EVENT(name) \
1536DEFINE_EVENT(xfs_da_class, name, \ 1591DEFINE_EVENT(xfs_da_class, name, \
1537 TP_PROTO(struct xfs_da_args *args), \ 1592 TP_PROTO(struct xfs_da_args *args), \
@@ -1550,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split);
1550DEFINE_DA_EVENT(xfs_da_node_remove); 1605DEFINE_DA_EVENT(xfs_da_node_remove);
1551DEFINE_DA_EVENT(xfs_da_node_rebalance); 1606DEFINE_DA_EVENT(xfs_da_node_rebalance);
1552DEFINE_DA_EVENT(xfs_da_node_unbalance); 1607DEFINE_DA_EVENT(xfs_da_node_unbalance);
1608DEFINE_DA_EVENT(xfs_da_node_toosmall);
1553DEFINE_DA_EVENT(xfs_da_swap_lastblock); 1609DEFINE_DA_EVENT(xfs_da_swap_lastblock);
1554DEFINE_DA_EVENT(xfs_da_grow_inode); 1610DEFINE_DA_EVENT(xfs_da_grow_inode);
1555DEFINE_DA_EVENT(xfs_da_shrink_inode); 1611DEFINE_DA_EVENT(xfs_da_shrink_inode);
1612DEFINE_DA_EVENT(xfs_da_fixhashpath);
1613DEFINE_DA_EVENT(xfs_da_path_shift);
1556 1614
1557DECLARE_EVENT_CLASS(xfs_dir2_space_class, 1615DECLARE_EVENT_CLASS(xfs_dir2_space_class,
1558 TP_PROTO(struct xfs_da_args *args, int idx), 1616 TP_PROTO(struct xfs_da_args *args, int idx),
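Each DEFINE_ATTR_EVENT(name) above generates a trace_<name>() call site bound to the new xfs_attr_class, so instrumenting a function is a one-liner at its entry point. The shape, illustratively (not a hunk from this patch):

    int
    xfs_attr_leaf_get(
    	xfs_da_args_t	*args)
    {
    	trace_xfs_attr_leaf_get(args);	/* fires the xfs_attr_class event */
    	/* ... the actual lookup follows ... */
    }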
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index db056544cbb5..c6c0601abd7a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -464,10 +464,7 @@ xfs_trans_get_buf(
464 int numblks, 464 int numblks,
465 uint flags) 465 uint flags)
466{ 466{
467 struct xfs_buf_map map = { 467 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
468 .bm_bn = blkno,
469 .bm_len = numblks,
470 };
471 return xfs_trans_get_buf_map(tp, target, &map, 1, flags); 468 return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
472} 469}
473 470
@@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp,
476 struct xfs_buftarg *target, 473 struct xfs_buftarg *target,
477 struct xfs_buf_map *map, int nmaps, 474 struct xfs_buf_map *map, int nmaps,
478 xfs_buf_flags_t flags, 475 xfs_buf_flags_t flags,
479 struct xfs_buf **bpp); 476 struct xfs_buf **bpp,
477 const struct xfs_buf_ops *ops);
480 478
481static inline int 479static inline int
482xfs_trans_read_buf( 480xfs_trans_read_buf(
@@ -486,13 +484,12 @@ xfs_trans_read_buf(
486 xfs_daddr_t blkno, 484 xfs_daddr_t blkno,
487 int numblks, 485 int numblks,
488 xfs_buf_flags_t flags, 486 xfs_buf_flags_t flags,
489 struct xfs_buf **bpp) 487 struct xfs_buf **bpp,
488 const struct xfs_buf_ops *ops)
490{ 489{
491 struct xfs_buf_map map = { 490 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
492 .bm_bn = blkno, 491 return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
493 .bm_len = numblks, 492 flags, bpp, ops);
494 };
495 return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp);
496} 493}
497 494
498struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); 495struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
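DEFINE_SINGLE_BUF_MAP replaces the two open-coded struct initializers; judging from how it is used here, it presumably expands to a single-map declaration along these lines (definition assumed, likely in xfs_buf.h):

    #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblks) \
    	struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblks) }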
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6311b99c267f..4fc17d479d42 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -257,7 +257,8 @@ xfs_trans_read_buf_map(
257 struct xfs_buf_map *map, 257 struct xfs_buf_map *map,
258 int nmaps, 258 int nmaps,
259 xfs_buf_flags_t flags, 259 xfs_buf_flags_t flags,
260 struct xfs_buf **bpp) 260 struct xfs_buf **bpp,
261 const struct xfs_buf_ops *ops)
261{ 262{
262 xfs_buf_t *bp; 263 xfs_buf_t *bp;
263 xfs_buf_log_item_t *bip; 264 xfs_buf_log_item_t *bip;
@@ -265,7 +266,7 @@ xfs_trans_read_buf_map(
265 266
266 *bpp = NULL; 267 *bpp = NULL;
267 if (!tp) { 268 if (!tp) {
268 bp = xfs_buf_read_map(target, map, nmaps, flags); 269 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
269 if (!bp) 270 if (!bp)
270 return (flags & XBF_TRYLOCK) ? 271 return (flags & XBF_TRYLOCK) ?
271 EAGAIN : XFS_ERROR(ENOMEM); 272 EAGAIN : XFS_ERROR(ENOMEM);
@@ -312,7 +313,9 @@ xfs_trans_read_buf_map(
312 if (!(XFS_BUF_ISDONE(bp))) { 313 if (!(XFS_BUF_ISDONE(bp))) {
313 trace_xfs_trans_read_buf_io(bp, _RET_IP_); 314 trace_xfs_trans_read_buf_io(bp, _RET_IP_);
314 ASSERT(!XFS_BUF_ISASYNC(bp)); 315 ASSERT(!XFS_BUF_ISASYNC(bp));
316 ASSERT(bp->b_iodone == NULL);
315 XFS_BUF_READ(bp); 317 XFS_BUF_READ(bp);
318 bp->b_ops = ops;
316 xfsbdstrat(tp->t_mountp, bp); 319 xfsbdstrat(tp->t_mountp, bp);
317 error = xfs_buf_iowait(bp); 320 error = xfs_buf_iowait(bp);
318 if (error) { 321 if (error) {
@@ -349,7 +352,7 @@ xfs_trans_read_buf_map(
349 return 0; 352 return 0;
350 } 353 }
351 354
352 bp = xfs_buf_read_map(target, map, nmaps, flags); 355 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
353 if (bp == NULL) { 356 if (bp == NULL) {
354 *bpp = NULL; 357 *bpp = NULL;
355 return (flags & XBF_TRYLOCK) ? 358 return (flags & XBF_TRYLOCK) ?
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2a5c637344b4..d95f565a390e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -47,6 +47,7 @@
47#include "xfs_filestream.h" 47#include "xfs_filestream.h"
48#include "xfs_vnodeops.h" 48#include "xfs_vnodeops.h"
49#include "xfs_trace.h" 49#include "xfs_trace.h"
50#include "xfs_icache.h"
50 51
51/* 52/*
52 * The maximum pathlen is 1024 bytes. Since the minimum file system 53 * The maximum pathlen is 1024 bytes. Since the minimum file system
@@ -79,7 +80,7 @@ xfs_readlink_bmap(
79 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); 80 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
80 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 81 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
81 82
82 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); 83 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
83 if (!bp) 84 if (!bp)
84 return XFS_ERROR(ENOMEM); 85 return XFS_ERROR(ENOMEM);
85 error = bp->b_error; 86 error = bp->b_error;
@@ -150,7 +151,7 @@ xfs_readlink(
150 * when the link count isn't zero and by xfs_dm_punch_hole() when 151 * when the link count isn't zero and by xfs_dm_punch_hole() when
151 * punching a hole to EOF. 152 * punching a hole to EOF.
152 */ 153 */
153STATIC int 154int
154xfs_free_eofblocks( 155xfs_free_eofblocks(
155 xfs_mount_t *mp, 156 xfs_mount_t *mp,
156 xfs_inode_t *ip, 157 xfs_inode_t *ip,
@@ -199,7 +200,7 @@ xfs_free_eofblocks(
199 if (need_iolock) { 200 if (need_iolock) {
200 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 201 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
201 xfs_trans_cancel(tp, 0); 202 xfs_trans_cancel(tp, 0);
202 return 0; 203 return EAGAIN;
203 } 204 }
204 } 205 }
205 206
@@ -237,6 +238,8 @@ xfs_free_eofblocks(
237 } else { 238 } else {
238 error = xfs_trans_commit(tp, 239 error = xfs_trans_commit(tp,
239 XFS_TRANS_RELEASE_LOG_RES); 240 XFS_TRANS_RELEASE_LOG_RES);
241 if (!error)
242 xfs_inode_clear_eofblocks_tag(ip);
240 } 243 }
241 244
242 xfs_iunlock(ip, XFS_ILOCK_EXCL); 245 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -425,19 +428,18 @@ xfs_release(
425 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 428 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
426 if (truncated) { 429 if (truncated) {
427 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); 430 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
428 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) 431 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
429 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 432 error = -filemap_flush(VFS_I(ip)->i_mapping);
433 if (error)
434 return error;
435 }
430 } 436 }
431 } 437 }
432 438
433 if (ip->i_d.di_nlink == 0) 439 if (ip->i_d.di_nlink == 0)
434 return 0; 440 return 0;
435 441
436 if ((S_ISREG(ip->i_d.di_mode) && 442 if (xfs_can_free_eofblocks(ip, false)) {
437 (VFS_I(ip)->i_size > 0 ||
438 (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
439 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
440 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
441 443
442 /* 444 /*
443 * If we can't get the iolock just skip truncating the blocks 445 * If we can't get the iolock just skip truncating the blocks
@@ -464,7 +466,7 @@ xfs_release(
464 return 0; 466 return 0;
465 467
466 error = xfs_free_eofblocks(mp, ip, true); 468 error = xfs_free_eofblocks(mp, ip, true);
467 if (error) 469 if (error && error != EAGAIN)
468 return error; 470 return error;
469 471
470 /* delalloc blocks after truncation means it really is dirty */ 472 /* delalloc blocks after truncation means it really is dirty */
@@ -513,13 +515,12 @@ xfs_inactive(
513 goto out; 515 goto out;
514 516
515 if (ip->i_d.di_nlink != 0) { 517 if (ip->i_d.di_nlink != 0) {
516 if ((S_ISREG(ip->i_d.di_mode) && 518 /*
517 (VFS_I(ip)->i_size > 0 || 519 * force is true because we are evicting an inode from the
518 (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && 520 * cache. Post-eof blocks must be freed, lest we end up with
519 (ip->i_df.if_flags & XFS_IFEXTENTS) && 521 * broken free space accounting.
520 (!(ip->i_d.di_flags & 522 */
521 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || 523 if (xfs_can_free_eofblocks(ip, true)) {
522 ip->i_delayed_blks != 0))) {
523 error = xfs_free_eofblocks(mp, ip, false); 524 error = xfs_free_eofblocks(mp, ip, false);
524 if (error) 525 if (error)
525 return VN_INACTIVE_CACHE; 526 return VN_INACTIVE_CACHE;
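The deleted open-coded conditions show what xfs_can_free_eofblocks() must consolidate: only regular files, only when there is something to trim (size, cached pages, or delalloc blocks), only with the extent list read in, and prealloc/append files only when forced and carrying delalloc blocks. A reconstruction from those deleted conditions alone (the real helper is added elsewhere in this series and may differ):

    bool
    xfs_can_free_eofblocks(
    	struct xfs_inode	*ip,
    	bool			force)
    {
    	if (!S_ISREG(ip->i_d.di_mode))
    		return false;
    	if (VFS_I(ip)->i_size == 0 &&
    	    VN_CACHED(VFS_I(ip)) == 0 &&
    	    ip->i_delayed_blks == 0)
    		return false;
    	if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
    		return false;
    	if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
    		return force && ip->i_delayed_blks != 0;
    	return true;
    }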
@@ -777,7 +778,7 @@ xfs_create(
777 XFS_TRANS_PERM_LOG_RES, log_count); 778 XFS_TRANS_PERM_LOG_RES, log_count);
778 if (error == ENOSPC) { 779 if (error == ENOSPC) {
779 /* flush outstanding delalloc blocks and retry */ 780 /* flush outstanding delalloc blocks and retry */
780 xfs_flush_inodes(dp); 781 xfs_flush_inodes(mp);
781 error = xfs_trans_reserve(tp, resblks, log_res, 0, 782 error = xfs_trans_reserve(tp, resblks, log_res, 0,
782 XFS_TRANS_PERM_LOG_RES, log_count); 783 XFS_TRANS_PERM_LOG_RES, log_count);
783 } 784 }
@@ -1957,12 +1958,11 @@ xfs_free_file_space(
1957 1958
1958 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 1959 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1959 ioffset = offset & ~(rounding - 1); 1960 ioffset = offset & ~(rounding - 1);
1960 1961 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1961 if (VN_CACHED(VFS_I(ip)) != 0) { 1962 ioffset, -1);
1962 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); 1963 if (error)
1963 if (error) 1964 goto out_unlock_iolock;
1964 goto out_unlock_iolock; 1965 truncate_pagecache_range(VFS_I(ip), ioffset, -1);
1965 }
1966 1966
1967 /* 1967 /*
1968 * Need to zero the stuff we're not freeing, on disk. 1968 * Need to zero the stuff we're not freeing, on disk.
@@ -2095,6 +2095,73 @@ xfs_free_file_space(
2095 return error; 2095 return error;
2096} 2096}
2097 2097
2098
2099STATIC int
2100xfs_zero_file_space(
2101 struct xfs_inode *ip,
2102 xfs_off_t offset,
2103 xfs_off_t len,
2104 int attr_flags)
2105{
2106 struct xfs_mount *mp = ip->i_mount;
2107 uint granularity;
2108 xfs_off_t start_boundary;
2109 xfs_off_t end_boundary;
2110 int error;
2111
2112 granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
2113
2114 /*
2115 * Round the range of extents we are going to convert inwards. If the
2116 * offset is aligned, then it doesn't get changed so we zero from the
2117 * start of the block offset points to.
2118 */
2119 start_boundary = round_up(offset, granularity);
2120 end_boundary = round_down(offset + len, granularity);
2121
2122 ASSERT(start_boundary >= offset);
2123 ASSERT(end_boundary <= offset + len);
2124
2125 if (!(attr_flags & XFS_ATTR_NOLOCK))
2126 xfs_ilock(ip, XFS_IOLOCK_EXCL);
2127
2128 if (start_boundary < end_boundary - 1) {
2129 /* punch out the page cache over the conversion range */
2130 truncate_pagecache_range(VFS_I(ip), start_boundary,
2131 end_boundary - 1);
2132 /* convert the blocks */
2133 error = xfs_alloc_file_space(ip, start_boundary,
2134 end_boundary - start_boundary - 1,
2135 XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
2136 attr_flags);
2137 if (error)
2138 goto out_unlock;
2139
2140 /* We've handled the interior of the range, now for the edges */
2141 if (start_boundary != offset)
2142 error = xfs_iozero(ip, offset, start_boundary - offset);
2143 if (error)
2144 goto out_unlock;
2145
2146 if (end_boundary != offset + len)
2147 error = xfs_iozero(ip, end_boundary,
2148 offset + len - end_boundary);
2149
2150 } else {
2151 /*
2152 * It's either a sub-granularity range or the range spanned lies
2153 * partially across two adjacent blocks.
2154 */
2155 error = xfs_iozero(ip, offset, len);
2156 }
2157
2158out_unlock:
2159 if (!(attr_flags & XFS_ATTR_NOLOCK))
2160 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
2161 return error;
2162
2163}
2164
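A worked example of the rounding above: with granularity 4096, offset 3000 and len 10000, start_boundary = round_up(3000, 4096) = 4096 and end_boundary = round_down(13000, 4096) = 12288, so blocks are converted over [4096, 12288) and xfs_iozero() handles the [3000, 4096) and [12288, 13000) edges. A quick userspace check of that arithmetic (the macros are generic re-implementations, not the kernel's):

    #include <assert.h>

    #define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))
    #define round_down(x, y)	(((x) / (y)) * (y))

    int main(void)
    {
    	long offset = 3000, len = 10000, gran = 4096;

    	assert(round_up(offset, gran) == 4096);
    	assert(round_down(offset + len, gran) == 12288);
    	return 0;
    }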
2098/* 2165/*
2099 * xfs_change_file_space() 2166 * xfs_change_file_space()
2100 * This routine allocates or frees disk space for the given file. 2167 * This routine allocates or frees disk space for the given file.
@@ -2120,10 +2187,8 @@ xfs_change_file_space(
2120 xfs_fsize_t fsize; 2187 xfs_fsize_t fsize;
2121 int setprealloc; 2188 int setprealloc;
2122 xfs_off_t startoffset; 2189 xfs_off_t startoffset;
2123 xfs_off_t llen;
2124 xfs_trans_t *tp; 2190 xfs_trans_t *tp;
2125 struct iattr iattr; 2191 struct iattr iattr;
2126 int prealloc_type;
2127 2192
2128 if (!S_ISREG(ip->i_d.di_mode)) 2193 if (!S_ISREG(ip->i_d.di_mode))
2129 return XFS_ERROR(EINVAL); 2194 return XFS_ERROR(EINVAL);
@@ -2141,12 +2206,30 @@ xfs_change_file_space(
2141 return XFS_ERROR(EINVAL); 2206 return XFS_ERROR(EINVAL);
2142 } 2207 }
2143 2208
2144 llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; 2209 /*
2210 * length of <= 0 for resv/unresv/zero is invalid. length for
2211 * alloc/free is ignored completely and we have no idea what userspace
2212 * might have set it to, so set it to zero to allow range
2213 * checks to pass.
2214 */
2215 switch (cmd) {
2216 case XFS_IOC_ZERO_RANGE:
2217 case XFS_IOC_RESVSP:
2218 case XFS_IOC_RESVSP64:
2219 case XFS_IOC_UNRESVSP:
2220 case XFS_IOC_UNRESVSP64:
2221 if (bf->l_len <= 0)
2222 return XFS_ERROR(EINVAL);
2223 break;
2224 default:
2225 bf->l_len = 0;
2226 break;
2227 }
2145 2228
2146 if (bf->l_start < 0 || 2229 if (bf->l_start < 0 ||
2147 bf->l_start > mp->m_super->s_maxbytes || 2230 bf->l_start > mp->m_super->s_maxbytes ||
2148 bf->l_start + llen < 0 || 2231 bf->l_start + bf->l_len < 0 ||
2149 bf->l_start + llen > mp->m_super->s_maxbytes) 2232 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
2150 return XFS_ERROR(EINVAL); 2233 return XFS_ERROR(EINVAL);
2151 2234
2152 bf->l_whence = 0; 2235 bf->l_whence = 0;
@@ -2154,29 +2237,20 @@ xfs_change_file_space(
2154 startoffset = bf->l_start; 2237 startoffset = bf->l_start;
2155 fsize = XFS_ISIZE(ip); 2238 fsize = XFS_ISIZE(ip);
2156 2239
2157 /*
2158 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
2159 * file space.
2160 * These calls do NOT zero the data space allocated to the file,
2161 * nor do they change the file size.
2162 *
2163 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
2164 * space.
2165 * These calls cause the new file data to be zeroed and the file
2166 * size to be changed.
2167 */
2168 setprealloc = clrprealloc = 0; 2240 setprealloc = clrprealloc = 0;
2169 prealloc_type = XFS_BMAPI_PREALLOC;
2170
2171 switch (cmd) { 2241 switch (cmd) {
2172 case XFS_IOC_ZERO_RANGE: 2242 case XFS_IOC_ZERO_RANGE:
2173 prealloc_type |= XFS_BMAPI_CONVERT; 2243 error = xfs_zero_file_space(ip, startoffset, bf->l_len,
2174 xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); 2244 attr_flags);
2175 /* FALLTHRU */ 2245 if (error)
2246 return error;
2247 setprealloc = 1;
2248 break;
2249
2176 case XFS_IOC_RESVSP: 2250 case XFS_IOC_RESVSP:
2177 case XFS_IOC_RESVSP64: 2251 case XFS_IOC_RESVSP64:
2178 error = xfs_alloc_file_space(ip, startoffset, bf->l_len, 2252 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
2179 prealloc_type, attr_flags); 2253 XFS_BMAPI_PREALLOC, attr_flags);
2180 if (error) 2254 if (error)
2181 return error; 2255 return error;
2182 setprealloc = 1; 2256 setprealloc = 1;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 447e146b2ba6..5163022d9808 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -48,14 +48,9 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
48int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); 48int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
49int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 49int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
50 int flags, struct attrlist_cursor_kern *cursor); 50 int flags, struct attrlist_cursor_kern *cursor);
51void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
52 xfs_off_t last, int fiopt);
53int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
54 xfs_off_t last, int fiopt);
55int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
56 xfs_off_t last, uint64_t flags, int fiopt);
57int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
58 51
52int xfs_iozero(struct xfs_inode *, loff_t, size_t);
59int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); 53int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
54int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
60 55
61#endif /* _XFS_VNODEOPS_H */ 56#endif /* _XFS_VNODEOPS_H */