Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_inode.c | 2
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Makefile | 3
-rw-r--r--  fs/adfs/super.c | 1
-rw-r--r--  fs/affs/inode.c | 2
-rw-r--r--  fs/affs/super.c | 1
-rw-r--r--  fs/afs/inode.c | 2
-rw-r--r--  fs/afs/internal.h | 1
-rw-r--r--  fs/afs/rxrpc.c | 12
-rw-r--r--  fs/befs/Makefile | 2
-rw-r--r--  fs/befs/befs.h | 3
-rw-r--r--  fs/befs/btree.c | 93
-rw-r--r--  fs/befs/datastream.c | 87
-rw-r--r--  fs/befs/debug.c | 74
-rw-r--r--  fs/befs/inode.c | 10
-rw-r--r--  fs/befs/io.c | 24
-rw-r--r--  fs/befs/linuxvfs.c | 113
-rw-r--r--  fs/bfs/inode.c | 2
-rw-r--r--  fs/binfmt_elf.c | 9
-rw-r--r--  fs/binfmt_misc.c | 1
-rw-r--r--  fs/bio-integrity.c | 84
-rw-r--r--  fs/bio.c | 3
-rw-r--r--  fs/block_dev.c | 6
-rw-r--r--  fs/btrfs/async-thread.c | 848
-rw-r--r--  fs/btrfs/async-thread.h | 121
-rw-r--r--  fs/btrfs/backref.c | 84
-rw-r--r--  fs/btrfs/btrfs_inode.h | 14
-rw-r--r--  fs/btrfs/compression.c | 2
-rw-r--r--  fs/btrfs/ctree.c | 11
-rw-r--r--  fs/btrfs/ctree.h | 73
-rw-r--r--  fs/btrfs/delayed-inode.c | 6
-rw-r--r--  fs/btrfs/delayed-ref.c | 29
-rw-r--r--  fs/btrfs/dev-replace.c | 79
-rw-r--r--  fs/btrfs/disk-io.c | 281
-rw-r--r--  fs/btrfs/extent-tree.c | 58
-rw-r--r--  fs/btrfs/extent_io.c | 15
-rw-r--r--  fs/btrfs/extent_map.c | 56
-rw-r--r--  fs/btrfs/extent_map.h | 10
-rw-r--r--  fs/btrfs/file.c | 160
-rw-r--r--  fs/btrfs/inode.c | 123
-rw-r--r--  fs/btrfs/ioctl.c | 210
-rw-r--r--  fs/btrfs/ordered-data.c | 68
-rw-r--r--  fs/btrfs/ordered-data.h | 6
-rw-r--r--  fs/btrfs/qgroup.c | 15
-rw-r--r--  fs/btrfs/raid56.c | 21
-rw-r--r--  fs/btrfs/reada.c | 4
-rw-r--r--  fs/btrfs/relocation.c | 2
-rw-r--r--  fs/btrfs/root-tree.c | 3
-rw-r--r--  fs/btrfs/scrub.c | 97
-rw-r--r--  fs/btrfs/send.c | 821
-rw-r--r--  fs/btrfs/super.c | 38
-rw-r--r--  fs/btrfs/sysfs.c | 33
-rw-r--r--  fs/btrfs/sysfs.h | 5
-rw-r--r--  fs/btrfs/transaction.c | 39
-rw-r--r--  fs/btrfs/tree-log.c | 236
-rw-r--r--  fs/btrfs/tree-log.h | 18
-rw-r--r--  fs/btrfs/volumes.c | 46
-rw-r--r--  fs/btrfs/volumes.h | 1
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/cachefiles/namei.c | 4
-rw-r--r--  fs/cachefiles/rdwr.c | 33
-rw-r--r--  fs/cifs/cifsfs.c | 5
-rw-r--r--  fs/coda/coda_int.h | 2
-rw-r--r--  fs/coda/inode.c | 5
-rw-r--r--  fs/compat.c | 162
-rw-r--r--  fs/compat_binfmt_elf.c | 5
-rw-r--r--  fs/compat_ioctl.c | 5
-rw-r--r--  fs/cramfs/inode.c | 4
-rw-r--r--  fs/dcache.c | 50
-rw-r--r--  fs/debugfs/inode.c | 7
-rw-r--r--  fs/devpts/inode.c | 1
-rw-r--r--  fs/direct-io.c | 19
-rw-r--r--  fs/dlm/ast.c | 3
-rw-r--r--  fs/dlm/dir.c | 4
-rw-r--r--  fs/dlm/dlm_internal.h | 2
-rw-r--r--  fs/dlm/lock.c | 7
-rw-r--r--  fs/dlm/lockspace.c | 8
-rw-r--r--  fs/dlm/member.c | 27
-rw-r--r--  fs/dlm/recover.c | 10
-rw-r--r--  fs/dlm/recoverd.c | 34
-rw-r--r--  fs/drop_caches.c | 16
-rw-r--r--  fs/ecryptfs/inode.c | 2
-rw-r--r--  fs/ecryptfs/super.c | 2
-rw-r--r--  fs/efivarfs/file.c | 13
-rw-r--r--  fs/efs/super.c | 3
-rw-r--r--  fs/exec.c | 8
-rw-r--r--  fs/exofs/inode.c | 2
-rw-r--r--  fs/ext2/inode.c | 2
-rw-r--r--  fs/ext2/super.c | 1
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext4/ext4.h | 11
-rw-r--r--  fs/ext4/ext4_jbd2.c | 10
-rw-r--r--  fs/ext4/extents.c | 818
-rw-r--r--  fs/ext4/extents_status.c | 28
-rw-r--r--  fs/ext4/extents_status.h | 9
-rw-r--r--  fs/ext4/file.c | 2
-rw-r--r--  fs/ext4/inode.c | 124
-rw-r--r--  fs/ext4/ioctl.c | 24
-rw-r--r--  fs/ext4/mballoc.c | 7
-rw-r--r--  fs/ext4/mballoc.h | 4
-rw-r--r--  fs/ext4/move_extent.c | 5
-rw-r--r--  fs/ext4/namei.c | 480
-rw-r--r--  fs/ext4/super.c | 40
-rw-r--r--  fs/ext4/xattr.c | 59
-rw-r--r--  fs/ext4/xattr.h | 6
-rw-r--r--  fs/f2fs/inode.c | 2
-rw-r--r--  fs/f2fs/super.c | 2
-rw-r--r--  fs/fat/inode.c | 4
-rw-r--r--  fs/fcntl.c | 37
-rw-r--r--  fs/file.c | 2
-rw-r--r--  fs/file_table.c | 2
-rw-r--r--  fs/filesystems.c | 2
-rw-r--r--  fs/freevxfs/vxfs_inode.c | 2
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 2
-rw-r--r--  fs/freevxfs/vxfs_super.c | 1
-rw-r--r--  fs/fs-writeback.c | 33
-rw-r--r--  fs/fuse/cuse.c | 5
-rw-r--r--  fs/fuse/dir.c | 119
-rw-r--r--  fs/fuse/file.c | 286
-rw-r--r--  fs/fuse/fuse_i.h | 22
-rw-r--r--  fs/fuse/inode.c | 32
-rw-r--r--  fs/gfs2/acl.c | 23
-rw-r--r--  fs/gfs2/acl.h | 2
-rw-r--r--  fs/gfs2/aops.c | 132
-rw-r--r--  fs/gfs2/bmap.c | 115
-rw-r--r--  fs/gfs2/bmap.h | 2
-rw-r--r--  fs/gfs2/dir.c | 23
-rw-r--r--  fs/gfs2/file.c | 13
-rw-r--r--  fs/gfs2/glock.c | 28
-rw-r--r--  fs/gfs2/glops.c | 2
-rw-r--r--  fs/gfs2/incore.h | 37
-rw-r--r--  fs/gfs2/inode.c | 75
-rw-r--r--  fs/gfs2/lock_dlm.c | 10
-rw-r--r--  fs/gfs2/log.c | 102
-rw-r--r--  fs/gfs2/lops.c | 85
-rw-r--r--  fs/gfs2/lops.h | 5
-rw-r--r--  fs/gfs2/main.c | 4
-rw-r--r--  fs/gfs2/meta_io.c | 14
-rw-r--r--  fs/gfs2/meta_io.h | 3
-rw-r--r--  fs/gfs2/ops_fstype.c | 89
-rw-r--r--  fs/gfs2/quota.c | 18
-rw-r--r--  fs/gfs2/recovery.c | 16
-rw-r--r--  fs/gfs2/recovery.h | 6
-rw-r--r--  fs/gfs2/rgrp.c | 32
-rw-r--r--  fs/gfs2/super.c | 41
-rw-r--r--  fs/gfs2/sys.c | 7
-rw-r--r--  fs/gfs2/trans.c | 29
-rw-r--r--  fs/gfs2/util.c | 101
-rw-r--r--  fs/gfs2/util.h | 31
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfs/super.c | 1
-rw-r--r--  fs/hfsplus/attributes.c | 2
-rw-r--r--  fs/hfsplus/extents.c | 16
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 2
-rw-r--r--  fs/hfsplus/super.c | 3
-rw-r--r--  fs/hostfs/hostfs_kern.c | 2
-rw-r--r--  fs/hpfs/inode.c | 2
-rw-r--r--  fs/hpfs/super.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 17
-rw-r--r--  fs/inode.c | 60
-rw-r--r--  fs/isofs/inode.c | 1
-rw-r--r--  fs/jbd2/commit.c | 77
-rw-r--r--  fs/jbd2/journal.c | 10
-rw-r--r--  fs/jbd2/transaction.c | 46
-rw-r--r--  fs/jffs2/fs.c | 4
-rw-r--r--  fs/jffs2/super.c | 1
-rw-r--r--  fs/jfs/inode.c | 4
-rw-r--r--  fs/jfs/super.c | 1
-rw-r--r--  fs/kernfs/Kconfig | 7
-rw-r--r--  fs/kernfs/dir.c | 753
-rw-r--r--  fs/kernfs/file.c | 22
-rw-r--r--  fs/kernfs/inode.c | 2
-rw-r--r--  fs/kernfs/kernfs-internal.h | 15
-rw-r--r--  fs/kernfs/mount.c | 39
-rw-r--r--  fs/kernfs/symlink.c | 6
-rw-r--r--  fs/locks.c | 389
-rw-r--r--  fs/logfs/readwrite.c | 2
-rw-r--r--  fs/mbcache.c | 540
-rw-r--r--  fs/minix/inode.c | 5
-rw-r--r--  fs/namei.c | 317
-rw-r--r--  fs/ncpfs/inode.c | 3
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r--  fs/nfs/inode.c | 2
-rw-r--r--  fs/nfs/nfs4super.c | 2
-rw-r--r--  fs/nfs/super.c | 2
-rw-r--r--  fs/nfsd/auth.c | 5
-rw-r--r--  fs/nfsd/vfs.c | 2
-rw-r--r--  fs/nilfs2/cpfile.c | 12
-rw-r--r--  fs/nilfs2/dat.c | 12
-rw-r--r--  fs/nilfs2/inode.c | 6
-rw-r--r--  fs/nilfs2/ioctl.c | 137
-rw-r--r--  fs/nilfs2/sufile.c | 295
-rw-r--r--  fs/nilfs2/sufile.h | 2
-rw-r--r--  fs/nilfs2/super.c | 1
-rw-r--r--  fs/nilfs2/the_nilfs.c | 10
-rw-r--r--  fs/notify/fanotify/fanotify.c | 63
-rw-r--r--  fs/notify/fanotify/fanotify.h | 34
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 197
-rw-r--r--  fs/ntfs/inode.c | 2
-rw-r--r--  fs/ntfs/super.c | 2
-rw-r--r--  fs/ocfs2/acl.c | 1
-rw-r--r--  fs/ocfs2/alloc.c | 3
-rw-r--r--  fs/ocfs2/aops.c | 7
-rw-r--r--  fs/ocfs2/aops.h | 5
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 2
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 28
-rw-r--r--  fs/ocfs2/dcache.c | 61
-rw-r--r--  fs/ocfs2/dcache.h | 12
-rw-r--r--  fs/ocfs2/dir.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 27
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 29
-rw-r--r--  fs/ocfs2/dlmglue.c | 44
-rw-r--r--  fs/ocfs2/dlmglue.h | 3
-rw-r--r--  fs/ocfs2/file.c | 69
-rw-r--r--  fs/ocfs2/inode.c | 61
-rw-r--r--  fs/ocfs2/inode.h | 17
-rw-r--r--  fs/ocfs2/ioctl.c | 5
-rw-r--r--  fs/ocfs2/journal.c | 6
-rw-r--r--  fs/ocfs2/journal.h | 11
-rw-r--r--  fs/ocfs2/locks.c | 2
-rw-r--r--  fs/ocfs2/move_extents.c | 7
-rw-r--r--  fs/ocfs2/namei.c | 8
-rw-r--r--  fs/ocfs2/ocfs2.h | 33
-rw-r--r--  fs/ocfs2/quota.h | 2
-rw-r--r--  fs/ocfs2/quota_global.c | 35
-rw-r--r--  fs/ocfs2/stackglue.c | 14
-rw-r--r--  fs/ocfs2/suballoc.c | 29
-rw-r--r--  fs/ocfs2/suballoc.h | 4
-rw-r--r--  fs/ocfs2/super.c | 55
-rw-r--r--  fs/ocfs2/sysfile.c | 3
-rw-r--r--  fs/ocfs2/xattr.c | 35
-rw-r--r--  fs/omfs/inode.c | 2
-rw-r--r--  fs/open.c | 29
-rw-r--r--  fs/openpromfs/inode.c | 1
-rw-r--r--  fs/posix_acl.c | 5
-rw-r--r--  fs/proc/Makefile | 1
-rw-r--r--  fs/proc/inode.c | 2
-rw-r--r--  fs/proc/internal.h | 7
-rw-r--r--  fs/proc/proc_devtree.c | 241
-rw-r--r--  fs/proc/root.c | 5
-rw-r--r--  fs/proc/stat.c | 2
-rw-r--r--  fs/proc/uptime.c | 2
-rw-r--r--  fs/pstore/inode.c | 1
-rw-r--r--  fs/pstore/platform.c | 1
-rw-r--r--  fs/pstore/ram.c | 19
-rw-r--r--  fs/pstore/ram_core.c | 4
-rw-r--r--  fs/qnx4/inode.c | 1
-rw-r--r--  fs/qnx6/inode.c | 1
-rw-r--r--  fs/quota/dquot.c | 4
-rw-r--r--  fs/read_write.c | 36
-rw-r--r--  fs/reiserfs/inode.c | 2
-rw-r--r--  fs/reiserfs/reiserfs.h | 1
-rw-r--r--  fs/reiserfs/super.c | 4
-rw-r--r--  fs/romfs/super.c | 1
-rw-r--r--  fs/squashfs/super.c | 1
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/sysfs/Kconfig | 1
-rw-r--r--  fs/sysfs/dir.c | 44
-rw-r--r--  fs/sysfs/file.c | 23
-rw-r--r--  fs/sysfs/group.c | 7
-rw-r--r--  fs/sysfs/mount.c | 2
-rw-r--r--  fs/sysv/inode.c | 3
-rw-r--r--  fs/timerfd.c | 1
-rw-r--r--  fs/ubifs/super.c | 3
-rw-r--r--  fs/udf/inode.c | 4
-rw-r--r--  fs/udf/super.c | 1
-rw-r--r--  fs/ufs/inode.c | 2
-rw-r--r--  fs/ufs/super.c | 1
-rw-r--r--  fs/xfs/kmem.c | 21
-rw-r--r--  fs/xfs/xfs_acl.c | 2
-rw-r--r--  fs/xfs/xfs_ag.h | 6
-rw-r--r--  fs/xfs/xfs_alloc.c | 45
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 16
-rw-r--r--  fs/xfs/xfs_aops.c | 84
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 17
-rw-r--r--  fs/xfs/xfs_attr_remote.c | 15
-rw-r--r--  fs/xfs/xfs_bmap.c | 193
-rw-r--r--  fs/xfs/xfs_bmap.h | 15
-rw-r--r--  fs/xfs/xfs_bmap_btree.c | 16
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 97
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 2
-rw-r--r--  fs/xfs/xfs_btree.c | 14
-rw-r--r--  fs/xfs/xfs_buf.c | 11
-rw-r--r--  fs/xfs/xfs_buf.h | 14
-rw-r--r--  fs/xfs/xfs_buf_item.c | 19
-rw-r--r--  fs/xfs/xfs_da_btree.c | 19
-rw-r--r--  fs/xfs/xfs_dinode.h | 2
-rw-r--r--  fs/xfs/xfs_dir2.c | 342
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 17
-rw-r--r--  fs/xfs/xfs_dir2_data.c | 20
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 17
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 17
-rw-r--r--  fs/xfs/xfs_dquot.c | 2
-rw-r--r--  fs/xfs/xfs_dquot_buf.c | 11
-rw-r--r--  fs/xfs/xfs_error.c | 27
-rw-r--r--  fs/xfs/xfs_error.h | 1
-rw-r--r--  fs/xfs/xfs_file.c | 26
-rw-r--r--  fs/xfs/xfs_format.h | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 36
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 16
-rw-r--r--  fs/xfs/xfs_inode.c | 123
-rw-r--r--  fs/xfs/xfs_inode.h | 12
-rw-r--r--  fs/xfs/xfs_inode_buf.c | 7
-rw-r--r--  fs/xfs/xfs_iomap.c | 10
-rw-r--r--  fs/xfs/xfs_iops.c | 30
-rw-r--r--  fs/xfs/xfs_linux.h | 2
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_log_cil.c | 55
-rw-r--r--  fs/xfs/xfs_mount.c | 3
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 2
-rw-r--r--  fs/xfs/xfs_sb.c | 17
-rw-r--r--  fs/xfs/xfs_sb.h | 2
-rw-r--r--  fs/xfs/xfs_shared.h | 4
-rw-r--r--  fs/xfs/xfs_super.c | 3
-rw-r--r--  fs/xfs/xfs_symlink.c | 9
-rw-r--r--  fs/xfs/xfs_symlink_remote.c | 16
-rw-r--r--  fs/xfs/xfs_trace.h | 1
-rw-r--r--  fs/xfs/xfs_trans.c | 12
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 11
-rw-r--r--  fs/xfs/xfs_trans_resv.c | 82
-rw-r--r--  fs/xfs/xfs_trans_resv.h | 3
322 files changed, 8693 insertions(+), 4933 deletions(-)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index bb7991c7e5c7..53161ec058a7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -451,7 +451,7 @@ void v9fs_evict_inode(struct inode *inode)
 {
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	truncate_inode_pages(inode->i_mapping, 0);
+	truncate_inode_pages_final(inode->i_mapping);
 	clear_inode(inode);
 	filemap_fdatawrite(inode->i_mapping);
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 7385e54be4b9..312393f32948 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -96,6 +96,7 @@ endif # BLOCK
96menu "Pseudo filesystems" 96menu "Pseudo filesystems"
97 97
98source "fs/proc/Kconfig" 98source "fs/proc/Kconfig"
99source "fs/kernfs/Kconfig"
99source "fs/sysfs/Kconfig" 100source "fs/sysfs/Kconfig"
100 101
101config TMPFS 102config TMPFS
diff --git a/fs/Makefile b/fs/Makefile
index 47ac07bb4acc..f9cb9876e466 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -52,7 +52,8 @@ obj-$(CONFIG_FHANDLE) += fhandle.o
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
-obj-$(CONFIG_SYSFS)		+= sysfs/ kernfs/
+obj-$(CONFIG_KERNFS)		+= kernfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7b3003cb6f1b..952aeb048349 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -212,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options)
 
 static int adfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 	return parse_options(sb, data);
 }
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0e092d08680e..96df91e8c334 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -259,7 +259,7 @@ affs_evict_inode(struct inode *inode)
 {
 	unsigned long cache_page;
 	pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	if (!inode->i_nlink) {
 		inode->i_size = 0;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d098731b82ff..307453086c3f 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -530,6 +530,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 
 	pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
 
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 
 	memcpy(volume, sbi->s_volume, 32);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index ce25d755b7aa..294671288449 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -422,7 +422,7 @@ void afs_evict_inode(struct inode *inode)
 
 	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 
 	afs_give_up_callback(vnode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6621f8008122..be75b500005d 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -75,6 +75,7 @@ struct afs_call {
 	const struct afs_call_type *type;	/* type of call */
 	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
 	wait_queue_head_t	waitq;		/* processes awaiting completion */
+	work_func_t		async_workfn;
 	struct work_struct	async_work;	/* asynchronous work processor */
 	struct work_struct	work;		/* actual work processor */
 	struct sk_buff_head	rx_queue;	/* received packets */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8ad8c2a0703a..ef943df73b8c 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -644,7 +644,7 @@ static void afs_process_async_call(struct work_struct *work)
 
 	/* we can't just delete the call because the work item may be
 	 * queued */
-	PREPARE_WORK(&call->async_work, afs_delete_async_call);
+	call->async_workfn = afs_delete_async_call;
 	queue_work(afs_async_calls, &call->async_work);
 	}
 
@@ -663,6 +663,13 @@ void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
 	call->reply_size += len;
 }
 
+static void afs_async_workfn(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, async_work);
+
+	call->async_workfn(work);
+}
+
 /*
  * accept the backlog of incoming calls
  */
@@ -685,7 +692,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
 		return;
 	}
 
-	INIT_WORK(&call->async_work, afs_process_async_call);
+	call->async_workfn = afs_process_async_call;
+	INIT_WORK(&call->async_work, afs_async_workfn);
 	call->wait_mode = &afs_async_incoming_call;
 	call->type = &afs_RXCMxxxx;
 	init_waitqueue_head(&call->waitq);
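
The afs/rxrpc.c hunks above work around the removal of PREPARE_WORK(): rather than re-pointing the work item at a new handler, every call keeps one fixed dispatcher (afs_async_workfn) that forwards to a swappable async_workfn pointer. A minimal userland sketch of the same trampoline pattern follows; struct call, dispatch and the handler names are illustrative stand-ins, not kernel API.

#include <stdio.h>

struct call {
	void (*workfn)(struct call *);	/* swappable behaviour */
};

static void process_call(struct call *c) { (void)c; puts("process"); }
static void delete_call(struct call *c)  { (void)c; puts("delete"); }

/* Fixed entry point: the analogue of afs_async_workfn(). */
static void dispatch(struct call *c)
{
	c->workfn(c);
}

int main(void)
{
	struct call c = { .workfn = process_call };

	dispatch(&c);			/* runs process_call */
	c.workfn = delete_call;		/* re-target without re-initialising */
	dispatch(&c);			/* runs delete_call */
	return 0;
}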
diff --git a/fs/befs/Makefile b/fs/befs/Makefile
index 2f370bd7a50d..8b9f66642a83 100644
--- a/fs/befs/Makefile
+++ b/fs/befs/Makefile
@@ -3,5 +3,5 @@
 #
 
 obj-$(CONFIG_BEFS_FS) += befs.o
-
+ccflags-$(CONFIG_BEFS_DEBUG) += -DDEBUG
 befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index b26642839156..3a7813ab8c95 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -88,8 +88,11 @@ enum befs_err {
 
 /****************************/
 /* debug.c */
+__printf(2, 3)
 void befs_error(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
 void befs_warning(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
 void befs_debug(const struct super_block *sb, const char *fmt, ...);
 
 void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 74e397db0b8b..a2cd305a993a 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -137,7 +137,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 	struct buffer_head *bh = NULL;
 	befs_disk_btree_super *od_sup = NULL;
 
-	befs_debug(sb, "---> befs_btree_read_super()");
+	befs_debug(sb, "---> %s", __func__);
 
 	bh = befs_read_datastream(sb, ds, 0, NULL);
 
@@ -162,11 +162,11 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 		goto error;
 	}
 
-	befs_debug(sb, "<--- befs_btree_read_super()");
+	befs_debug(sb, "<--- %s", __func__);
 	return BEFS_OK;
 
   error:
-	befs_debug(sb, "<--- befs_btree_read_super() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
@@ -195,16 +195,16 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
 {
 	uint off = 0;
 
-	befs_debug(sb, "---> befs_bt_read_node()");
+	befs_debug(sb, "---> %s", __func__);
 
 	if (node->bh)
 		brelse(node->bh);
 
 	node->bh = befs_read_datastream(sb, ds, node_off, &off);
 	if (!node->bh) {
-		befs_error(sb, "befs_bt_read_node() failed to read "
-			   "node at %Lu", node_off);
-		befs_debug(sb, "<--- befs_bt_read_node() ERROR");
+		befs_error(sb, "%s failed to read "
+			   "node at %llu", __func__, node_off);
+		befs_debug(sb, "<--- %s ERROR", __func__);
 
 		return BEFS_ERR;
 	}
@@ -221,7 +221,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
 	node->head.all_key_length =
 	    fs16_to_cpu(sb, node->od_node->all_key_length);
 
-	befs_debug(sb, "<--- befs_btree_read_node()");
+	befs_debug(sb, "<--- %s", __func__);
 	return BEFS_OK;
 }
 
@@ -252,7 +252,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 	befs_off_t node_off;
 	int res;
 
-	befs_debug(sb, "---> befs_btree_find() Key: %s", key);
+	befs_debug(sb, "---> %s Key: %s", __func__, key);
 
 	if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
 		befs_error(sb,
@@ -263,7 +263,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 	this_node = kmalloc(sizeof (befs_btree_node),
 			GFP_NOFS);
 	if (!this_node) {
-		befs_error(sb, "befs_btree_find() failed to allocate %u "
+		befs_error(sb, "befs_btree_find() failed to allocate %zu "
 			   "bytes of memory", sizeof (befs_btree_node));
 		goto error;
 	}
@@ -274,7 +274,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 	node_off = bt_super.root_node_ptr;
 	if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
 		befs_error(sb, "befs_btree_find() failed to read "
-			   "node at %Lu", node_off);
+			   "node at %llu", node_off);
 		goto error_alloc;
 	}
 
@@ -285,7 +285,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 		/* if no match, go to overflow node */
 		if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
 			befs_error(sb, "befs_btree_find() failed to read "
-				   "node at %Lu", node_off);
+				   "node at %llu", node_off);
 			goto error_alloc;
 		}
 	}
@@ -298,11 +298,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 	kfree(this_node);
 
 	if (res != BEFS_BT_MATCH) {
-		befs_debug(sb, "<--- befs_btree_find() Key %s not found", key);
+		befs_debug(sb, "<--- %s Key %s not found", __func__, key);
 		*value = 0;
 		return BEFS_BT_NOT_FOUND;
 	}
-	befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu",
+	befs_debug(sb, "<--- %s Found key %s, value %llu", __func__,
 		   key, *value);
 	return BEFS_OK;
 
@@ -310,7 +310,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 	kfree(this_node);
   error:
 	*value = 0;
-	befs_debug(sb, "<--- befs_btree_find() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
@@ -343,7 +343,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 	char *thiskey;
 	fs64 *valarray;
 
-	befs_debug(sb, "---> befs_find_key() %s", findkey);
+	befs_debug(sb, "---> %s %s", __func__, findkey);
 
 	*value = 0;
 
@@ -355,7 +355,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 
 	eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len);
 	if (eq < 0) {
-		befs_debug(sb, "<--- befs_find_key() %s not found", findkey);
+		befs_debug(sb, "<--- %s %s not found", __func__, findkey);
 		return BEFS_BT_NOT_FOUND;
 	}
 
@@ -373,8 +373,8 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 				   findkey_len);
 
 		if (eq == 0) {
-			befs_debug(sb, "<--- befs_find_key() found %s at %d",
-				   thiskey, mid);
+			befs_debug(sb, "<--- %s found %s at %d",
+				   __func__, thiskey, mid);
 
 			*value = fs64_to_cpu(sb, valarray[mid]);
 			return BEFS_BT_MATCH;
@@ -388,7 +388,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 		*value = fs64_to_cpu(sb, valarray[mid + 1]);
 	else
 		*value = fs64_to_cpu(sb, valarray[mid]);
-	befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid);
+	befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid);
 	return BEFS_BT_PARMATCH;
 }
 
@@ -428,7 +428,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 
 	uint key_sum = 0;
 
-	befs_debug(sb, "---> befs_btree_read()");
+	befs_debug(sb, "---> %s", __func__);
 
 	if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
 		befs_error(sb,
@@ -437,7 +437,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 	}
 
 	if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
-		befs_error(sb, "befs_btree_read() failed to allocate %u "
+		befs_error(sb, "befs_btree_read() failed to allocate %zu "
 			   "bytes of memory", sizeof (befs_btree_node));
 		goto error;
 	}
@@ -452,7 +452,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 		kfree(this_node);
 		*value = 0;
 		*keysize = 0;
-		befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY");
+		befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
 		return BEFS_BT_EMPTY;
 	} else if (res == BEFS_ERR) {
 		goto error_alloc;
@@ -467,7 +467,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 			*keysize = 0;
 			*value = 0;
 			befs_debug(sb,
-				   "<--- befs_btree_read() END of keys at %Lu",
+				   "<--- %s END of keys at %llu", __func__,
+				   (unsigned long long)
 				   key_sum + this_node->head.all_key_count);
 			brelse(this_node->bh);
 			kfree(this_node);
@@ -478,8 +479,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 		node_off = this_node->head.right;
 
 		if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
-			befs_error(sb, "befs_btree_read() failed to read "
-				   "node at %Lu", node_off);
+			befs_error(sb, "%s failed to read node at %llu",
+				   __func__, (unsigned long long)node_off);
 			goto error_alloc;
 		}
 	}
@@ -492,11 +493,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 
 	keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen);
 
-	befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen);
+	befs_debug(sb, "Read [%llu,%d]: keysize %d",
+		   (long long unsigned int)node_off, (int)cur_key,
+		   (int)keylen);
 
 	if (bufsize < keylen + 1) {
-		befs_error(sb, "befs_btree_read() keybuf too small (%u) "
-			   "for key of size %d", bufsize, keylen);
+		befs_error(sb, "%s keybuf too small (%zu) "
+			   "for key of size %d", __func__, bufsize, keylen);
 		brelse(this_node->bh);
 		goto error_alloc;
 	};
@@ -506,13 +509,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 	*keysize = keylen;
 	keybuf[keylen] = '\0';
 
-	befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off,
+	befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off,
 		   cur_key, keylen, keybuf, *value);
 
 	brelse(this_node->bh);
 	kfree(this_node);
 
-	befs_debug(sb, "<--- befs_btree_read()");
+	befs_debug(sb, "<--- %s", __func__);
 
 	return BEFS_OK;
 
@@ -522,7 +525,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
   error:
 	*keysize = 0;
 	*value = 0;
-	befs_debug(sb, "<--- befs_btree_read() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
@@ -547,26 +550,26 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
 		    befs_off_t * node_off)
 {
 
-	befs_debug(sb, "---> befs_btree_seekleaf()");
+	befs_debug(sb, "---> %s", __func__);
 
 	if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
-		befs_error(sb, "befs_btree_seekleaf() failed to read "
-			   "node at %Lu", *node_off);
+		befs_error(sb, "%s failed to read "
+			   "node at %llu", __func__, *node_off);
 		goto error;
 	}
-	befs_debug(sb, "Seekleaf to root node %Lu", *node_off);
+	befs_debug(sb, "Seekleaf to root node %llu", *node_off);
 
 	if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) {
-		befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY");
+		befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
 		return BEFS_BT_EMPTY;
 	}
 
 	while (!befs_leafnode(this_node)) {
 
 		if (this_node->head.all_key_count == 0) {
-			befs_debug(sb, "befs_btree_seekleaf() encountered "
-				   "an empty interior node: %Lu. Using Overflow "
-				   "node: %Lu", *node_off,
+			befs_debug(sb, "%s encountered "
+				   "an empty interior node: %llu. Using Overflow "
+				   "node: %llu", __func__, *node_off,
 				   this_node->head.overflow);
 			*node_off = this_node->head.overflow;
 		} else {
@@ -574,19 +577,19 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
 			*node_off = fs64_to_cpu(sb, valarray[0]);
 		}
 		if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
-			befs_error(sb, "befs_btree_seekleaf() failed to read "
-				   "node at %Lu", *node_off);
+			befs_error(sb, "%s failed to read "
+				   "node at %llu", __func__, *node_off);
 			goto error;
 		}
 
-		befs_debug(sb, "Seekleaf to child node %Lu", *node_off);
+		befs_debug(sb, "Seekleaf to child node %llu", *node_off);
 	}
-	befs_debug(sb, "Node %Lu is a leaf node", *node_off);
+	befs_debug(sb, "Node %llu is a leaf node", *node_off);
 
 	return BEFS_OK;
 
   error:
-	befs_debug(sb, "<--- befs_btree_seekleaf() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
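
Most of the befs/btree.c hunks replace hard-coded function names inside debug strings with __func__, so the printed name can no longer drift from the real function after a rename or a copy-paste. A standalone sketch of the idiom follows; the dbg macro is illustrative, not befs code.

#include <stdio.h>

/* Illustrative debug macro, not befs code: the compiler supplies the
 * enclosing function's name via __func__. */
#define dbg(fmt, ...) \
	printf("---> %s " fmt "\n", __func__, ##__VA_ARGS__)

static void read_node(unsigned long long node_off)
{
	dbg("node at %llu", node_off);	/* prints "---> read_node node at 128" */
}

int main(void)
{
	read_node(128);
	return 0;
}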
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index 59096b5e0fc7..c467bebd50af 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -52,26 +52,25 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds,
 	befs_block_run run;
 	befs_blocknr_t block;	/* block coresponding to pos */
 
-	befs_debug(sb, "---> befs_read_datastream() %Lu", pos);
+	befs_debug(sb, "---> %s %llu", __func__, pos);
 	block = pos >> BEFS_SB(sb)->block_shift;
 	if (off)
 		*off = pos - (block << BEFS_SB(sb)->block_shift);
 
 	if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) {
 		befs_error(sb, "BeFS: Error finding disk addr of block %lu",
-			   block);
-		befs_debug(sb, "<--- befs_read_datastream() ERROR");
+			   (unsigned long)block);
+		befs_debug(sb, "<--- %s ERROR", __func__);
 		return NULL;
 	}
 	bh = befs_bread_iaddr(sb, run);
 	if (!bh) {
 		befs_error(sb, "BeFS: Error reading block %lu from datastream",
-			   block);
+			   (unsigned long)block);
 		return NULL;
 	}
 
-	befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu",
-		   pos);
+	befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos);
 
 	return bh;
 }
@@ -106,7 +105,8 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
 	} else {
 		befs_error(sb,
 			   "befs_fblock2brun() was asked to find block %lu, "
-			   "which is not mapped by the datastream\n", fblock);
+			   "which is not mapped by the datastream\n",
+			   (unsigned long)fblock);
 		err = BEFS_ERR;
 	}
 	return err;
@@ -128,14 +128,14 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
 	befs_off_t bytes_read = 0;	/* bytes readed */
 	u16 plen;
 	struct buffer_head *bh = NULL;
-	befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len);
+	befs_debug(sb, "---> %s length: %llu", __func__, len);
 
 	while (bytes_read < len) {
 		bh = befs_read_datastream(sb, ds, bytes_read, NULL);
 		if (!bh) {
 			befs_error(sb, "BeFS: Error reading datastream block "
-				   "starting from %Lu", bytes_read);
-			befs_debug(sb, "<--- befs_read_lsymlink() ERROR");
+				   "starting from %llu", bytes_read);
+			befs_debug(sb, "<--- %s ERROR", __func__);
 			return bytes_read;
 
 		}
@@ -146,7 +146,8 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
 		bytes_read += plen;
 	}
 
-	befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read);
+	befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int)
+		   bytes_read);
 	return bytes_read;
 }
 
@@ -169,7 +170,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
 	befs_blocknr_t metablocks;	/* FS metadata blocks */
 	befs_sb_info *befs_sb = BEFS_SB(sb);
 
-	befs_debug(sb, "---> befs_count_blocks()");
+	befs_debug(sb, "---> %s", __func__);
 
 	datablocks = ds->size >> befs_sb->block_shift;
 	if (ds->size & (befs_sb->block_size - 1))
@@ -206,7 +207,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
 	}
 
 	blocks = datablocks + metablocks;
-	befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks);
+	befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks);
 
 	return blocks;
 }
@@ -251,11 +252,11 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
 	befs_blocknr_t max_block =
 	    data->max_direct_range >> BEFS_SB(sb)->block_shift;
 
-	befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno);
+	befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
 
 	if (blockno > max_block) {
-		befs_error(sb, "befs_find_brun_direct() passed block outside of"
-			   "direct region");
+		befs_error(sb, "%s passed block outside of direct region",
+			   __func__);
 		return BEFS_ERR;
 	}
 
@@ -267,13 +268,14 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
 			run->start = array[i].start + offset;
 			run->len = array[i].len - offset;
 
-			befs_debug(sb, "---> befs_find_brun_direct(), "
-				   "found %lu at direct[%d]", blockno, i);
+			befs_debug(sb, "---> %s, "
+				   "found %lu at direct[%d]", __func__,
+				   (unsigned long)blockno, i);
 			return BEFS_OK;
 		}
 	}
 
-	befs_debug(sb, "---> befs_find_brun_direct() ERROR");
+	befs_debug(sb, "---> %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
@@ -316,7 +318,7 @@ befs_find_brun_indirect(struct super_block *sb,
 	befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
 	int arraylen = befs_iaddrs_per_block(sb);
 
-	befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno);
+	befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
 
 	indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift;
 	search_blk = blockno - indir_start_blk;
@@ -325,10 +327,9 @@ befs_find_brun_indirect(struct super_block *sb,
 	for (i = 0; i < indirect.len; i++) {
 		indirblock = befs_bread(sb, indirblockno + i);
 		if (indirblock == NULL) {
-			befs_debug(sb,
-				   "---> befs_find_brun_indirect() failed to "
-				   "read disk block %lu from the indirect brun",
-				   indirblockno + i);
+			befs_debug(sb, "---> %s failed to read "
+				   "disk block %lu from the indirect brun",
+				   __func__, (unsigned long)indirblockno + i);
 			return BEFS_ERR;
 		}
 
@@ -348,9 +349,10 @@ befs_find_brun_indirect(struct super_block *sb,
 
 			brelse(indirblock);
 			befs_debug(sb,
-				   "<--- befs_find_brun_indirect() found "
-				   "file block %lu at indirect[%d]",
-				   blockno, j + (i * arraylen));
+				   "<--- %s found file block "
+				   "%lu at indirect[%d]", __func__,
+				   (unsigned long)blockno,
+				   j + (i * arraylen));
 			return BEFS_OK;
 		}
 		sum += len;
@@ -360,10 +362,10 @@ befs_find_brun_indirect(struct super_block *sb,
 	}
 
 	/* Only fallthrough is an error */
-	befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find "
-		   "file block %lu", blockno);
+	befs_error(sb, "BeFS: %s failed to find "
+		   "file block %lu", __func__, (unsigned long)blockno);
 
-	befs_debug(sb, "<--- befs_find_brun_indirect() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
@@ -444,7 +446,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	size_t diblklen = iblklen * befs_iaddrs_per_block(sb)
 	    * BEFS_DBLINDIR_BRUN_LEN;
 
-	befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno);
+	befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno);
 
 	/* First, discover which of the double_indir->indir blocks
 	 * contains pos. Then figure out how much of pos that
@@ -460,8 +462,9 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb);
 	if (dbl_which_block > data->double_indirect.len) {
 		befs_error(sb, "The double-indirect index calculated by "
-			   "befs_read_brun_dblindirect(), %d, is outside the range "
-			   "of the double-indirect block", dblindir_indx);
+			   "%s, %d, is outside the range "
+			   "of the double-indirect block", __func__,
+			   dblindir_indx);
 		return BEFS_ERR;
 	}
 
@@ -469,10 +472,10 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	    befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) +
 		       dbl_which_block);
 	if (dbl_indir_block == NULL) {
-		befs_error(sb, "befs_read_brun_dblindirect() couldn't read the "
-			   "double-indirect block at blockno %lu",
-			   iaddr2blockno(sb,
-					 &data->double_indirect) +
+		befs_error(sb, "%s couldn't read the "
+			   "double-indirect block at blockno %lu", __func__,
+			   (unsigned long)
+			   iaddr2blockno(sb, &data->double_indirect) +
 			   dbl_which_block);
 		brelse(dbl_indir_block);
 		return BEFS_ERR;
@@ -489,16 +492,16 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	which_block = indir_indx / befs_iaddrs_per_block(sb);
 	if (which_block > indir_run.len) {
 		befs_error(sb, "The indirect index calculated by "
-			   "befs_read_brun_dblindirect(), %d, is outside the range "
-			   "of the indirect block", indir_indx);
+			   "%s, %d, is outside the range "
+			   "of the indirect block", __func__, indir_indx);
 		return BEFS_ERR;
 	}
 
 	indir_block =
 	    befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block);
 	if (indir_block == NULL) {
-		befs_error(sb, "befs_read_brun_dblindirect() couldn't read the "
-			   "indirect block at blockno %lu",
+		befs_error(sb, "%s couldn't read the indirect block "
+			   "at blockno %lu", __func__, (unsigned long)
 			   iaddr2blockno(sb, &indir_run) + which_block);
 		brelse(indir_block);
 		return BEFS_ERR;
@@ -519,7 +522,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	run->len -= offset;
 
 	befs_debug(sb, "Found file block %lu in double_indirect[%d][%d],"
-		   " double_indirect_leftover = %lu",
+		   " double_indirect_leftover = %lu", (unsigned long)
 		   blockno, dblindir_indx, indir_indx, dblindir_leftover);
 
 	return BEFS_OK;
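
The datastream.c hunks are mostly printk-format fixes: %Lu is a non-portable spelling of %llu, and typedef'd values such as befs_blocknr_t can be 32- or 64-bit depending on configuration, so each value is cast to a type of known width before printing. A standalone sketch of why the casts matter; the typedefs here are stand-ins, not the real befs definitions.

#include <inttypes.h>
#include <stdio.h>

/* Stand-ins for the befs typedefs; the real widths are config-dependent. */
typedef uint64_t befs_off_t;
typedef unsigned long befs_blocknr_t;

int main(void)
{
	befs_off_t node_off = 1ULL << 40;
	befs_blocknr_t block = 12345;

	/* Cast to a known width so the format specifier always matches. */
	printf("node at %llu\n", (unsigned long long)node_off);
	printf("block %lu\n", (unsigned long)block);
	return 0;
}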
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 622e73775c83..4de7cffcd662 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -10,6 +10,7 @@
  * debug functions
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #ifdef __KERNEL__
 
 #include <stdarg.h>
@@ -23,43 +24,30 @@
 
 #include "befs.h"
 
-#define ERRBUFSIZE 1024
-
 void
 befs_error(const struct super_block *sb, const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
-	char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
-	if (err_buf == NULL) {
-		printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
-		return;
-	}
 
 	va_start(args, fmt);
-	vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_err("(%s): %pV\n", sb->s_id, &vaf);
 	va_end(args);
-
-	printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf);
-	kfree(err_buf);
 }
 
 void
 befs_warning(const struct super_block *sb, const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
-	char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
-	if (err_buf == NULL) {
-		printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
-		return;
-	}
 
 	va_start(args, fmt);
-	vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_warn("(%s): %pV\n", sb->s_id, &vaf);
 	va_end(args);
-
-	printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf);
-
-	kfree(err_buf);
 }
 
 void
@@ -67,25 +55,13 @@ befs_debug(const struct super_block *sb, const char *fmt, ...)
 {
 #ifdef CONFIG_BEFS_DEBUG
 
+	struct va_format vaf;
 	va_list args;
-	char *err_buf = NULL;
-
-	if (BEFS_SB(sb)->mount_opts.debug) {
-		err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
-		if (err_buf == NULL) {
-			printk(KERN_ERR "could not allocate %d bytes\n",
-				ERRBUFSIZE);
-			return;
-		}
-
-		va_start(args, fmt);
-		vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
-		va_end(args);
-
-		printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf);
-
-		kfree(err_buf);
-	}
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_debug("(%s): %pV\n", sb->s_id, &vaf);
+	va_end(args);
 
 #endif //CONFIG_BEFS_DEBUG
 }
@@ -109,9 +85,9 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
109 befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); 85 befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid));
110 befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); 86 befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode));
111 befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); 87 befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags));
112 befs_debug(sb, " create_time %Lu", 88 befs_debug(sb, " create_time %llu",
113 fs64_to_cpu(sb, inode->create_time)); 89 fs64_to_cpu(sb, inode->create_time));
114 befs_debug(sb, " last_modified_time %Lu", 90 befs_debug(sb, " last_modified_time %llu",
115 fs64_to_cpu(sb, inode->last_modified_time)); 91 fs64_to_cpu(sb, inode->last_modified_time));
116 92
117 tmp_run = fsrun_to_cpu(sb, inode->parent); 93 tmp_run = fsrun_to_cpu(sb, inode->parent);
@@ -137,7 +113,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
 			   tmp_run.allocation_group, tmp_run.start,
 			   tmp_run.len);
 	}
-	befs_debug(sb, " max_direct_range %Lu",
+	befs_debug(sb, " max_direct_range %llu",
 		   fs64_to_cpu(sb,
 			       inode->data.datastream.
 			       max_direct_range));
@@ -147,7 +123,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
 		   tmp_run.allocation_group,
 		   tmp_run.start, tmp_run.len);
 
-	befs_debug(sb, " max_indirect_range %Lu",
+	befs_debug(sb, " max_indirect_range %llu",
 		   fs64_to_cpu(sb,
 			       inode->data.datastream.
 			       max_indirect_range));
@@ -158,12 +134,12 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
 			   tmp_run.allocation_group, tmp_run.start,
 			   tmp_run.len);
 
-	befs_debug(sb, " max_double_indirect_range %Lu",
+	befs_debug(sb, " max_double_indirect_range %llu",
 		   fs64_to_cpu(sb,
 			       inode->data.datastream.
 			       max_double_indirect_range));
 
-	befs_debug(sb, " size %Lu",
+	befs_debug(sb, " size %llu",
 		   fs64_to_cpu(sb, inode->data.datastream.size));
 	}
 
@@ -191,8 +167,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
191 befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); 167 befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size));
192 befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); 168 befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift));
193 169
194 befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks)); 170 befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks));
195 befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks)); 171 befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks));
196 172
197 befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); 173 befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2));
198 befs_debug(sb, " blocks_per_ag %u", 174 befs_debug(sb, " blocks_per_ag %u",
@@ -206,8 +182,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
206 befs_debug(sb, " log_blocks %u, %hu, %hu", 182 befs_debug(sb, " log_blocks %u, %hu, %hu",
207 tmp_run.allocation_group, tmp_run.start, tmp_run.len); 183 tmp_run.allocation_group, tmp_run.start, tmp_run.len);
208 184
209 befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start)); 185 befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start));
210 befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end)); 186 befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end));
211 187
212 befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); 188 befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3));
213 189
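
The debug.c rewrite above drops the kmalloc'd ERRBUFSIZE staging buffer and lets printk format the caller's arguments directly through struct va_format and the %pV specifier, removing an allocation (and its failure path) from every log call. A userland sketch of the same forward-the-va_list idea follows, with vfprintf standing in for %pV; it is illustrative only, not the kernel implementation.

#include <stdarg.h>
#include <stdio.h>

/* Userland stand-in for the %pV pattern: hand the caller's format and
 * va_list to one central printer instead of staging into a heap buffer. */
static void befs_error(const char *sb_id, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	fprintf(stderr, "befs (%s): ", sb_id);
	vfprintf(stderr, fmt, args);	/* no intermediate buffer needed */
	fputc('\n', stderr);
	va_end(args);
}

int main(void)
{
	befs_error("sda1", "failed to read block %lu", 42UL);
	return 0;
}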
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index 94c17f9a9576..fa4b718de597 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -25,7 +25,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
 	/* check magic header. */
 	if (magic1 != BEFS_INODE_MAGIC1) {
 		befs_error(sb,
-			   "Inode has a bad magic header - inode = %lu", inode);
+			   "Inode has a bad magic header - inode = %lu",
+			   (unsigned long)inode);
 		return BEFS_BAD_INODE;
 	}
 
@@ -34,8 +35,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
 	 */
 	if (inode != iaddr2blockno(sb, &ino_num)) {
 		befs_error(sb, "inode blocknr field disagrees with vfs "
-			   "VFS: %lu, Inode %lu",
-			   inode, iaddr2blockno(sb, &ino_num));
+			   "VFS: %lu, Inode %lu", (unsigned long)
+			   inode, (unsigned long)iaddr2blockno(sb, &ino_num));
 		return BEFS_BAD_INODE;
 	}
 
@@ -44,7 +45,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
 	 */
 
 	if (!(flags & BEFS_INODE_IN_USE)) {
-		befs_error(sb, "inode is not used - inode = %lu", inode);
+		befs_error(sb, "inode is not used - inode = %lu",
+			   (unsigned long)inode);
 		return BEFS_BAD_INODE;
 	}
 
diff --git a/fs/befs/io.c b/fs/befs/io.c
index ddef98aa255d..0408a3d601d0 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -30,9 +30,9 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
 	befs_blocknr_t block = 0;
 	befs_sb_info *befs_sb = BEFS_SB(sb);
 
-	befs_debug(sb, "---> Enter befs_read_iaddr() "
-		   "[%u, %hu, %hu]",
-		   iaddr.allocation_group, iaddr.start, iaddr.len);
+	befs_debug(sb, "---> Enter %s "
+		   "[%u, %hu, %hu]", __func__, iaddr.allocation_group,
+		   iaddr.start, iaddr.len);
 
 	if (iaddr.allocation_group > befs_sb->num_ags) {
 		befs_error(sb, "BEFS: Invalid allocation group %u, max is %u",
@@ -42,20 +42,21 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
 
 	block = iaddr2blockno(sb, &iaddr);
 
-	befs_debug(sb, "befs_read_iaddr: offset = %lu", block);
+	befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block);
 
 	bh = sb_bread(sb, block);
 
 	if (bh == NULL) {
-		befs_error(sb, "Failed to read block %lu", block);
+		befs_error(sb, "Failed to read block %lu",
+			   (unsigned long)block);
 		goto error;
 	}
 
-	befs_debug(sb, "<--- befs_read_iaddr()");
+	befs_debug(sb, "<--- %s", __func__);
 	return bh;
 
   error:
-	befs_debug(sb, "<--- befs_read_iaddr() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return NULL;
 }
 
@@ -64,20 +65,21 @@ befs_bread(struct super_block *sb, befs_blocknr_t block)
 {
 	struct buffer_head *bh = NULL;
 
-	befs_debug(sb, "---> Enter befs_read() %Lu", block);
+	befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
 
 	bh = sb_bread(sb, block);
 
 	if (bh == NULL) {
-		befs_error(sb, "Failed to read block %lu", block);
+		befs_error(sb, "Failed to read block %lu",
+			   (unsigned long)block);
 		goto error;
 	}
 
-	befs_debug(sb, "<--- befs_read()");
+	befs_debug(sb, "<--- %s", __func__);
 
 	return bh;
 
   error:
-	befs_debug(sb, "<--- befs_read() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return NULL;
 }
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 845d2d690ce2..d626756ff721 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -5,6 +5,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
@@ -39,7 +41,6 @@ static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int)
 static struct inode *befs_iget(struct super_block *, unsigned long);
 static struct inode *befs_alloc_inode(struct super_block *sb);
 static void befs_destroy_inode(struct inode *inode);
-static int befs_init_inodecache(void);
 static void befs_destroy_inodecache(void);
 static void *befs_follow_link(struct dentry *, struct nameidata *);
 static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
@@ -131,26 +132,28 @@ befs_get_block(struct inode *inode, sector_t block,
 	ulong disk_off;
 
 	befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld",
-		   inode->i_ino, block);
+		   (unsigned long)inode->i_ino, (long)block);
 
 	if (block < 0) {
 		befs_error(sb, "befs_get_block() was asked for a block "
 			   "number less than zero: block %ld in inode %lu",
-			   block, inode->i_ino);
+			   (long)block, (unsigned long)inode->i_ino);
 		return -EIO;
 	}
 
 	if (create) {
 		befs_error(sb, "befs_get_block() was asked to write to "
-			   "block %ld in inode %lu", block, inode->i_ino);
+			   "block %ld in inode %lu", (long)block,
+			   (unsigned long)inode->i_ino);
 		return -EPERM;
 	}
 
 	res = befs_fblock2brun(sb, ds, block, &run);
 	if (res != BEFS_OK) {
 		befs_error(sb,
-			   "<--- befs_get_block() for inode %lu, block "
-			   "%ld ERROR", inode->i_ino, block);
+			   "<--- %s for inode %lu, block %ld ERROR",
+			   __func__, (unsigned long)inode->i_ino,
+			   (long)block);
 		return -EFBIG;
 	}
 
@@ -158,8 +161,9 @@ befs_get_block(struct inode *inode, sector_t block,
158 161
159 map_bh(bh_result, inode->i_sb, disk_off); 162 map_bh(bh_result, inode->i_sb, disk_off);
160 163
161 befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, " 164 befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu",
162 "disk address %lu", inode->i_ino, block, disk_off); 165 __func__, (unsigned long)inode->i_ino, (long)block,
166 (unsigned long)disk_off);
163 167
164 return 0; 168 return 0;
165} 169}
@@ -176,15 +180,15 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
176 char *utfname; 180 char *utfname;
177 const char *name = dentry->d_name.name; 181 const char *name = dentry->d_name.name;
178 182
179 befs_debug(sb, "---> befs_lookup() " 183 befs_debug(sb, "---> %s name %s inode %ld", __func__,
180 "name %s inode %ld", dentry->d_name.name, dir->i_ino); 184 dentry->d_name.name, dir->i_ino);
181 185
182 /* Convert to UTF-8 */ 186 /* Convert to UTF-8 */
183 if (BEFS_SB(sb)->nls) { 187 if (BEFS_SB(sb)->nls) {
184 ret = 188 ret =
185 befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen); 189 befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen);
186 if (ret < 0) { 190 if (ret < 0) {
187 befs_debug(sb, "<--- befs_lookup() ERROR"); 191 befs_debug(sb, "<--- %s ERROR", __func__);
188 return ERR_PTR(ret); 192 return ERR_PTR(ret);
189 } 193 }
190 ret = befs_btree_find(sb, ds, utfname, &offset); 194 ret = befs_btree_find(sb, ds, utfname, &offset);
@@ -195,12 +199,12 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
195 } 199 }
196 200
197 if (ret == BEFS_BT_NOT_FOUND) { 201 if (ret == BEFS_BT_NOT_FOUND) {
198 befs_debug(sb, "<--- befs_lookup() %s not found", 202 befs_debug(sb, "<--- %s %s not found", __func__,
199 dentry->d_name.name); 203 dentry->d_name.name);
200 return ERR_PTR(-ENOENT); 204 return ERR_PTR(-ENOENT);
201 205
202 } else if (ret != BEFS_OK || offset == 0) { 206 } else if (ret != BEFS_OK || offset == 0) {
203 befs_warning(sb, "<--- befs_lookup() Error"); 207 befs_warning(sb, "<--- %s Error", __func__);
204 return ERR_PTR(-ENODATA); 208 return ERR_PTR(-ENODATA);
205 } 209 }
206 210
@@ -210,7 +214,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
210 214
211 d_add(dentry, inode); 215 d_add(dentry, inode);
212 216
213 befs_debug(sb, "<--- befs_lookup()"); 217 befs_debug(sb, "<--- %s", __func__);
214 218
215 return NULL; 219 return NULL;
216} 220}
@@ -228,26 +232,25 @@ befs_readdir(struct file *file, struct dir_context *ctx)
228 char keybuf[BEFS_NAME_LEN + 1]; 232 char keybuf[BEFS_NAME_LEN + 1];
229 const char *dirname = file->f_path.dentry->d_name.name; 233 const char *dirname = file->f_path.dentry->d_name.name;
230 234
231 befs_debug(sb, "---> befs_readdir() " 235 befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld",
232 "name %s, inode %ld, ctx->pos %Ld", 236 __func__, dirname, inode->i_ino, ctx->pos);
233 dirname, inode->i_ino, ctx->pos);
234 237
235more: 238more:
236 result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, 239 result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
237 keybuf, &keysize, &value); 240 keybuf, &keysize, &value);
238 241
239 if (result == BEFS_ERR) { 242 if (result == BEFS_ERR) {
240 befs_debug(sb, "<--- befs_readdir() ERROR"); 243 befs_debug(sb, "<--- %s ERROR", __func__);
241 befs_error(sb, "IO error reading %s (inode %lu)", 244 befs_error(sb, "IO error reading %s (inode %lu)",
242 dirname, inode->i_ino); 245 dirname, inode->i_ino);
243 return -EIO; 246 return -EIO;
244 247
245 } else if (result == BEFS_BT_END) { 248 } else if (result == BEFS_BT_END) {
246 befs_debug(sb, "<--- befs_readdir() END"); 249 befs_debug(sb, "<--- %s END", __func__);
247 return 0; 250 return 0;
248 251
249 } else if (result == BEFS_BT_EMPTY) { 252 } else if (result == BEFS_BT_EMPTY) {
250 befs_debug(sb, "<--- befs_readdir() Empty directory"); 253 befs_debug(sb, "<--- %s Empty directory", __func__);
251 return 0; 254 return 0;
252 } 255 }
253 256
@@ -260,7 +263,7 @@ more:
260 result = 263 result =
261 befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); 264 befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
262 if (result < 0) { 265 if (result < 0) {
263 befs_debug(sb, "<--- befs_readdir() ERROR"); 266 befs_debug(sb, "<--- %s ERROR", __func__);
264 return result; 267 return result;
265 } 268 }
266 if (!dir_emit(ctx, nlsname, nlsnamelen, 269 if (!dir_emit(ctx, nlsname, nlsnamelen,
@@ -277,7 +280,7 @@ more:
277 ctx->pos++; 280 ctx->pos++;
278 goto more; 281 goto more;
279 282
280 befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos); 283 befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos);
281 284
282 return 0; 285 return 0;
283} 286}
@@ -321,7 +324,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
321 struct inode *inode; 324 struct inode *inode;
322 long ret = -EIO; 325 long ret = -EIO;
323 326
324 befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); 327 befs_debug(sb, "---> %s inode = %lu", __func__, ino);
325 328
326 inode = iget_locked(sb, ino); 329 inode = iget_locked(sb, ino);
327 if (!inode) 330 if (!inode)
@@ -428,7 +431,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
428 } 431 }
429 432
430 brelse(bh); 433 brelse(bh);
431 befs_debug(sb, "<--- befs_read_inode()"); 434 befs_debug(sb, "<--- %s", __func__);
432 unlock_new_inode(inode); 435 unlock_new_inode(inode);
433 return inode; 436 return inode;
434 437
@@ -437,7 +440,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
437 440
438 unacquire_none: 441 unacquire_none:
439 iget_failed(inode); 442 iget_failed(inode);
440 befs_debug(sb, "<--- befs_read_inode() - Bad inode"); 443 befs_debug(sb, "<--- %s - Bad inode", __func__);
441 return ERR_PTR(ret); 444 return ERR_PTR(ret);
442} 445}
443 446
@@ -445,7 +448,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
445 * 448 *
446 * Taken from NFS implementation by Al Viro. 449 * Taken from NFS implementation by Al Viro.
447 */ 450 */
448static int 451static int __init
449befs_init_inodecache(void) 452befs_init_inodecache(void)
450{ 453{
451 befs_inode_cachep = kmem_cache_create("befs_inode_cache", 454 befs_inode_cachep = kmem_cache_create("befs_inode_cache",
@@ -454,11 +457,9 @@ befs_init_inodecache(void)
454 SLAB_MEM_SPREAD), 457 SLAB_MEM_SPREAD),
455 init_once); 458 init_once);
456 if (befs_inode_cachep == NULL) { 459 if (befs_inode_cachep == NULL) {
457 printk(KERN_ERR "befs_init_inodecache: " 460 pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
458 "Couldn't initialize inode slabcache\n");
459 return -ENOMEM; 461 return -ENOMEM;
460 } 462 }
461
462 return 0; 463 return 0;
463} 464}
464 465
@@ -544,16 +545,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
544 */ 545 */
545 int maxlen = in_len + 1; 546 int maxlen = in_len + 1;
546 547
547 befs_debug(sb, "---> utf2nls()"); 548 befs_debug(sb, "---> %s", __func__);
548 549
549 if (!nls) { 550 if (!nls) {
550 befs_error(sb, "befs_utf2nls called with no NLS table loaded"); 551 befs_error(sb, "%s called with no NLS table loaded", __func__);
551 return -EINVAL; 552 return -EINVAL;
552 } 553 }
553 554
554 *out = result = kmalloc(maxlen, GFP_NOFS); 555 *out = result = kmalloc(maxlen, GFP_NOFS);
555 if (!*out) { 556 if (!*out) {
556 befs_error(sb, "befs_utf2nls() cannot allocate memory"); 557 befs_error(sb, "%s cannot allocate memory", __func__);
557 *out_len = 0; 558 *out_len = 0;
558 return -ENOMEM; 559 return -ENOMEM;
559 } 560 }
@@ -575,14 +576,14 @@ befs_utf2nls(struct super_block *sb, const char *in,
575 result[o] = '\0'; 576 result[o] = '\0';
576 *out_len = o; 577 *out_len = o;
577 578
578 befs_debug(sb, "<--- utf2nls()"); 579 befs_debug(sb, "<--- %s", __func__);
579 580
580 return o; 581 return o;
581 582
582 conv_err: 583 conv_err:
583 befs_error(sb, "Name using character set %s contains a character that " 584 befs_error(sb, "Name using character set %s contains a character that "
584 "cannot be converted to unicode.", nls->charset); 585 "cannot be converted to unicode.", nls->charset);
585 befs_debug(sb, "<--- utf2nls()"); 586 befs_debug(sb, "<--- %s", __func__);
586 kfree(result); 587 kfree(result);
587 return -EILSEQ; 588 return -EILSEQ;
588} 589}
@@ -623,16 +624,17 @@ befs_nls2utf(struct super_block *sb, const char *in,
623 * in special cases */ 624 * in special cases */
624 int maxlen = (3 * in_len) + 1; 625 int maxlen = (3 * in_len) + 1;
625 626
626 befs_debug(sb, "---> nls2utf()\n"); 627 befs_debug(sb, "---> %s\n", __func__);
627 628
628 if (!nls) { 629 if (!nls) {
629 befs_error(sb, "befs_nls2utf called with no NLS table loaded."); 630 befs_error(sb, "%s called with no NLS table loaded.",
631 __func__);
630 return -EINVAL; 632 return -EINVAL;
631 } 633 }
632 634
633 *out = result = kmalloc(maxlen, GFP_NOFS); 635 *out = result = kmalloc(maxlen, GFP_NOFS);
634 if (!*out) { 636 if (!*out) {
635 befs_error(sb, "befs_nls2utf() cannot allocate memory"); 637 befs_error(sb, "%s cannot allocate memory", __func__);
636 *out_len = 0; 638 *out_len = 0;
637 return -ENOMEM; 639 return -ENOMEM;
638 } 640 }
@@ -653,14 +655,14 @@ befs_nls2utf(struct super_block *sb, const char *in,
653 result[o] = '\0'; 655 result[o] = '\0';
654 *out_len = o; 656 *out_len = o;
655 657
656 befs_debug(sb, "<--- nls2utf()"); 658 befs_debug(sb, "<--- %s", __func__);
657 659
658 return i; 660 return i;
659 661
660 conv_err: 662 conv_err:
661 befs_error(sb, "Name using charecter set %s contains a charecter that " 663 befs_error(sb, "Name using charecter set %s contains a charecter that "
662 "cannot be converted to unicode.", nls->charset); 664 "cannot be converted to unicode.", nls->charset);
663 befs_debug(sb, "<--- nls2utf()"); 665 befs_debug(sb, "<--- %s", __func__);
664 kfree(result); 666 kfree(result);
665 return -EILSEQ; 667 return -EILSEQ;
666} 668}
@@ -715,8 +717,8 @@ parse_options(char *options, befs_mount_options * opts)
715 if (option >= 0) 717 if (option >= 0)
716 uid = make_kuid(current_user_ns(), option); 718 uid = make_kuid(current_user_ns(), option);
717 if (!uid_valid(uid)) { 719 if (!uid_valid(uid)) {
718 printk(KERN_ERR "BeFS: Invalid uid %d, " 720 pr_err("Invalid uid %d, "
719 "using default\n", option); 721 "using default\n", option);
720 break; 722 break;
721 } 723 }
722 opts->uid = uid; 724 opts->uid = uid;
@@ -729,8 +731,8 @@ parse_options(char *options, befs_mount_options * opts)
729 if (option >= 0) 731 if (option >= 0)
730 gid = make_kgid(current_user_ns(), option); 732 gid = make_kgid(current_user_ns(), option);
731 if (!gid_valid(gid)) { 733 if (!gid_valid(gid)) {
732 printk(KERN_ERR "BeFS: Invalid gid %d, " 734 pr_err("Invalid gid %d, "
733 "using default\n", option); 735 "using default\n", option);
734 break; 736 break;
735 } 737 }
736 opts->gid = gid; 738 opts->gid = gid;
@@ -740,8 +742,8 @@ parse_options(char *options, befs_mount_options * opts)
740 kfree(opts->iocharset); 742 kfree(opts->iocharset);
741 opts->iocharset = match_strdup(&args[0]); 743 opts->iocharset = match_strdup(&args[0]);
742 if (!opts->iocharset) { 744 if (!opts->iocharset) {
743 printk(KERN_ERR "BeFS: allocation failure for " 745 pr_err("allocation failure for "
744 "iocharset string\n"); 746 "iocharset string\n");
745 return 0; 747 return 0;
746 } 748 }
747 break; 749 break;
@@ -749,8 +751,8 @@ parse_options(char *options, befs_mount_options * opts)
749 opts->debug = 1; 751 opts->debug = 1;
750 break; 752 break;
751 default: 753 default:
752 printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" " 754 pr_err("Unrecognized mount option \"%s\" "
753 "or missing value\n", p); 755 "or missing value\n", p);
754 return 0; 756 return 0;
755 } 757 }
756 } 758 }
@@ -791,22 +793,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
791 793
792 save_mount_options(sb, data); 794 save_mount_options(sb, data);
793 795
794 sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL); 796 sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
795 if (sb->s_fs_info == NULL) { 797 if (sb->s_fs_info == NULL) {
796 printk(KERN_ERR 798 pr_err("(%s): Unable to allocate memory for private "
797 "BeFS(%s): Unable to allocate memory for private "
798 "portion of superblock. Bailing.\n", sb->s_id); 799 "portion of superblock. Bailing.\n", sb->s_id);
799 goto unacquire_none; 800 goto unacquire_none;
800 } 801 }
801 befs_sb = BEFS_SB(sb); 802 befs_sb = BEFS_SB(sb);
802 memset(befs_sb, 0, sizeof(befs_sb_info));
803 803
804 if (!parse_options((char *) data, &befs_sb->mount_opts)) { 804 if (!parse_options((char *) data, &befs_sb->mount_opts)) {
805 befs_error(sb, "cannot parse mount options"); 805 befs_error(sb, "cannot parse mount options");
806 goto unacquire_priv_sbp; 806 goto unacquire_priv_sbp;
807 } 807 }
808 808
809 befs_debug(sb, "---> befs_fill_super()"); 809 befs_debug(sb, "---> %s", __func__);
810 810
811#ifndef CONFIG_BEFS_RW 811#ifndef CONFIG_BEFS_RW
812 if (!(sb->s_flags & MS_RDONLY)) { 812 if (!(sb->s_flags & MS_RDONLY)) {
@@ -854,7 +854,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
854 goto unacquire_priv_sbp; 854 goto unacquire_priv_sbp;
855 855
856 if( befs_sb->num_blocks > ~((sector_t)0) ) { 856 if( befs_sb->num_blocks > ~((sector_t)0) ) {
857 befs_error(sb, "blocks count: %Lu " 857 befs_error(sb, "blocks count: %llu "
858 "is larger than the host can use", 858 "is larger than the host can use",
859 befs_sb->num_blocks); 859 befs_sb->num_blocks);
860 goto unacquire_priv_sbp; 860 goto unacquire_priv_sbp;
@@ -913,6 +913,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
913static int 913static int
914befs_remount(struct super_block *sb, int *flags, char *data) 914befs_remount(struct super_block *sb, int *flags, char *data)
915{ 915{
916 sync_filesystem(sb);
916 if (!(*flags & MS_RDONLY)) 917 if (!(*flags & MS_RDONLY))
917 return -EINVAL; 918 return -EINVAL;
918 return 0; 919 return 0;
@@ -924,7 +925,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
924 struct super_block *sb = dentry->d_sb; 925 struct super_block *sb = dentry->d_sb;
925 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 926 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
926 927
927 befs_debug(sb, "---> befs_statfs()"); 928 befs_debug(sb, "---> %s", __func__);
928 929
929 buf->f_type = BEFS_SUPER_MAGIC; 930 buf->f_type = BEFS_SUPER_MAGIC;
930 buf->f_bsize = sb->s_blocksize; 931 buf->f_bsize = sb->s_blocksize;
@@ -937,7 +938,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
937 buf->f_fsid.val[1] = (u32)(id >> 32); 938 buf->f_fsid.val[1] = (u32)(id >> 32);
938 buf->f_namelen = BEFS_NAME_LEN; 939 buf->f_namelen = BEFS_NAME_LEN;
939 940
940 befs_debug(sb, "<--- befs_statfs()"); 941 befs_debug(sb, "<--- %s", __func__);
941 942
942 return 0; 943 return 0;
943} 944}
@@ -963,7 +964,7 @@ init_befs_fs(void)
963{ 964{
964 int err; 965 int err;
965 966
966 printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION); 967 pr_info("version: %s\n", BEFS_VERSION);
967 968
968 err = befs_init_inodecache(); 969 err = befs_init_inodecache();
969 if (err) 970 if (err)
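
The pr_fmt definition added at the top of linuxvfs.c is what lets the printk(KERN_ERR "BeFS: ...") calls throughout the file shrink to bare pr_err(...): the pr_* macros expand their format string through pr_fmt, so the module prefix is attached at compile time. A minimal sketch of the mechanism, assuming KBUILD_MODNAME expands to "befs":

	/* Must be defined before the first header that pulls in printk.h. */
	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/printk.h>

	static void example_report(int option)
	{
		/* Logs "befs: Invalid uid -1, using default" for option == -1 */
		pr_err("Invalid uid %d, using default\n", option);
	}
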
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8defc6b3f9a2..29aa5cf6639b 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -172,7 +172,7 @@ static void bfs_evict_inode(struct inode *inode)
172 172
173 dprintf("ino=%08lx\n", ino); 173 dprintf("ino=%08lx\n", ino);
174 174
175 truncate_inode_pages(&inode->i_data, 0); 175 truncate_inode_pages_final(&inode->i_data);
176 invalidate_inode_buffers(inode); 176 invalidate_inode_buffers(inode);
177 clear_inode(inode); 177 clear_inode(inode);
178 178
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 67be2951b98a..0f59799fa105 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,10 +46,15 @@
46#endif 46#endif
47 47
48static int load_elf_binary(struct linux_binprm *bprm); 48static int load_elf_binary(struct linux_binprm *bprm);
49static int load_elf_library(struct file *);
50static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, 49static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
51 int, int, unsigned long); 50 int, int, unsigned long);
52 51
52#ifdef CONFIG_USELIB
53static int load_elf_library(struct file *);
54#else
55#define load_elf_library NULL
56#endif
57
53/* 58/*
54 * If we don't support core dumping, then supply a NULL so we 59 * If we don't support core dumping, then supply a NULL so we
55 * don't even try. 60 * don't even try.
@@ -1005,6 +1010,7 @@ out_free_ph:
1005 goto out; 1010 goto out;
1006} 1011}
1007 1012
1013#ifdef CONFIG_USELIB
1008/* This is really simpleminded and specialized - we are loading an 1014/* This is really simpleminded and specialized - we are loading an
1009 a.out library that is given an ELF header. */ 1015 a.out library that is given an ELF header. */
1010static int load_elf_library(struct file *file) 1016static int load_elf_library(struct file *file)
@@ -1083,6 +1089,7 @@ out_free_ph:
1083out: 1089out:
1084 return error; 1090 return error;
1085} 1091}
1092#endif /* #ifdef CONFIG_USELIB */
1086 1093
1087#ifdef CONFIG_ELF_CORE 1094#ifdef CONFIG_ELF_CORE
1088/* 1095/*
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1c740e152f38..b60500300dd7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -656,6 +656,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
656 656
657 mutex_unlock(&root->d_inode->i_mutex); 657 mutex_unlock(&root->d_inode->i_mutex);
658 dput(root); 658 dput(root);
659 break;
659 default: return res; 660 default: return res;
660 } 661 }
661 return count; 662 return count;
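
The single added break in bm_status_write is a control-flow fix: without it, the successful case falls through into default: return res; and the function reports the wrong value. A reduced sketch of the hazard (names and values illustrative, not the patched function itself):

	/* Sketch: the missing break made the success case fall through. */
	static ssize_t example_status_write(int op, ssize_t res, ssize_t count)
	{
		switch (op) {
		case 1:
			/* ... clear all registered entries ... */
			break;	/* the fix: don't fall into 'default' */
		default:
			return res;
		}
		return count;	/* success path now reports bytes written */
	}
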
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4f70f383132c..29696b78d1f4 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -301,25 +301,25 @@ int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
301EXPORT_SYMBOL(bio_integrity_get_tag); 301EXPORT_SYMBOL(bio_integrity_get_tag);
302 302
303/** 303/**
304 * bio_integrity_generate - Generate integrity metadata for a bio 304 * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio
305 * @bio: bio to generate integrity metadata for 305 * @bio: bio to generate/verify integrity metadata for
306 * 306 * @operate: 1 to generate, 0 to verify
307 * Description: Generates integrity metadata for a bio by calling the
308 * block device's generation callback function. The bio must have a
309 * bip attached with enough room to accommodate the generated
310 * integrity metadata.
311 */ 307 */
312static void bio_integrity_generate(struct bio *bio) 308static int bio_integrity_generate_verify(struct bio *bio, int operate)
313{ 309{
314 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 310 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
315 struct blk_integrity_exchg bix; 311 struct blk_integrity_exchg bix;
316 struct bio_vec bv; 312 struct bio_vec bv;
317 struct bvec_iter iter; 313 struct bvec_iter iter;
318 sector_t sector = bio->bi_iter.bi_sector; 314 sector_t sector;
319 unsigned int sectors, total; 315 unsigned int sectors, ret = 0;
320 void *prot_buf = bio->bi_integrity->bip_buf; 316 void *prot_buf = bio->bi_integrity->bip_buf;
321 317
322 total = 0; 318 if (operate)
319 sector = bio->bi_iter.bi_sector;
320 else
321 sector = bio->bi_integrity->bip_iter.bi_sector;
322
323 bix.disk_name = bio->bi_bdev->bd_disk->disk_name; 323 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
324 bix.sector_size = bi->sector_size; 324 bix.sector_size = bi->sector_size;
325 325
@@ -330,16 +330,37 @@ static void bio_integrity_generate(struct bio *bio)
330 bix.prot_buf = prot_buf; 330 bix.prot_buf = prot_buf;
331 bix.sector = sector; 331 bix.sector = sector;
332 332
333 bi->generate_fn(&bix); 333 if (operate) {
334 bi->generate_fn(&bix);
335 } else {
336 ret = bi->verify_fn(&bix);
337 if (ret) {
338 kunmap_atomic(kaddr);
339 return ret;
340 }
341 }
334 342
335 sectors = bv.bv_len / bi->sector_size; 343 sectors = bv.bv_len / bi->sector_size;
336 sector += sectors; 344 sector += sectors;
337 prot_buf += sectors * bi->tuple_size; 345 prot_buf += sectors * bi->tuple_size;
338 total += sectors * bi->tuple_size;
339 BUG_ON(total > bio->bi_integrity->bip_iter.bi_size);
340 346
341 kunmap_atomic(kaddr); 347 kunmap_atomic(kaddr);
342 } 348 }
349 return ret;
350}
351
352/**
353 * bio_integrity_generate - Generate integrity metadata for a bio
354 * @bio: bio to generate integrity metadata for
355 *
356 * Description: Generates integrity metadata for a bio by calling the
357 * block device's generation callback function. The bio must have a
358 * bip attached with enough room to accommodate the generated
359 * integrity metadata.
360 */
361static void bio_integrity_generate(struct bio *bio)
362{
363 bio_integrity_generate_verify(bio, 1);
343} 364}
344 365
345static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) 366static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
@@ -454,40 +475,7 @@ EXPORT_SYMBOL(bio_integrity_prep);
454 */ 475 */
455static int bio_integrity_verify(struct bio *bio) 476static int bio_integrity_verify(struct bio *bio)
456{ 477{
457 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 478 return bio_integrity_generate_verify(bio, 0);
458 struct blk_integrity_exchg bix;
459 struct bio_vec *bv;
460 sector_t sector = bio->bi_integrity->bip_iter.bi_sector;
461 unsigned int sectors, ret = 0;
462 void *prot_buf = bio->bi_integrity->bip_buf;
463 int i;
464
465 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
466 bix.sector_size = bi->sector_size;
467
468 bio_for_each_segment_all(bv, bio, i) {
469 void *kaddr = kmap_atomic(bv->bv_page);
470
471 bix.data_buf = kaddr + bv->bv_offset;
472 bix.data_size = bv->bv_len;
473 bix.prot_buf = prot_buf;
474 bix.sector = sector;
475
476 ret = bi->verify_fn(&bix);
477
478 if (ret) {
479 kunmap_atomic(kaddr);
480 return ret;
481 }
482
483 sectors = bv->bv_len / bi->sector_size;
484 sector += sectors;
485 prot_buf += sectors * bi->tuple_size;
486
487 kunmap_atomic(kaddr);
488 }
489
490 return ret;
491} 479}
492 480
493/** 481/**
diff --git a/fs/bio.c b/fs/bio.c
index 8754e7b6eb49..b1bc722b89aa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -116,7 +116,6 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
116 if (!slab) 116 if (!slab)
117 goto out_unlock; 117 goto out_unlock;
118 118
119 printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
120 bslab->slab = slab; 119 bslab->slab = slab;
121 bslab->slab_ref = 1; 120 bslab->slab_ref = 1;
122 bslab->slab_size = sz; 121 bslab->slab_size = sz;
@@ -1970,7 +1969,7 @@ int bio_associate_current(struct bio *bio)
1970 1969
1971 /* associate blkcg if exists */ 1970 /* associate blkcg if exists */
1972 rcu_read_lock(); 1971 rcu_read_lock();
1973 css = task_css(current, blkio_subsys_id); 1972 css = task_css(current, blkio_cgrp_id);
1974 if (css && css_tryget(css)) 1973 if (css && css_tryget(css))
1975 bio->bi_css = css; 1974 bio->bi_css = css;
1976 rcu_read_unlock(); 1975 rcu_read_unlock();
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e86823a9cbd..ba0d2b05bb78 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -83,7 +83,7 @@ void kill_bdev(struct block_device *bdev)
83{ 83{
84 struct address_space *mapping = bdev->bd_inode->i_mapping; 84 struct address_space *mapping = bdev->bd_inode->i_mapping;
85 85
86 if (mapping->nrpages == 0) 86 if (mapping->nrpages == 0 && mapping->nrshadows == 0)
87 return; 87 return;
88 88
89 invalidate_bh_lrus(); 89 invalidate_bh_lrus();
@@ -419,7 +419,7 @@ static void bdev_evict_inode(struct inode *inode)
419{ 419{
420 struct block_device *bdev = &BDEV_I(inode)->bdev; 420 struct block_device *bdev = &BDEV_I(inode)->bdev;
421 struct list_head *p; 421 struct list_head *p;
422 truncate_inode_pages(&inode->i_data, 0); 422 truncate_inode_pages_final(&inode->i_data);
423 invalidate_inode_buffers(inode); /* is it needed here? */ 423 invalidate_inode_buffers(inode); /* is it needed here? */
424 clear_inode(inode); 424 clear_inode(inode);
425 spin_lock(&bdev_lock); 425 spin_lock(&bdev_lock);
@@ -1523,7 +1523,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1523 ssize_t err; 1523 ssize_t err;
1524 1524
1525 err = generic_write_sync(file, pos, ret); 1525 err = generic_write_sync(file, pos, ret);
1526 if (err < 0 && ret > 0) 1526 if (err < 0)
1527 ret = err; 1527 ret = err;
1528 } 1528 }
1529 blk_finish_plug(&plug); 1529 blk_finish_plug(&plug);
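
The two evict_inode conversions above (bfs and the block-device inode) follow the same pattern: truncate_inode_pages_final() replaces truncate_inode_pages(..., 0) so that shadow entries left behind by page reclaim are torn down along with the pages when the inode dies, which pairs with the new mapping->nrshadows check in kill_bdev(). A minimal sketch of the resulting shape (example_evict_inode is illustrative):

	static void example_evict_inode(struct inode *inode)
	{
		/* _final also removes shadow entries, not just pages */
		truncate_inode_pages_final(&inode->i_data);
		invalidate_inode_buffers(inode);
		clear_inode(inode);
	}
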
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c1e0b0caf9cc..ecb5832c0967 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * Copyright (C) 2014 Fujitsu. All rights reserved.
3 * 4 *
4 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 6 * modify it under the terms of the GNU General Public
@@ -21,708 +22,313 @@
21#include <linux/list.h> 22#include <linux/list.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
23#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/workqueue.h>
24#include "async-thread.h" 26#include "async-thread.h"
27#include "ctree.h"
28
29#define WORK_DONE_BIT 0
30#define WORK_ORDER_DONE_BIT 1
31#define WORK_HIGH_PRIO_BIT 2
32
33#define NO_THRESHOLD (-1)
34#define DFT_THRESHOLD (32)
35
36struct __btrfs_workqueue {
37 struct workqueue_struct *normal_wq;
38 /* List head pointing to ordered work list */
39 struct list_head ordered_list;
40
41 /* Spinlock for ordered_list */
42 spinlock_t list_lock;
43
44 /* Thresholding-related variables */
45 atomic_t pending;
46 int max_active;
47 int current_max;
48 int thresh;
49 unsigned int count;
50 spinlock_t thres_lock;
51};
25 52
26#define WORK_QUEUED_BIT 0 53struct btrfs_workqueue {
27#define WORK_DONE_BIT 1 54 struct __btrfs_workqueue *normal;
28#define WORK_ORDER_DONE_BIT 2 55 struct __btrfs_workqueue *high;
29#define WORK_HIGH_PRIO_BIT 3 56};
30
31/*
32 * container for the kthread task pointer and the list of pending work
33 * One of these is allocated per thread.
34 */
35struct btrfs_worker_thread {
36 /* pool we belong to */
37 struct btrfs_workers *workers;
38
39 /* list of struct btrfs_work that are waiting for service */
40 struct list_head pending;
41 struct list_head prio_pending;
42
43 /* list of worker threads from struct btrfs_workers */
44 struct list_head worker_list;
45
46 /* kthread */
47 struct task_struct *task;
48 57
49 /* number of things on the pending list */ 58static inline struct __btrfs_workqueue
50 atomic_t num_pending; 59*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
60 int thresh)
61{
62 struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
51 63
52 /* reference counter for this struct */ 64 if (unlikely(!ret))
53 atomic_t refs; 65 return NULL;
54 66
55 unsigned long sequence; 67 ret->max_active = max_active;
68 atomic_set(&ret->pending, 0);
69 if (thresh == 0)
70 thresh = DFT_THRESHOLD;
71 /* For a low threshold, disabling thresholding is the better choice */
72 if (thresh < DFT_THRESHOLD) {
73 ret->current_max = max_active;
74 ret->thresh = NO_THRESHOLD;
75 } else {
76 ret->current_max = 1;
77 ret->thresh = thresh;
78 }
56 79
57 /* protects the pending list. */ 80 if (flags & WQ_HIGHPRI)
58 spinlock_t lock; 81 ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
82 ret->max_active,
83 "btrfs", name);
84 else
85 ret->normal_wq = alloc_workqueue("%s-%s", flags,
86 ret->max_active, "btrfs",
87 name);
88 if (unlikely(!ret->normal_wq)) {
89 kfree(ret);
90 return NULL;
91 }
59 92
60 /* set to non-zero when this thread is already awake and kicking */ 93 INIT_LIST_HEAD(&ret->ordered_list);
61 int working; 94 spin_lock_init(&ret->list_lock);
95 spin_lock_init(&ret->thres_lock);
96 trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
97 return ret;
98}
62 99
63 /* are we currently idle */ 100static inline void
64 int idle; 101__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
65};
66 102
67static int __btrfs_start_workers(struct btrfs_workers *workers); 103struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
104 int flags,
105 int max_active,
106 int thresh)
107{
108 struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
68 109
69/* 110 if (unlikely(!ret))
70 * btrfs_start_workers uses kthread_run, which can block waiting for memory 111 return NULL;
71 * for a very long time. It will actually throttle on page writeback,
72 * and so it may not make progress until after our btrfs worker threads
73 * process all of the pending work structs in their queue
74 *
75 * This means we can't use btrfs_start_workers from inside a btrfs worker
76 * thread that is used as part of cleaning dirty memory, which pretty much
77 * involves all of the worker threads.
78 *
79 * Instead we have a helper queue who never has more than one thread
81 * where we schedule thread start operations. This worker_start struct
81 * is used to contain the work and hold a pointer to the queue that needs
82 * another worker.
83 */
84struct worker_start {
85 struct btrfs_work work;
86 struct btrfs_workers *queue;
87};
88 112
89static void start_new_worker_func(struct btrfs_work *work) 113 ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
90{ 114 max_active, thresh);
91 struct worker_start *start; 115 if (unlikely(!ret->normal)) {
92 start = container_of(work, struct worker_start, work); 116 kfree(ret);
93 __btrfs_start_workers(start->queue); 117 return NULL;
94 kfree(start); 118 }
95}
96 119
97/* 120 if (flags & WQ_HIGHPRI) {
98 * helper function to move a thread onto the idle list after it 121 ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
99 * has finished some requests. 122 thresh);
100 */ 123 if (unlikely(!ret->high)) {
101static void check_idle_worker(struct btrfs_worker_thread *worker) 124 __btrfs_destroy_workqueue(ret->normal);
102{ 125 kfree(ret);
103 if (!worker->idle && atomic_read(&worker->num_pending) < 126 return NULL;
104 worker->workers->idle_thresh / 2) {
105 unsigned long flags;
106 spin_lock_irqsave(&worker->workers->lock, flags);
107 worker->idle = 1;
108
109 /* the list may be empty if the worker is just starting */
110 if (!list_empty(&worker->worker_list) &&
111 !worker->workers->stopping) {
112 list_move(&worker->worker_list,
113 &worker->workers->idle_list);
114 } 127 }
115 spin_unlock_irqrestore(&worker->workers->lock, flags);
116 } 128 }
129 return ret;
117} 130}
118 131
119/* 132/*
120 * helper function to move a thread off the idle list after new 133 * Hook for threshold which will be called in btrfs_queue_work.
121 * pending work is added. 134 * This hook WILL be called in IRQ handler context,
135 * so workqueue_set_max_active MUST NOT be called in this hook
122 */ 136 */
123static void check_busy_worker(struct btrfs_worker_thread *worker) 137static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
124{ 138{
125 if (worker->idle && atomic_read(&worker->num_pending) >= 139 if (wq->thresh == NO_THRESHOLD)
126 worker->workers->idle_thresh) { 140 return;
127 unsigned long flags; 141 atomic_inc(&wq->pending);
128 spin_lock_irqsave(&worker->workers->lock, flags);
129 worker->idle = 0;
130
131 if (!list_empty(&worker->worker_list) &&
132 !worker->workers->stopping) {
133 list_move_tail(&worker->worker_list,
134 &worker->workers->worker_list);
135 }
136 spin_unlock_irqrestore(&worker->workers->lock, flags);
137 }
138} 142}
139 143
140static void check_pending_worker_creates(struct btrfs_worker_thread *worker) 144/*
145 * Hook for threshold which will be called before executing the work.
146 * This hook is called in kthread context,
147 * so workqueue_set_max_active is called here.
148 */
149static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
141{ 150{
142 struct btrfs_workers *workers = worker->workers; 151 int new_max_active;
143 struct worker_start *start; 152 long pending;
144 unsigned long flags; 153 int need_change = 0;
145 154
146 rmb(); 155 if (wq->thresh == NO_THRESHOLD)
147 if (!workers->atomic_start_pending)
148 return; 156 return;
149 157
150 start = kzalloc(sizeof(*start), GFP_NOFS); 158 atomic_dec(&wq->pending);
151 if (!start) 159 spin_lock(&wq->thres_lock);
152 return; 160 /*
153 161 * Use wq->count to limit the calling frequency of
154 start->work.func = start_new_worker_func; 162 * workqueue_set_max_active.
155 start->queue = workers; 163 */
156 164 wq->count++;
157 spin_lock_irqsave(&workers->lock, flags); 165 wq->count %= (wq->thresh / 4);
158 if (!workers->atomic_start_pending) 166 if (!wq->count)
159 goto out; 167 goto out;
160 168 new_max_active = wq->current_max;
161 workers->atomic_start_pending = 0;
162 if (workers->num_workers + workers->num_workers_starting >=
163 workers->max_workers)
164 goto out;
165
166 workers->num_workers_starting += 1;
167 spin_unlock_irqrestore(&workers->lock, flags);
168 btrfs_queue_worker(workers->atomic_worker_start, &start->work);
169 return;
170 169
170 /*
171 * pending may change later, but that's OK since we don't
172 * need it to be exact to calculate new_max_active.
173 */
174 pending = atomic_read(&wq->pending);
175 if (pending > wq->thresh)
176 new_max_active++;
177 if (pending < wq->thresh / 2)
178 new_max_active--;
179 new_max_active = clamp_val(new_max_active, 1, wq->max_active);
180 if (new_max_active != wq->current_max) {
181 need_change = 1;
182 wq->current_max = new_max_active;
183 }
171out: 184out:
172 kfree(start); 185 spin_unlock(&wq->thres_lock);
173 spin_unlock_irqrestore(&workers->lock, flags); 186
187 if (need_change) {
188 workqueue_set_max_active(wq->normal_wq, wq->current_max);
189 }
174} 190}
175 191
176static noinline void run_ordered_completions(struct btrfs_workers *workers, 192static void run_ordered_work(struct __btrfs_workqueue *wq)
177 struct btrfs_work *work)
178{ 193{
179 if (!workers->ordered) 194 struct list_head *list = &wq->ordered_list;
180 return; 195 struct btrfs_work *work;
181 196 spinlock_t *lock = &wq->list_lock;
182 set_bit(WORK_DONE_BIT, &work->flags); 197 unsigned long flags;
183
184 spin_lock(&workers->order_lock);
185 198
186 while (1) { 199 while (1) {
187 if (!list_empty(&workers->prio_order_list)) { 200 spin_lock_irqsave(lock, flags);
188 work = list_entry(workers->prio_order_list.next, 201 if (list_empty(list))
189 struct btrfs_work, order_list);
190 } else if (!list_empty(&workers->order_list)) {
191 work = list_entry(workers->order_list.next,
192 struct btrfs_work, order_list);
193 } else {
194 break; 202 break;
195 } 203 work = list_entry(list->next, struct btrfs_work,
204 ordered_list);
196 if (!test_bit(WORK_DONE_BIT, &work->flags)) 205 if (!test_bit(WORK_DONE_BIT, &work->flags))
197 break; 206 break;
198 207
199 /* we are going to call the ordered done function, but 208 /*
209 * we are going to call the ordered done function, but
200 * we leave the work item on the list as a barrier so 210 * we leave the work item on the list as a barrier so
201 * that later work items that are done don't have their 211 * that later work items that are done don't have their
202 * functions called before this one returns 212 * functions called before this one returns
203 */ 213 */
204 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) 214 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
205 break; 215 break;
206 216 trace_btrfs_ordered_sched(work);
207 spin_unlock(&workers->order_lock); 217 spin_unlock_irqrestore(lock, flags);
208
209 work->ordered_func(work); 218 work->ordered_func(work);
210 219
211 /* now take the lock again and drop our item from the list */ 220 /* now take the lock again and drop our item from the list */
212 spin_lock(&workers->order_lock); 221 spin_lock_irqsave(lock, flags);
213 list_del(&work->order_list); 222 list_del(&work->ordered_list);
214 spin_unlock(&workers->order_lock); 223 spin_unlock_irqrestore(lock, flags);
215 224
216 /* 225 /*
217 * we don't want to call the ordered free functions 226 * we don't want to call the ordered free functions
218 * with the lock held though 227 * with the lock held though
219 */ 228 */
220 work->ordered_free(work); 229 work->ordered_free(work);
221 spin_lock(&workers->order_lock); 230 trace_btrfs_all_work_done(work);
222 }
223
224 spin_unlock(&workers->order_lock);
225}
226
227static void put_worker(struct btrfs_worker_thread *worker)
228{
229 if (atomic_dec_and_test(&worker->refs))
230 kfree(worker);
231}
232
233static int try_worker_shutdown(struct btrfs_worker_thread *worker)
234{
235 int freeit = 0;
236
237 spin_lock_irq(&worker->lock);
238 spin_lock(&worker->workers->lock);
239 if (worker->workers->num_workers > 1 &&
240 worker->idle &&
241 !worker->working &&
242 !list_empty(&worker->worker_list) &&
243 list_empty(&worker->prio_pending) &&
244 list_empty(&worker->pending) &&
245 atomic_read(&worker->num_pending) == 0) {
246 freeit = 1;
247 list_del_init(&worker->worker_list);
248 worker->workers->num_workers--;
249 } 231 }
250 spin_unlock(&worker->workers->lock); 232 spin_unlock_irqrestore(lock, flags);
251 spin_unlock_irq(&worker->lock);
252
253 if (freeit)
254 put_worker(worker);
255 return freeit;
256} 233}
257 234
258static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, 235static void normal_work_helper(struct work_struct *arg)
259 struct list_head *prio_head,
260 struct list_head *head)
261{
262 struct btrfs_work *work = NULL;
263 struct list_head *cur = NULL;
264
265 if (!list_empty(prio_head))
266 cur = prio_head->next;
267
268 smp_mb();
269 if (!list_empty(&worker->prio_pending))
270 goto refill;
271
272 if (!list_empty(head))
273 cur = head->next;
274
275 if (cur)
276 goto out;
277
278refill:
279 spin_lock_irq(&worker->lock);
280 list_splice_tail_init(&worker->prio_pending, prio_head);
281 list_splice_tail_init(&worker->pending, head);
282
283 if (!list_empty(prio_head))
284 cur = prio_head->next;
285 else if (!list_empty(head))
286 cur = head->next;
287 spin_unlock_irq(&worker->lock);
288
289 if (!cur)
290 goto out_fail;
291
292out:
293 work = list_entry(cur, struct btrfs_work, list);
294
295out_fail:
296 return work;
297}
298
299/*
300 * main loop for servicing work items
301 */
302static int worker_loop(void *arg)
303{ 236{
304 struct btrfs_worker_thread *worker = arg;
305 struct list_head head;
306 struct list_head prio_head;
307 struct btrfs_work *work; 237 struct btrfs_work *work;
238 struct __btrfs_workqueue *wq;
239 int need_order = 0;
308 240
309 INIT_LIST_HEAD(&head); 241 work = container_of(arg, struct btrfs_work, normal_work);
310 INIT_LIST_HEAD(&prio_head); 242 /*
311 243 * We should not touch things inside work in the following cases:
312 do { 244 * 1) after work->func() if it has no ordered_free
313again: 245 * Since the struct is freed in work->func().
314 while (1) { 246 * 2) after setting WORK_DONE_BIT
315 247 * The work may be freed in other threads almost instantly.
316 248 * So we save the needed things here.
317 work = get_next_work(worker, &prio_head, &head); 249 */
318 if (!work) 250 if (work->ordered_func)
319 break; 251 need_order = 1;
320 252 wq = work->wq;
321 list_del(&work->list); 253
322 clear_bit(WORK_QUEUED_BIT, &work->flags); 254 trace_btrfs_work_sched(work);
323 255 thresh_exec_hook(wq);
324 work->worker = worker; 256 work->func(work);
325 257 if (need_order) {
326 work->func(work); 258 set_bit(WORK_DONE_BIT, &work->flags);
327 259 run_ordered_work(wq);
328 atomic_dec(&worker->num_pending);
329 /*
330 * unless this is an ordered work queue,
331 * 'work' was probably freed by func above.
332 */
333 run_ordered_completions(worker->workers, work);
334
335 check_pending_worker_creates(worker);
336 cond_resched();
337 }
338
339 spin_lock_irq(&worker->lock);
340 check_idle_worker(worker);
341
342 if (freezing(current)) {
343 worker->working = 0;
344 spin_unlock_irq(&worker->lock);
345 try_to_freeze();
346 } else {
347 spin_unlock_irq(&worker->lock);
348 if (!kthread_should_stop()) {
349 cpu_relax();
350 /*
351 * we've dropped the lock, did someone else
352 * jump_in?
353 */
354 smp_mb();
355 if (!list_empty(&worker->pending) ||
356 !list_empty(&worker->prio_pending))
357 continue;
358
359 /*
360 * this short schedule allows more work to
361 * come in without the queue functions
362 * needing to go through wake_up_process()
363 *
364 * worker->working is still 1, so nobody
365 * is going to try and wake us up
366 */
367 schedule_timeout(1);
368 smp_mb();
369 if (!list_empty(&worker->pending) ||
370 !list_empty(&worker->prio_pending))
371 continue;
372
373 if (kthread_should_stop())
374 break;
375
376 /* still no more work?, sleep for real */
377 spin_lock_irq(&worker->lock);
378 set_current_state(TASK_INTERRUPTIBLE);
379 if (!list_empty(&worker->pending) ||
380 !list_empty(&worker->prio_pending)) {
381 spin_unlock_irq(&worker->lock);
382 set_current_state(TASK_RUNNING);
383 goto again;
384 }
385
386 /*
387 * this makes sure we get a wakeup when someone
388 * adds something new to the queue
389 */
390 worker->working = 0;
391 spin_unlock_irq(&worker->lock);
392
393 if (!kthread_should_stop()) {
394 schedule_timeout(HZ * 120);
395 if (!worker->working &&
396 try_worker_shutdown(worker)) {
397 return 0;
398 }
399 }
400 }
401 __set_current_state(TASK_RUNNING);
402 }
403 } while (!kthread_should_stop());
404 return 0;
405}
406
407/*
408 * this will wait for all the worker threads to shutdown
409 */
410void btrfs_stop_workers(struct btrfs_workers *workers)
411{
412 struct list_head *cur;
413 struct btrfs_worker_thread *worker;
414 int can_stop;
415
416 spin_lock_irq(&workers->lock);
417 workers->stopping = 1;
418 list_splice_init(&workers->idle_list, &workers->worker_list);
419 while (!list_empty(&workers->worker_list)) {
420 cur = workers->worker_list.next;
421 worker = list_entry(cur, struct btrfs_worker_thread,
422 worker_list);
423
424 atomic_inc(&worker->refs);
425 workers->num_workers -= 1;
426 if (!list_empty(&worker->worker_list)) {
427 list_del_init(&worker->worker_list);
428 put_worker(worker);
429 can_stop = 1;
430 } else
431 can_stop = 0;
432 spin_unlock_irq(&workers->lock);
433 if (can_stop)
434 kthread_stop(worker->task);
435 spin_lock_irq(&workers->lock);
436 put_worker(worker);
437 } 260 }
438 spin_unlock_irq(&workers->lock); 261 if (!need_order)
262 trace_btrfs_all_work_done(work);
439} 263}
440 264
441/* 265void btrfs_init_work(struct btrfs_work *work,
442 * simple init on struct btrfs_workers 266 btrfs_func_t func,
443 */ 267 btrfs_func_t ordered_func,
444void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 268 btrfs_func_t ordered_free)
445 struct btrfs_workers *async_helper)
446{ 269{
447 workers->num_workers = 0; 270 work->func = func;
448 workers->num_workers_starting = 0; 271 work->ordered_func = ordered_func;
449 INIT_LIST_HEAD(&workers->worker_list); 272 work->ordered_free = ordered_free;
450 INIT_LIST_HEAD(&workers->idle_list); 273 INIT_WORK(&work->normal_work, normal_work_helper);
451 INIT_LIST_HEAD(&workers->order_list); 274 INIT_LIST_HEAD(&work->ordered_list);
452 INIT_LIST_HEAD(&workers->prio_order_list); 275 work->flags = 0;
453 spin_lock_init(&workers->lock);
454 spin_lock_init(&workers->order_lock);
455 workers->max_workers = max;
456 workers->idle_thresh = 32;
457 workers->name = name;
458 workers->ordered = 0;
459 workers->atomic_start_pending = 0;
460 workers->atomic_worker_start = async_helper;
461 workers->stopping = 0;
462} 276}
463 277
464/* 278static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
465 * starts new worker threads. This does not enforce the max worker 279 struct btrfs_work *work)
466 * count in case you need to temporarily go past it.
467 */
468static int __btrfs_start_workers(struct btrfs_workers *workers)
469{ 280{
470 struct btrfs_worker_thread *worker; 281 unsigned long flags;
471 int ret = 0;
472
473 worker = kzalloc(sizeof(*worker), GFP_NOFS);
474 if (!worker) {
475 ret = -ENOMEM;
476 goto fail;
477 }
478
479 INIT_LIST_HEAD(&worker->pending);
480 INIT_LIST_HEAD(&worker->prio_pending);
481 INIT_LIST_HEAD(&worker->worker_list);
482 spin_lock_init(&worker->lock);
483
484 atomic_set(&worker->num_pending, 0);
485 atomic_set(&worker->refs, 1);
486 worker->workers = workers;
487 worker->task = kthread_create(worker_loop, worker,
488 "btrfs-%s-%d", workers->name,
489 workers->num_workers + 1);
490 if (IS_ERR(worker->task)) {
491 ret = PTR_ERR(worker->task);
492 goto fail;
493 }
494 282
495 spin_lock_irq(&workers->lock); 283 work->wq = wq;
496 if (workers->stopping) { 284 thresh_queue_hook(wq);
497 spin_unlock_irq(&workers->lock); 285 if (work->ordered_func) {
498 ret = -EINVAL; 286 spin_lock_irqsave(&wq->list_lock, flags);
499 goto fail_kthread; 287 list_add_tail(&work->ordered_list, &wq->ordered_list);
288 spin_unlock_irqrestore(&wq->list_lock, flags);
500 } 289 }
501 list_add_tail(&worker->worker_list, &workers->idle_list); 290 queue_work(wq->normal_wq, &work->normal_work);
502 worker->idle = 1; 291 trace_btrfs_work_queued(work);
503 workers->num_workers++;
504 workers->num_workers_starting--;
505 WARN_ON(workers->num_workers_starting < 0);
506 spin_unlock_irq(&workers->lock);
507
508 wake_up_process(worker->task);
509 return 0;
510
511fail_kthread:
512 kthread_stop(worker->task);
513fail:
514 kfree(worker);
515 spin_lock_irq(&workers->lock);
516 workers->num_workers_starting--;
517 spin_unlock_irq(&workers->lock);
518 return ret;
519} 292}
520 293
521int btrfs_start_workers(struct btrfs_workers *workers) 294void btrfs_queue_work(struct btrfs_workqueue *wq,
295 struct btrfs_work *work)
522{ 296{
523 spin_lock_irq(&workers->lock); 297 struct __btrfs_workqueue *dest_wq;
524 workers->num_workers_starting++;
525 spin_unlock_irq(&workers->lock);
526 return __btrfs_start_workers(workers);
527}
528
529/*
530 * run through the list and find a worker thread that doesn't have a lot
531 * to do right now. This can return null if we aren't yet at the thread
532 * count limit and all of the threads are busy.
533 */
534static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
535{
536 struct btrfs_worker_thread *worker;
537 struct list_head *next;
538 int enforce_min;
539
540 enforce_min = (workers->num_workers + workers->num_workers_starting) <
541 workers->max_workers;
542
543 /*
544 * if we find an idle thread, don't move it to the end of the
545 * idle list. This improves the chance that the next submission
546 * will reuse the same thread, and maybe catch it while it is still
547 * working
548 */
549 if (!list_empty(&workers->idle_list)) {
550 next = workers->idle_list.next;
551 worker = list_entry(next, struct btrfs_worker_thread,
552 worker_list);
553 return worker;
554 }
555 if (enforce_min || list_empty(&workers->worker_list))
556 return NULL;
557
558 /*
559 * if we pick a busy task, move the task to the end of the list.
560 * hopefully this will keep things somewhat evenly balanced.
561 * Do the move in batches based on the sequence number. This groups
562 * requests submitted at roughly the same time onto the same worker.
563 */
564 next = workers->worker_list.next;
565 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
566 worker->sequence++;
567 298
568 if (worker->sequence % workers->idle_thresh == 0) 299 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
569 list_move_tail(next, &workers->worker_list); 300 dest_wq = wq->high;
570 return worker; 301 else
302 dest_wq = wq->normal;
303 __btrfs_queue_work(dest_wq, work);
571} 304}
572 305
573/* 306static inline void
574 * selects a worker thread to take the next job. This will either find 307__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
575 * an idle worker, start a new worker up to the max count, or just return
576 * one of the existing busy workers.
577 */
578static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
579{ 308{
580 struct btrfs_worker_thread *worker; 309 destroy_workqueue(wq->normal_wq);
581 unsigned long flags; 310 trace_btrfs_workqueue_destroy(wq);
582 struct list_head *fallback; 311 kfree(wq);
583 int ret;
584
585 spin_lock_irqsave(&workers->lock, flags);
586again:
587 worker = next_worker(workers);
588
589 if (!worker) {
590 if (workers->num_workers + workers->num_workers_starting >=
591 workers->max_workers) {
592 goto fallback;
593 } else if (workers->atomic_worker_start) {
594 workers->atomic_start_pending = 1;
595 goto fallback;
596 } else {
597 workers->num_workers_starting++;
598 spin_unlock_irqrestore(&workers->lock, flags);
599 /* we're below the limit, start another worker */
600 ret = __btrfs_start_workers(workers);
601 spin_lock_irqsave(&workers->lock, flags);
602 if (ret)
603 goto fallback;
604 goto again;
605 }
606 }
607 goto found;
608
609fallback:
610 fallback = NULL;
611 /*
612 * we have failed to find any workers, just
613 * return the first one we can find.
614 */
615 if (!list_empty(&workers->worker_list))
616 fallback = workers->worker_list.next;
617 if (!list_empty(&workers->idle_list))
618 fallback = workers->idle_list.next;
619 BUG_ON(!fallback);
620 worker = list_entry(fallback,
621 struct btrfs_worker_thread, worker_list);
622found:
623 /*
624 * this makes sure the worker doesn't exit before it is placed
625 * onto a busy/idle list
626 */
627 atomic_inc(&worker->num_pending);
628 spin_unlock_irqrestore(&workers->lock, flags);
629 return worker;
630} 312}
631 313
632/* 314void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
633 * btrfs_requeue_work just puts the work item back on the tail of the list
634 * it was taken from. It is intended for use with long running work functions
635 * that make some progress and want to give the cpu up for others.
636 */
637void btrfs_requeue_work(struct btrfs_work *work)
638{ 315{
639 struct btrfs_worker_thread *worker = work->worker; 316 if (!wq)
640 unsigned long flags;
641 int wake = 0;
642
643 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
644 return; 317 return;
645 318 if (wq->high)
646 spin_lock_irqsave(&worker->lock, flags); 319 __btrfs_destroy_workqueue(wq->high);
647 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) 320 __btrfs_destroy_workqueue(wq->normal);
648 list_add_tail(&work->list, &worker->prio_pending); 321 kfree(wq);
649 else
650 list_add_tail(&work->list, &worker->pending);
651 atomic_inc(&worker->num_pending);
652
653 /* by definition we're busy, take ourselves off the idle
654 * list
655 */
656 if (worker->idle) {
657 spin_lock(&worker->workers->lock);
658 worker->idle = 0;
659 list_move_tail(&worker->worker_list,
660 &worker->workers->worker_list);
661 spin_unlock(&worker->workers->lock);
662 }
663 if (!worker->working) {
664 wake = 1;
665 worker->working = 1;
666 }
667
668 if (wake)
669 wake_up_process(worker->task);
670 spin_unlock_irqrestore(&worker->lock, flags);
671} 322}
672 323
673void btrfs_set_work_high_prio(struct btrfs_work *work) 324void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
674{ 325{
675 set_bit(WORK_HIGH_PRIO_BIT, &work->flags); 326 wq->normal->max_active = max;
327 if (wq->high)
328 wq->high->max_active = max;
676} 329}
677 330
678/* 331void btrfs_set_work_high_priority(struct btrfs_work *work)
679 * places a struct btrfs_work into the pending queue of one of the kthreads
680 */
681void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
682{ 332{
683 struct btrfs_worker_thread *worker; 333 set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
684 unsigned long flags;
685 int wake = 0;
686
687 /* don't requeue something already on a list */
688 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
689 return;
690
691 worker = find_worker(workers);
692 if (workers->ordered) {
693 /*
694 * you're not allowed to do ordered queues from an
695 * interrupt handler
696 */
697 spin_lock(&workers->order_lock);
698 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
699 list_add_tail(&work->order_list,
700 &workers->prio_order_list);
701 } else {
702 list_add_tail(&work->order_list, &workers->order_list);
703 }
704 spin_unlock(&workers->order_lock);
705 } else {
706 INIT_LIST_HEAD(&work->order_list);
707 }
708
709 spin_lock_irqsave(&worker->lock, flags);
710
711 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
712 list_add_tail(&work->list, &worker->prio_pending);
713 else
714 list_add_tail(&work->list, &worker->pending);
715 check_busy_worker(worker);
716
717 /*
718 * avoid calling into wake_up_process if this thread has already
719 * been kicked
720 */
721 if (!worker->working)
722 wake = 1;
723 worker->working = 1;
724
725 if (wake)
726 wake_up_process(worker->task);
727 spin_unlock_irqrestore(&worker->lock, flags);
728} 334}
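
The rewrite above replaces the hand-rolled kthread pool with thin wrappers around the kernel's generic workqueues, reducing the public surface to alloc/init/queue/destroy. A hedged usage sketch of the resulting API; struct my_work, my_func, and the flag/limit values are assumptions for illustration, not code from the patch:

	/* Embed a btrfs_work in your own struct and recover it
	 * with container_of from the work callback. */
	struct my_work {
		struct btrfs_work work;
		int payload;
	};

	static void my_func(struct btrfs_work *work)
	{
		struct my_work *mw = container_of(work, struct my_work, work);

		pr_info("processing payload %d\n", mw->payload);
		kfree(mw);	/* no ordered_free, so func owns the struct */
	}

	static int example(void)
	{
		struct btrfs_workqueue *wq;
		struct my_work *mw;

		/* thresh == 0 selects the default threshold (32) */
		wq = btrfs_alloc_workqueue("example", WQ_UNBOUND, 8, 0);
		if (!wq)
			return -ENOMEM;

		mw = kzalloc(sizeof(*mw), GFP_NOFS);
		if (!mw) {
			btrfs_destroy_workqueue(wq);
			return -ENOMEM;
		}
		mw->payload = 42;
		/* NULL ordered hooks: completion order is unconstrained */
		btrfs_init_work(&mw->work, my_func, NULL, NULL);
		btrfs_queue_work(wq, &mw->work);
		return 0;
	}
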
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683ed..9c6b66d15fb0 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * Copyright (C) 2014 Fujitsu. All rights reserved.
3 * 4 *
4 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 6 * modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
 #ifndef __BTRFS_ASYNC_THREAD_
 #define __BTRFS_ASYNC_THREAD_
 
-struct btrfs_worker_thread;
+struct btrfs_workqueue;
+/* Internal use only */
+struct __btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
 
-/*
- * This is similar to a workqueue, but it is meant to spread the operations
- * across all available cpus instead of just the CPU that was used to
- * queue the work. There is also some batching introduced to try and
- * cut down on context switches.
- *
- * By default threads are added on demand up to 2 * the number of cpus.
- * Changing struct btrfs_workers->max_workers is one way to prevent
- * demand creation of kthreads.
- *
- * the basic model of these worker threads is to embed a btrfs_work
- * structure in your own data struct, and use container_of in a
- * work function to get back to your data struct.
- */
 struct btrfs_work {
-	/*
-	 * func should be set to the function you want called
-	 * your work struct is passed as the only arg
-	 *
-	 * ordered_func must be set for work sent to an ordered work queue,
-	 * and it is called to complete a given work item in the same
-	 * order they were sent to the queue.
-	 */
-	void (*func)(struct btrfs_work *work);
-	void (*ordered_func)(struct btrfs_work *work);
-	void (*ordered_free)(struct btrfs_work *work);
-
-	/*
-	 * flags should be set to zero. It is used to make sure the
-	 * struct is only inserted once into the list.
-	 */
+	btrfs_func_t func;
+	btrfs_func_t ordered_func;
+	btrfs_func_t ordered_free;
+
+	/* Don't touch things below */
+	struct work_struct normal_work;
+	struct list_head ordered_list;
+	struct __btrfs_workqueue *wq;
 	unsigned long flags;
-
-	/* don't touch these */
-	struct btrfs_worker_thread *worker;
-	struct list_head list;
-	struct list_head order_list;
-};
-
-struct btrfs_workers {
-	/* current number of running workers */
-	int num_workers;
-
-	int num_workers_starting;
-
-	/* max number of workers allowed. changed by btrfs_start_workers */
-	int max_workers;
-
-	/* once a worker has this many requests or fewer, it is idle */
-	int idle_thresh;
-
-	/* force completions in the order they were queued */
-	int ordered;
-
-	/* more workers required, but in an interrupt handler */
-	int atomic_start_pending;
-
-	/*
-	 * are we allowed to sleep while starting workers or are we required
-	 * to start them at a later time? If we can't sleep, this indicates
-	 * which queue we need to use to schedule thread creation.
-	 */
-	struct btrfs_workers *atomic_worker_start;
-
-	/* list with all the work threads. The workers on the idle thread
-	 * may be actively servicing jobs, but they haven't yet hit the
-	 * idle thresh limit above.
-	 */
-	struct list_head worker_list;
-	struct list_head idle_list;
-
-	/*
-	 * when operating in ordered mode, this maintains the list
-	 * of work items waiting for completion
-	 */
-	struct list_head order_list;
-	struct list_head prio_order_list;
-
-	/* lock for finding the next worker thread to queue on */
-	spinlock_t lock;
-
-	/* lock for the ordered lists */
-	spinlock_t order_lock;
-
-	/* extra name for this worker, used for current->name */
-	char *name;
-
-	int stopping;
 };
 
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers);
-void btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
-			struct btrfs_workers *async_starter);
-void btrfs_requeue_work(struct btrfs_work *work);
-void btrfs_set_work_high_prio(struct btrfs_work *work);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+					      int flags,
+					      int max_active,
+					      int thresh);
+void btrfs_init_work(struct btrfs_work *work,
+		     btrfs_func_t func,
+		     btrfs_func_t ordered_func,
+		     btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+		      struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+void btrfs_set_work_high_priority(struct btrfs_work *work);
 #endif
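
The conversion pattern used by every caller later in this diff: embed a struct btrfs_work in a private context, bind the callbacks once with btrfs_init_work(), then hand the work to a btrfs_workqueue with btrfs_queue_work(). A minimal sketch (the context struct and work function names here are hypothetical; only the btrfs_* calls come from this header):

	struct my_async_ctx {
		struct btrfs_fs_info *fs_info;	/* caller's private state */
		struct btrfs_work work;		/* embedded work item */
	};

	static void my_work_fn(struct btrfs_work *work)
	{
		/* container_of() recovers the private context */
		struct my_async_ctx *ctx =
			container_of(work, struct my_async_ctx, work);
		/* ... do the deferred processing on ctx ... */
	}

	/* ordered_func/ordered_free stay NULL for plain, unordered queues */
	btrfs_init_work(&ctx->work, my_work_fn, NULL, NULL);
	btrfs_queue_work(ctx->fs_info->workers, &ctx->work);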
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aded3ef3d3d4..aad7201ad11b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			   struct ulist *parents, struct __prelim_ref *ref,
-			   int level, u64 time_seq, const u64 *extent_item_pos)
+			   int level, u64 time_seq, const u64 *extent_item_pos,
+			   u64 total_refs)
 {
 	int ret = 0;
 	int slot;
@@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
 		ret = btrfs_next_old_leaf(root, path, time_seq);
 
-	while (!ret && count < ref->count) {
+	while (!ret && count < total_refs) {
 		eb = path->nodes[0];
 		slot = path->slots[0];
 
@@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 				struct btrfs_path *path, u64 time_seq,
 				struct __prelim_ref *ref,
 				struct ulist *parents,
-				const u64 *extent_item_pos)
+				const u64 *extent_item_pos, u64 total_refs)
 {
 	struct btrfs_root *root;
 	struct btrfs_key root_key;
@@ -361,7 +362,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	}
 
 	ret = add_all_parents(root, path, parents, ref, level, time_seq,
-			      extent_item_pos);
+			      extent_item_pos, total_refs);
 out:
 	path->lowest_level = 0;
 	btrfs_release_path(path);
@@ -374,7 +375,7 @@ out:
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
-				   const u64 *extent_item_pos)
+				   const u64 *extent_item_pos, u64 total_refs)
 {
 	int err;
 	int ret = 0;
@@ -400,7 +401,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		if (ref->count == 0)
 			continue;
 		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
-					     parents, extent_item_pos);
+					     parents, extent_item_pos,
+					     total_refs);
 		/*
 		 * we can only tolerate ENOENT,otherwise,we should catch error
 		 * and return directly.
@@ -557,7 +559,7 @@ static void __merge_refs(struct list_head *head, int mode)
  * smaller or equal that seq to the list
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			      struct list_head *prefs)
+			      struct list_head *prefs, u64 *total_refs)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct rb_node *n = &head->node.rb_node;
@@ -593,6 +595,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		default:
 			BUG_ON(1);
 		}
+		*total_refs += (node->ref_mod * sgn);
 		switch (node->type) {
 		case BTRFS_TREE_BLOCK_REF_KEY: {
 			struct btrfs_delayed_tree_ref *ref;
@@ -653,7 +656,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  */
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
-			     int *info_level, struct list_head *prefs)
+			     int *info_level, struct list_head *prefs,
+			     u64 *total_refs)
 {
 	int ret = 0;
 	int slot;
@@ -677,6 +681,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
+	*total_refs += btrfs_extent_refs(leaf, ei);
 	btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 	ptr = (unsigned long)(ei + 1);
@@ -859,6 +864,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct list_head prefs;
 	struct __prelim_ref *ref;
 	struct extent_inode_elem *eie = NULL;
+	u64 total_refs = 0;
 
 	INIT_LIST_HEAD(&prefs);
 	INIT_LIST_HEAD(&prefs_delayed);
@@ -873,8 +879,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	if (!trans)
+	if (!trans) {
 		path->search_commit_root = 1;
+		path->skip_locking = 1;
+	}
 
 	/*
 	 * grab both a lock on the path and a lock on the delayed ref head.
@@ -915,7 +923,7 @@ again:
 		}
 		spin_unlock(&delayed_refs->lock);
 		ret = __add_delayed_refs(head, time_seq,
-					 &prefs_delayed);
+					 &prefs_delayed, &total_refs);
 		mutex_unlock(&head->mutex);
 		if (ret)
 			goto out;
@@ -936,7 +944,8 @@ again:
 		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
-						&info_level, &prefs);
+						&info_level, &prefs,
+						&total_refs);
 			if (ret)
 				goto out;
 			ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -956,7 +965,7 @@ again:
 	__merge_refs(&prefs, 1);
 
 	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
-				      extent_item_pos);
+				      extent_item_pos, total_refs);
 	if (ret)
 		goto out;
 
@@ -965,7 +974,7 @@ again:
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		WARN_ON(ref->count < 0);
-		if (ref->count && ref->root_id && ref->parent == 0) {
+		if (roots && ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
@@ -1061,22 +1070,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 				u64 time_seq, struct ulist **leafs,
 				const u64 *extent_item_pos)
 {
-	struct ulist *tmp;
 	int ret;
 
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp)
-		return -ENOMEM;
 	*leafs = ulist_alloc(GFP_NOFS);
-	if (!*leafs) {
-		ulist_free(tmp);
+	if (!*leafs)
 		return -ENOMEM;
-	}
 
 	ret = find_parent_nodes(trans, fs_info, bytenr,
-				time_seq, *leafs, tmp, extent_item_pos);
-	ulist_free(tmp);
-
+				time_seq, *leafs, NULL, extent_item_pos);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1333,38 +1334,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	if (ret < 0)
 		return ret;
 
-	while (1) {
-		u32 nritems;
-		if (path->slots[0] == 0) {
-			btrfs_set_path_blocking(path);
-			ret = btrfs_prev_leaf(fs_info->extent_root, path);
-			if (ret != 0) {
-				if (ret > 0) {
-					pr_debug("logical %llu is not within "
-						 "any extent\n", logical);
-					ret = -ENOENT;
-				}
-				return ret;
-			}
-		} else {
-			path->slots[0]--;
-		}
-		nritems = btrfs_header_nritems(path->nodes[0]);
-		if (nritems == 0) {
-			pr_debug("logical %llu is not within any extent\n",
-				 logical);
-			return -ENOENT;
-		}
-		if (path->slots[0] == nritems)
-			path->slots[0]--;
-
-		btrfs_item_key_to_cpu(path->nodes[0], found_key,
-				      path->slots[0]);
-		if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
-		    found_key->type == BTRFS_METADATA_ITEM_KEY)
-			break;
+	ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		return ret;
 	}
-
+	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
 		size = fs_info->extent_root->leafsize;
 	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
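
Two independent improvements are visible above. First, find_parent_nodes() now tolerates a NULL roots ulist, so btrfs_find_all_leafs() drops its throwaway tmp ulist entirely. Second, the walk in add_all_parents() is bounded by total_refs, the full reference count for the extent, accumulated from both the inline extent item and any pending delayed refs instead of from a single __prelim_ref. A hedged sketch of that accumulation (the values are illustrative):

	u64 total_refs = 0;
	/* inline: the extent item's on-disk ref count, e.g. 3 */
	total_refs += btrfs_extent_refs(leaf, ei);
	/* delayed: each queued node adjusts by ref_mod * sign, e.g. +1 */
	total_refs += (node->ref_mod * sgn);
	/* add_all_parents() now loops while (count < total_refs), so refs
	 * belonging to other roots no longer truncate the walk early */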
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8fed2125689e..c9a24444ec9a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -109,14 +109,17 @@ struct btrfs_inode {
 	u64 last_trans;
 
 	/*
-	 * log transid when this inode was last modified
+	 * transid that last logged this inode
 	 */
-	u64 last_sub_trans;
+	u64 logged_trans;
 
 	/*
-	 * transid that last logged this inode
+	 * log transid when this inode was last modified
 	 */
-	u64 logged_trans;
+	int last_sub_trans;
+
+	/* a local copy of root's last_log_commit */
+	int last_log_commit;
 
 	/* total number of bytes pending delalloc, used by stat to calc the
 	 * real block usage of the file
@@ -155,9 +158,6 @@ struct btrfs_inode {
 	/* flags field from the on disk inode */
 	u32 flags;
 
-	/* a local copy of root's last_log_commit */
-	unsigned long last_log_commit;
-
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such. outstanding_extents is the number of extent
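
last_sub_trans and last_log_commit shrink to int and move next to logged_trans, so the three fields fsync compares sit together. Roughly how they are consulted, paraphrasing the btrfs_inode_in_log() helper that lives elsewhere in this header (a sketch, not part of this hunk):

	/* sketch: fsync can skip logging when nothing changed since the
	 * last log commit within this transaction */
	if (BTRFS_I(inode)->logged_trans == generation &&
	    BTRFS_I(inode)->last_sub_trans <=
	    BTRFS_I(inode)->last_log_commit)
		return 1;	/* already fully represented in the log tree */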
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b01fb6c527e3..d43c544d3b68 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	rcu_read_lock();
 	page = radix_tree_lookup(&mapping->page_tree, pg_index);
 	rcu_read_unlock();
-	if (page) {
+	if (page && !radix_tree_exceptional_entry(page)) {
 		misses++;
 		if (misses > 4)
 			break;
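
This one-line fix matters because the page cache can now store shadow (exceptional) entries in the radix tree for recently evicted pages, so a non-NULL lookup result no longer proves a struct page is present. The defensive idiom, as a sketch:

	rcu_read_lock();
	page = radix_tree_lookup(&mapping->page_tree, pg_index);
	rcu_read_unlock();
	if (page && radix_tree_exceptional_entry(page))
		page = NULL;	/* shadow entry, not a real page */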
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa68..88d1b1eedc9c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5376,6 +5376,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 	int advance_right;
 	u64 left_blockptr;
 	u64 right_blockptr;
+	u64 left_gen;
+	u64 right_gen;
 	u64 left_start_ctransid;
 	u64 right_start_ctransid;
 	u64 ctransid;
@@ -5640,7 +5642,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 				right_blockptr = btrfs_node_blockptr(
 					right_path->nodes[right_level],
 					right_path->slots[right_level]);
-				if (left_blockptr == right_blockptr) {
+				left_gen = btrfs_node_ptr_generation(
+					left_path->nodes[left_level],
+					left_path->slots[left_level]);
+				right_gen = btrfs_node_ptr_generation(
+					right_path->nodes[right_level],
+					right_path->slots[right_level]);
+				if (left_blockptr == right_blockptr &&
+				    left_gen == right_gen) {
 					/*
 					 * As we're on a shared block, don't
 					 * allow to go deeper.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c1a42ca519f..bc96c03dd259 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FS_STATE_ERROR		0
 #define BTRFS_FS_STATE_REMOUNTING	1
 #define BTRFS_FS_STATE_TRANS_ABORTED	2
+#define BTRFS_FS_STATE_DEV_REPLACING	3
 
 /* Super block flags */
 /* Errors detected */
@@ -1489,6 +1490,7 @@ struct btrfs_fs_info {
 	 */
 	struct list_head ordered_roots;
 
+	struct mutex delalloc_root_mutex;
 	spinlock_t delalloc_root_lock;
 	/* all fs/file tree roots that have delalloc inodes. */
 	struct list_head delalloc_roots;
@@ -1503,28 +1505,27 @@ struct btrfs_fs_info {
 	 * A third pool does submit_bio to avoid deadlocking with the other
 	 * two
 	 */
-	struct btrfs_workers generic_worker;
-	struct btrfs_workers workers;
-	struct btrfs_workers delalloc_workers;
-	struct btrfs_workers flush_workers;
-	struct btrfs_workers endio_workers;
-	struct btrfs_workers endio_meta_workers;
-	struct btrfs_workers endio_raid56_workers;
-	struct btrfs_workers rmw_workers;
-	struct btrfs_workers endio_meta_write_workers;
-	struct btrfs_workers endio_write_workers;
-	struct btrfs_workers endio_freespace_worker;
-	struct btrfs_workers submit_workers;
-	struct btrfs_workers caching_workers;
-	struct btrfs_workers readahead_workers;
+	struct btrfs_workqueue *workers;
+	struct btrfs_workqueue *delalloc_workers;
+	struct btrfs_workqueue *flush_workers;
+	struct btrfs_workqueue *endio_workers;
+	struct btrfs_workqueue *endio_meta_workers;
+	struct btrfs_workqueue *endio_raid56_workers;
+	struct btrfs_workqueue *rmw_workers;
+	struct btrfs_workqueue *endio_meta_write_workers;
+	struct btrfs_workqueue *endio_write_workers;
+	struct btrfs_workqueue *endio_freespace_worker;
+	struct btrfs_workqueue *submit_workers;
+	struct btrfs_workqueue *caching_workers;
+	struct btrfs_workqueue *readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write. It happens
 	 * for the sys_munmap function call path
 	 */
-	struct btrfs_workers fixup_workers;
-	struct btrfs_workers delayed_workers;
+	struct btrfs_workqueue *fixup_workers;
+	struct btrfs_workqueue *delayed_workers;
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
@@ -1604,9 +1605,9 @@ struct btrfs_fs_info {
 	atomic_t scrub_cancel_req;
 	wait_queue_head_t scrub_pause_wait;
 	int scrub_workers_refcnt;
-	struct btrfs_workers scrub_workers;
-	struct btrfs_workers scrub_wr_completion_workers;
-	struct btrfs_workers scrub_nocow_workers;
+	struct btrfs_workqueue *scrub_workers;
+	struct btrfs_workqueue *scrub_wr_completion_workers;
+	struct btrfs_workqueue *scrub_nocow_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
@@ -1647,7 +1648,7 @@ struct btrfs_fs_info {
 	/* qgroup rescan items */
 	struct mutex qgroup_rescan_lock; /* protects the progress item */
 	struct btrfs_key qgroup_rescan_progress;
-	struct btrfs_workers qgroup_rescan_workers;
+	struct btrfs_workqueue *qgroup_rescan_workers;
 	struct completion qgroup_rescan_completion;
 	struct btrfs_work qgroup_rescan_work;
 
@@ -1674,10 +1675,18 @@ struct btrfs_fs_info {
 
 	atomic_t mutually_exclusive_operation_running;
 
+	struct percpu_counter bio_counter;
+	wait_queue_head_t replace_wait;
+
 	struct semaphore uuid_tree_rescan_sem;
 	unsigned int update_uuid_tree_gen:1;
 };
 
+struct btrfs_subvolume_writers {
+	struct percpu_counter counter;
+	wait_queue_head_t wait;
+};
+
 /*
  * in ram representation of the tree. extent_root is used for all allocations
  * and for the extent tree extent_root root.
@@ -1714,11 +1723,15 @@ struct btrfs_root {
 	struct mutex log_mutex;
 	wait_queue_head_t log_writer_wait;
 	wait_queue_head_t log_commit_wait[2];
+	struct list_head log_ctxs[2];
 	atomic_t log_writers;
 	atomic_t log_commit[2];
 	atomic_t log_batch;
-	unsigned long log_transid;
-	unsigned long last_log_commit;
+	int log_transid;
+	/* No matter the commit succeeds or not*/
+	int log_transid_committed;
+	/* Just be updated when the commit succeeds. */
+	int last_log_commit;
 	pid_t log_start_pid;
 	bool log_multiple_pids;
 
@@ -1793,6 +1806,7 @@ struct btrfs_root {
 	spinlock_t root_item_lock;
 	atomic_t refs;
 
+	struct mutex delalloc_mutex;
 	spinlock_t delalloc_lock;
 	/*
 	 * all of the inodes that have delalloc bytes. It is possible for
@@ -1802,6 +1816,8 @@ struct btrfs_root {
 	struct list_head delalloc_inodes;
 	struct list_head delalloc_root;
 	u64 nr_delalloc_inodes;
+
+	struct mutex ordered_extent_mutex;
 	/*
 	 * this is used by the balancing code to wait for all the pending
 	 * ordered extents
@@ -1822,6 +1838,8 @@ struct btrfs_root {
 	 * manipulation with the read-only status via SUBVOL_SETFLAGS
 	 */
 	int send_in_progress;
+	struct btrfs_subvolume_writers *subv_writers;
+	atomic_t will_be_snapshoted;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -3346,6 +3364,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
+
+int btrfs_start_nocow_write(struct btrfs_root *root);
+void btrfs_end_nocow_write(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -3723,7 +3744,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+			       int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -4005,6 +4027,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress);
 
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+
 /* reada.c */
 struct reada_control {
 	struct btrfs_root *root;		/* tree to prefetch */
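
The new subv_writers counter and will_be_snapshoted flag back the btrfs_start_nocow_write()/btrfs_end_nocow_write() pair declared above: nocow writers register themselves so snapshot creation can flush and wait for them, and a pending snapshot refuses new nocow writers. A hedged sketch of the intended pairing (the real implementation lands elsewhere in this series; the fallback call is hypothetical):

	if (!btrfs_start_nocow_write(root)) {
		/* a snapshot is being taken: this write must go through COW */
		return fallback_to_cow_write(inode);	/* hypothetical */
	}
	/* ... submit the in-place (nocow) write ... */
	btrfs_end_nocow_write(root);	/* may wake a waiting snapshotter */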
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6c..33e561a84013 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
 		return -ENOMEM;
 
 	async_work->delayed_root = delayed_root;
-	async_work->work.func = btrfs_async_run_delayed_root;
-	async_work->work.flags = 0;
+	btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
+			NULL, NULL);
 	async_work->nr = nr;
 
-	btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work);
+	btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
 	return 0;
 }
 
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index f3bff89eecf0..31299646024d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
  */
 static struct btrfs_delayed_ref_head *
 find_ref_head(struct rb_root *root, u64 bytenr,
-	      struct btrfs_delayed_ref_head **last, int return_bigger)
+	      int return_bigger)
 {
 	struct rb_node *n;
 	struct btrfs_delayed_ref_head *entry;
-	int cmp = 0;
 
-again:
 	n = root->rb_node;
 	entry = NULL;
 	while (n) {
 		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
-		if (last)
-			*last = entry;
 
 		if (bytenr < entry->node.bytenr)
-			cmp = -1;
-		else if (bytenr > entry->node.bytenr)
-			cmp = 1;
-		else
-			cmp = 0;
-
-		if (cmp < 0)
 			n = n->rb_left;
-		else if (cmp > 0)
+		else if (bytenr > entry->node.bytenr)
 			n = n->rb_right;
 		else
 			return entry;
 	}
 	if (entry && return_bigger) {
-		if (cmp > 0) {
+		if (bytenr > entry->node.bytenr) {
 			n = rb_next(&entry->href_node);
 			if (!n)
 				n = rb_first(root);
 			entry = rb_entry(n, struct btrfs_delayed_ref_head,
 					 href_node);
-			bytenr = entry->node.bytenr;
-			return_bigger = 0;
-			goto again;
+			return entry;
 		}
 		return entry;
 	}
@@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
 
 again:
 	start = delayed_refs->run_delayed_start;
-	head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
+	head = find_ref_head(&delayed_refs->href_root, start, 1);
 	if (!head && !loop) {
 		delayed_refs->run_delayed_start = 0;
 		start = 0;
 		loop = true;
-		head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
+		head = find_ref_head(&delayed_refs->href_root, start, 1);
 		if (!head)
 			return NULL;
 	} else if (!head && loop) {
@@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	ref = btrfs_delayed_node_to_head(update);
 	BUG_ON(existing_ref->is_data != ref->is_data);
 
+	spin_lock(&existing_ref->lock);
 	if (ref->must_insert_reserved) {
 		/* if the extent was freed and then
 		 * reallocated before the delayed ref
@@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	 * only need the lock for this case cause we could be processing it
 	 * currently, for refs we just added we know we're a-ok.
 	 */
-	spin_lock(&existing_ref->lock);
 	existing->ref_mod += update->ref_mod;
 	spin_unlock(&existing_ref->lock);
 }
@@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 	struct btrfs_delayed_ref_root *delayed_refs;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0);
+	return find_ref_head(&delayed_refs->href_root, bytenr, 0);
 }
 
 void btrfs_delayed_ref_exit(void)
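
With the last parameter gone and cmp recomputed inline, find_ref_head() keeps one useful behaviour: when return_bigger is set and bytenr lies past the last head, it wraps to rb_first() and returns immediately instead of looping back through the old again: label. That is exactly what btrfs_select_ref_head() needs for its circular scan, sketched here (the advance of run_delayed_start is paraphrased from that caller):

	head = find_ref_head(&delayed_refs->href_root, start, 1);
	if (head)
		/* continue the scan after this head next time around */
		delayed_refs->run_delayed_start = head->node.bytenr +
			head->node.num_bytes;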
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564c92638b20..9f2290509aca 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -431,6 +431,35 @@ leave_no_lock:
 	return ret;
 }
 
+/*
+ * blocked until all flighting bios are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+	s64 writers;
+	DEFINE_WAIT(wait);
+
+	set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+	do {
+		prepare_to_wait(&fs_info->replace_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		writers = percpu_counter_sum(&fs_info->bio_counter);
+		if (writers)
+			schedule();
+		finish_wait(&fs_info->replace_wait, &wait);
+	} while (writers);
+}
+
+/*
+ * we have removed target device, it is safe to allow new bios request.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+	if (waitqueue_active(&fs_info->replace_wait))
+		wake_up(&fs_info->replace_wait);
+}
+
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 					int scrub_ret)
 {
@@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	src_device = dev_replace->srcdev;
 	btrfs_dev_replace_unlock(dev_replace);
 
-	/* replace old device with new one in mapping tree */
-	if (!scrub_ret)
-		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
-								src_device,
-								tgt_device);
-
 	/*
 	 * flush all outstanding I/O and inode extent mappings before the
 	 * copy operation is declared as being finished
 	 */
-	ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+	ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
 	if (ret) {
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
@@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	WARN_ON(ret);
 
 	/* keep away write_all_supers() during the finishing procedure */
+	mutex_lock(&root->fs_info->chunk_mutex);
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	btrfs_dev_replace_lock(dev_replace);
 	dev_replace->replace_state =
@@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	dev_replace->time_stopped = get_seconds();
 	dev_replace->item_needs_writeback = 1;
 
-	if (scrub_ret) {
+	/* replace old device with new one in mapping tree */
+	if (!scrub_ret) {
+		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+								src_device,
+								tgt_device);
+	} else {
 		printk_in_rcu(KERN_ERR
 			      "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
 			      src_device->missing ? "<missing disk>" :
@@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 			      rcu_str_deref(tgt_device->name), scrub_ret);
 		btrfs_dev_replace_unlock(dev_replace);
 		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+		mutex_unlock(&root->fs_info->chunk_mutex);
 		if (tgt_device)
 			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	fs_info->fs_devices->latest_bdev = tgt_device->bdev;
 	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 
+	btrfs_rm_dev_replace_blocked(fs_info);
+
 	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
 
+	btrfs_rm_dev_replace_unblocked(fs_info);
+
 	/*
 	 * this is again a consistent state where no dev_replace procedure
 	 * is running, the target device is part of the filesystem, the
@@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	 */
 	btrfs_dev_replace_unlock(dev_replace);
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
 
 	/* write back the superblocks */
 	trans = btrfs_start_transaction(root, 0);
@@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
 		mutex_unlock(&dev_replace->lock_management_lock);
 	}
 }
+
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
+{
+	percpu_counter_inc(&fs_info->bio_counter);
+}
+
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+	percpu_counter_dec(&fs_info->bio_counter);
+
+	if (waitqueue_active(&fs_info->replace_wait))
+		wake_up(&fs_info->replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+	DEFINE_WAIT(wait);
+again:
+	percpu_counter_inc(&fs_info->bio_counter);
+	if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+		btrfs_bio_counter_dec(fs_info);
+		wait_event(fs_info->replace_wait,
+			   !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+				     &fs_info->fs_state));
+		goto again;
+	}
+
+}
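
bio_counter is a percpu counter rather than a shared atomic so the common inc/dec path stays cache-local; only btrfs_rm_dev_replace_blocked() pays for a full percpu_counter_sum(). The guard is meant to bracket every bio that could touch the device being replaced, presumably like this in the submission path (a sketch; the exact call site is in the volumes.c part of this series, not shown here):

	btrfs_bio_counter_inc_blocked(fs_info);	/* sleeps while a replace
						 * is being torn down */
	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
	btrfs_bio_counter_dec(fs_info);		/* may wake the replacer */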
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81ea55314b1f..bd0f752b797b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -678,32 +678,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
 
 	fs_info = end_io_wq->info;
 	end_io_wq->error = err;
-	end_io_wq->work.func = end_workqueue_fn;
-	end_io_wq->work.flags = 0;
+	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
 
 	if (bio->bi_rw & REQ_WRITE) {
 		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
-			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_meta_write_workers,
+					 &end_io_wq->work);
 		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
-			btrfs_queue_worker(&fs_info->endio_freespace_worker,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_freespace_worker,
+					 &end_io_wq->work);
 		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
-			btrfs_queue_worker(&fs_info->endio_raid56_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_raid56_workers,
+					 &end_io_wq->work);
 		else
-			btrfs_queue_worker(&fs_info->endio_write_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_write_workers,
+					 &end_io_wq->work);
 	} else {
 		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
-			btrfs_queue_worker(&fs_info->endio_raid56_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_raid56_workers,
+					 &end_io_wq->work);
 		else if (end_io_wq->metadata)
-			btrfs_queue_worker(&fs_info->endio_meta_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_meta_workers,
+					 &end_io_wq->work);
 		else
-			btrfs_queue_worker(&fs_info->endio_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_workers,
+					 &end_io_wq->work);
 	}
 }
 
@@ -738,7 +737,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
 {
 	unsigned long limit = min_t(unsigned long,
-				    info->workers.max_workers,
+				    info->thread_pool_size,
 				    info->fs_devices->open_devices);
 	return 256 * limit;
 }
@@ -811,11 +810,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->submit_bio_start = submit_bio_start;
 	async->submit_bio_done = submit_bio_done;
 
-	async->work.func = run_one_async_start;
-	async->work.ordered_func = run_one_async_done;
-	async->work.ordered_free = run_one_async_free;
+	btrfs_init_work(&async->work, run_one_async_start,
+			run_one_async_done, run_one_async_free);
 
-	async->work.flags = 0;
 	async->bio_flags = bio_flags;
 	async->bio_offset = bio_offset;
 
@@ -824,9 +821,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	atomic_inc(&fs_info->nr_async_submits);
 
 	if (rw & REQ_SYNC)
-		btrfs_set_work_high_prio(&async->work);
+		btrfs_set_work_high_priority(&async->work);
 
-	btrfs_queue_worker(&fs_info->workers, &async->work);
+	btrfs_queue_work(fs_info->workers, &async->work);
 
 	while (atomic_read(&fs_info->async_submit_draining) &&
 	       atomic_read(&fs_info->nr_async_submits)) {
@@ -1149,6 +1146,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	}
 }
 
+static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+{
+	struct btrfs_subvolume_writers *writers;
+	int ret;
+
+	writers = kmalloc(sizeof(*writers), GFP_NOFS);
+	if (!writers)
+		return ERR_PTR(-ENOMEM);
+
+	ret = percpu_counter_init(&writers->counter, 0);
+	if (ret < 0) {
+		kfree(writers);
+		return ERR_PTR(ret);
+	}
+
+	init_waitqueue_head(&writers->wait);
+	return writers;
+}
+
+static void
+btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+{
+	percpu_counter_destroy(&writers->counter);
+	kfree(writers);
+}
+
 static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 			 u32 stripesize, struct btrfs_root *root,
 			 struct btrfs_fs_info *fs_info,
@@ -1194,16 +1217,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->log_extents_lock[1]);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
+	mutex_init(&root->ordered_extent_mutex);
+	mutex_init(&root->delalloc_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
 	init_waitqueue_head(&root->log_commit_wait[0]);
 	init_waitqueue_head(&root->log_commit_wait[1]);
+	INIT_LIST_HEAD(&root->log_ctxs[0]);
+	INIT_LIST_HEAD(&root->log_ctxs[1]);
 	atomic_set(&root->log_commit[0], 0);
 	atomic_set(&root->log_commit[1], 0);
 	atomic_set(&root->log_writers, 0);
 	atomic_set(&root->log_batch, 0);
 	atomic_set(&root->orphan_inodes, 0);
 	atomic_set(&root->refs, 1);
+	atomic_set(&root->will_be_snapshoted, 0);
 	root->log_transid = 0;
+	root->log_transid_committed = -1;
 	root->last_log_commit = 0;
 	if (fs_info)
 		extent_io_tree_init(&root->dirty_log_pages,
@@ -1417,6 +1446,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(root->log_root);
 	root->log_root = log_root;
 	root->log_transid = 0;
+	root->log_transid_committed = -1;
 	root->last_log_commit = 0;
 	return 0;
 }
@@ -1498,6 +1528,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
 int btrfs_init_fs_root(struct btrfs_root *root)
 {
 	int ret;
+	struct btrfs_subvolume_writers *writers;
 
 	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
 	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1507,6 +1538,13 @@ int btrfs_init_fs_root(struct btrfs_root *root)
 		goto fail;
 	}
 
+	writers = btrfs_alloc_subvolume_writers();
+	if (IS_ERR(writers)) {
+		ret = PTR_ERR(writers);
+		goto fail;
+	}
+	root->subv_writers = writers;
+
 	btrfs_init_free_ino_ctl(root);
 	mutex_init(&root->fs_commit_mutex);
 	spin_lock_init(&root->cache_lock);
@@ -1514,8 +1552,11 @@ int btrfs_init_fs_root(struct btrfs_root *root)
 
 	ret = get_anon_bdev(&root->anon_dev);
 	if (ret)
-		goto fail;
+		goto free_writers;
 	return 0;
+
+free_writers:
+	btrfs_free_subvolume_writers(root->subv_writers);
 fail:
 	kfree(root->free_ino_ctl);
 	kfree(root->free_ino_pinned);
@@ -1990,23 +2031,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
 /* helper to cleanup workers */
 static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 {
-	btrfs_stop_workers(&fs_info->generic_worker);
-	btrfs_stop_workers(&fs_info->fixup_workers);
-	btrfs_stop_workers(&fs_info->delalloc_workers);
-	btrfs_stop_workers(&fs_info->workers);
-	btrfs_stop_workers(&fs_info->endio_workers);
-	btrfs_stop_workers(&fs_info->endio_meta_workers);
-	btrfs_stop_workers(&fs_info->endio_raid56_workers);
-	btrfs_stop_workers(&fs_info->rmw_workers);
-	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
-	btrfs_stop_workers(&fs_info->endio_write_workers);
-	btrfs_stop_workers(&fs_info->endio_freespace_worker);
-	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->delayed_workers);
-	btrfs_stop_workers(&fs_info->caching_workers);
-	btrfs_stop_workers(&fs_info->readahead_workers);
-	btrfs_stop_workers(&fs_info->flush_workers);
-	btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
+	btrfs_destroy_workqueue(fs_info->fixup_workers);
+	btrfs_destroy_workqueue(fs_info->delalloc_workers);
+	btrfs_destroy_workqueue(fs_info->workers);
+	btrfs_destroy_workqueue(fs_info->endio_workers);
+	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+	btrfs_destroy_workqueue(fs_info->rmw_workers);
+	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+	btrfs_destroy_workqueue(fs_info->endio_write_workers);
+	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
+	btrfs_destroy_workqueue(fs_info->submit_workers);
+	btrfs_destroy_workqueue(fs_info->delayed_workers);
+	btrfs_destroy_workqueue(fs_info->caching_workers);
+	btrfs_destroy_workqueue(fs_info->readahead_workers);
+	btrfs_destroy_workqueue(fs_info->flush_workers);
+	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
 }
 
 static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2097,6 +2137,8 @@ int open_ctree(struct super_block *sb,
 	int err = -EINVAL;
 	int num_backups_tried = 0;
 	int backup_index = 0;
+	int max_active;
+	int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
 	bool create_uuid_tree;
 	bool check_uuid_tree;
 
@@ -2133,10 +2175,16 @@ int open_ctree(struct super_block *sb,
 		goto fail_dirty_metadata_bytes;
 	}
 
+	ret = percpu_counter_init(&fs_info->bio_counter, 0);
+	if (ret) {
+		err = ret;
+		goto fail_delalloc_bytes;
+	}
+
 	fs_info->btree_inode = new_inode(sb);
 	if (!fs_info->btree_inode) {
 		err = -ENOMEM;
-		goto fail_delalloc_bytes;
+		goto fail_bio_counter;
 	}
 
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2159,6 +2207,7 @@ int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->buffer_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->reloc_mutex);
+	mutex_init(&fs_info->delalloc_root_mutex);
 	seqlock_init(&fs_info->profiles_lock);
 
 	init_completion(&fs_info->kobj_unregister);
@@ -2211,6 +2260,7 @@ int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->scrub_pause_req, 0);
 	atomic_set(&fs_info->scrubs_paused, 0);
 	atomic_set(&fs_info->scrub_cancel_req, 0);
+	init_waitqueue_head(&fs_info->replace_wait);
 	init_waitqueue_head(&fs_info->scrub_pause_wait);
 	fs_info->scrub_workers_refcnt = 0;
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2458,104 +2508,68 @@ int open_ctree(struct super_block *sb,
2458 goto fail_alloc; 2508 goto fail_alloc;
2459 } 2509 }
2460 2510
2461 btrfs_init_workers(&fs_info->generic_worker, 2511 max_active = fs_info->thread_pool_size;
2462 "genwork", 1, NULL);
2463
2464 btrfs_init_workers(&fs_info->workers, "worker",
2465 fs_info->thread_pool_size,
2466 &fs_info->generic_worker);
2467 2512
2468 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", 2513 fs_info->workers =
2469 fs_info->thread_pool_size, NULL); 2514 btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
2515 max_active, 16);
2470 2516
2471 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", 2517 fs_info->delalloc_workers =
2472 fs_info->thread_pool_size, NULL); 2518 btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2473 2519
2474 btrfs_init_workers(&fs_info->submit_workers, "submit", 2520 fs_info->flush_workers =
2475 min_t(u64, fs_devices->num_devices, 2521 btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2476 fs_info->thread_pool_size), NULL);
2477 2522
2478 btrfs_init_workers(&fs_info->caching_workers, "cache", 2523 fs_info->caching_workers =
2479 fs_info->thread_pool_size, NULL); 2524 btrfs_alloc_workqueue("cache", flags, max_active, 0);
2480 2525
2481 /* a higher idle thresh on the submit workers makes it much more 2526 /*
2527 * a higher idle thresh on the submit workers makes it much more
2482 * likely that bios will be send down in a sane order to the 2528 * likely that bios will be send down in a sane order to the
2483 * devices 2529 * devices
2484 */ 2530 */
2485 fs_info->submit_workers.idle_thresh = 64; 2531 fs_info->submit_workers =
2486 2532 btrfs_alloc_workqueue("submit", flags,
2487 fs_info->workers.idle_thresh = 16; 2533 min_t(u64, fs_devices->num_devices,
2488 fs_info->workers.ordered = 1; 2534 max_active), 64);
2489 2535
2490 fs_info->delalloc_workers.idle_thresh = 2; 2536 fs_info->fixup_workers =
2491 fs_info->delalloc_workers.ordered = 1; 2537 btrfs_alloc_workqueue("fixup", flags, 1, 0);
2492
2493 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
2494 &fs_info->generic_worker);
2495 btrfs_init_workers(&fs_info->endio_workers, "endio",
2496 fs_info->thread_pool_size,
2497 &fs_info->generic_worker);
2498 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
2499 fs_info->thread_pool_size,
2500 &fs_info->generic_worker);
2501 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2502 "endio-meta-write", fs_info->thread_pool_size,
2503 &fs_info->generic_worker);
2504 btrfs_init_workers(&fs_info->endio_raid56_workers,
2505 "endio-raid56", fs_info->thread_pool_size,
2506 &fs_info->generic_worker);
2507 btrfs_init_workers(&fs_info->rmw_workers,
2508 "rmw", fs_info->thread_pool_size,
2509 &fs_info->generic_worker);
2510 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2511 fs_info->thread_pool_size,
2512 &fs_info->generic_worker);
2513 btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
2514 1, &fs_info->generic_worker);
2515 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
2516 fs_info->thread_pool_size,
2517 &fs_info->generic_worker);
2518 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2519 fs_info->thread_pool_size,
2520 &fs_info->generic_worker);
2521 btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
2522 &fs_info->generic_worker);
2523 2538
2524 /* 2539 /*
2525 * endios are largely parallel and should have a very 2540 * endios are largely parallel and should have a very
2526 * low idle thresh 2541 * low idle thresh
2527 */ 2542 */
2528 fs_info->endio_workers.idle_thresh = 4; 2543 fs_info->endio_workers =
2529 fs_info->endio_meta_workers.idle_thresh = 4; 2544 btrfs_alloc_workqueue("endio", flags, max_active, 4);
2530 fs_info->endio_raid56_workers.idle_thresh = 4; 2545 fs_info->endio_meta_workers =
2531 fs_info->rmw_workers.idle_thresh = 2; 2546 btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2532 2547 fs_info->endio_meta_write_workers =
2533 fs_info->endio_write_workers.idle_thresh = 2; 2548 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2534 fs_info->endio_meta_write_workers.idle_thresh = 2; 2549 fs_info->endio_raid56_workers =
2535 fs_info->readahead_workers.idle_thresh = 2; 2550 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2536 2551 fs_info->rmw_workers =
2537 /* 2552 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2538 * btrfs_start_workers can really only fail because of ENOMEM so just 2553 fs_info->endio_write_workers =
2539 * return -ENOMEM if any of these fail. 2554 btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2540 */ 2555 fs_info->endio_freespace_worker =
2541 ret = btrfs_start_workers(&fs_info->workers); 2556 btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2542 ret |= btrfs_start_workers(&fs_info->generic_worker); 2557 fs_info->delayed_workers =
2543 ret |= btrfs_start_workers(&fs_info->submit_workers); 2558 btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2544 ret |= btrfs_start_workers(&fs_info->delalloc_workers); 2559 fs_info->readahead_workers =
2545 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2560 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2546 ret |= btrfs_start_workers(&fs_info->endio_workers); 2561 fs_info->qgroup_rescan_workers =
2547 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2562 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2548 ret |= btrfs_start_workers(&fs_info->rmw_workers); 2563
2549 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); 2564 if (!(fs_info->workers && fs_info->delalloc_workers &&
2550 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2565 fs_info->submit_workers && fs_info->flush_workers &&
2551 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2566 fs_info->endio_workers && fs_info->endio_meta_workers &&
2552 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2567 fs_info->endio_meta_write_workers &&
2553 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2568 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2554 ret |= btrfs_start_workers(&fs_info->caching_workers); 2569 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2555 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2570 fs_info->caching_workers && fs_info->readahead_workers &&
2556 ret |= btrfs_start_workers(&fs_info->flush_workers); 2571 fs_info->fixup_workers && fs_info->delayed_workers &&
2557 ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); 2572 fs_info->qgroup_rescan_workers)) {
2558 if (ret) {
2559 err = -ENOMEM; 2573 err = -ENOMEM;
2560 goto fail_sb_buffer; 2574 goto fail_sb_buffer;
2561 } 2575 }
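A note on the conversion above: btrfs_alloc_workqueue() takes the worker name, allocation flags, a max_active limit and the old idle/ordered thresh, and returns NULL on failure, so the long chain of btrfs_start_workers() calls collapses into a single NULL check. A minimal sketch of the new pattern, using a hypothetical worker name:

	fs_info->example_workers =
		btrfs_alloc_workqueue("example", flags, max_active, 4);
	if (!fs_info->example_workers) {
		/* allocation is the only failure mode, as before */
		err = -ENOMEM;
		goto fail_sb_buffer;
	}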
@@ -2963,6 +2977,8 @@ fail_iput:
2963 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2977 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2964 2978
2965 iput(fs_info->btree_inode); 2979 iput(fs_info->btree_inode);
2980fail_bio_counter:
2981 percpu_counter_destroy(&fs_info->bio_counter);
2966fail_delalloc_bytes: 2982fail_delalloc_bytes:
2967 percpu_counter_destroy(&fs_info->delalloc_bytes); 2983 percpu_counter_destroy(&fs_info->delalloc_bytes);
2968fail_dirty_metadata_bytes: 2984fail_dirty_metadata_bytes:
@@ -3244,6 +3260,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3244 /* send down all the barriers */ 3260 /* send down all the barriers */
3245 head = &info->fs_devices->devices; 3261 head = &info->fs_devices->devices;
3246 list_for_each_entry_rcu(dev, head, dev_list) { 3262 list_for_each_entry_rcu(dev, head, dev_list) {
3263 if (dev->missing)
3264 continue;
3247 if (!dev->bdev) { 3265 if (!dev->bdev) {
3248 errors_send++; 3266 errors_send++;
3249 continue; 3267 continue;
@@ -3258,6 +3276,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3258 3276
3259 /* wait for all the barriers */ 3277 /* wait for all the barriers */
3260 list_for_each_entry_rcu(dev, head, dev_list) { 3278 list_for_each_entry_rcu(dev, head, dev_list) {
3279 if (dev->missing)
3280 continue;
3261 if (!dev->bdev) { 3281 if (!dev->bdev) {
3262 errors_wait++; 3282 errors_wait++;
3263 continue; 3283 continue;
@@ -3477,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root)
3477 root->orphan_block_rsv = NULL; 3497 root->orphan_block_rsv = NULL;
3478 if (root->anon_dev) 3498 if (root->anon_dev)
3479 free_anon_bdev(root->anon_dev); 3499 free_anon_bdev(root->anon_dev);
3500 if (root->subv_writers)
3501 btrfs_free_subvolume_writers(root->subv_writers);
3480 free_extent_buffer(root->node); 3502 free_extent_buffer(root->node);
3481 free_extent_buffer(root->commit_root); 3503 free_extent_buffer(root->commit_root);
3482 kfree(root->free_ino_ctl); 3504 kfree(root->free_ino_ctl);
@@ -3610,6 +3632,7 @@ int close_ctree(struct btrfs_root *root)
3610 3632
3611 percpu_counter_destroy(&fs_info->dirty_metadata_bytes); 3633 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3612 percpu_counter_destroy(&fs_info->delalloc_bytes); 3634 percpu_counter_destroy(&fs_info->delalloc_bytes);
3635 percpu_counter_destroy(&fs_info->bio_counter);
3613 bdi_destroy(&fs_info->bdi); 3636 bdi_destroy(&fs_info->bdi);
3614 cleanup_srcu_struct(&fs_info->subvol_srcu); 3637 cleanup_srcu_struct(&fs_info->subvol_srcu);
3615 3638
@@ -3791,9 +3814,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3791 list_move_tail(&root->ordered_root, 3814 list_move_tail(&root->ordered_root,
3792 &fs_info->ordered_roots); 3815 &fs_info->ordered_roots);
3793 3816
3817 spin_unlock(&fs_info->ordered_root_lock);
3794 btrfs_destroy_ordered_extents(root); 3818 btrfs_destroy_ordered_extents(root);
3795 3819
3796 cond_resched_lock(&fs_info->ordered_root_lock); 3820 cond_resched();
3821 spin_lock(&fs_info->ordered_root_lock);
3797 } 3822 }
3798 spin_unlock(&fs_info->ordered_root_lock); 3823 spin_unlock(&fs_info->ordered_root_lock);
3799} 3824}
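cond_resched_lock() only helps when the body of the loop cannot sleep; since btrfs_destroy_ordered_extents() can block, the hunk above drops the spinlock across the call and retakes it afterwards. The resulting loop shape, as a sketch:

	spin_lock(&fs_info->ordered_root_lock);
	while (!list_empty(&fs_info->ordered_roots)) {
		struct btrfs_root *root;

		root = list_first_entry(&fs_info->ordered_roots,
					struct btrfs_root, ordered_root);
		list_move_tail(&root->ordered_root, &fs_info->ordered_roots);

		spin_unlock(&fs_info->ordered_root_lock);
		btrfs_destroy_ordered_extents(root);	/* may block */
		cond_resched();
		spin_lock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&fs_info->ordered_root_lock);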
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32312e09f0f5..c6b6a6e3e735 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
549 caching_ctl->block_group = cache; 549 caching_ctl->block_group = cache;
550 caching_ctl->progress = cache->key.objectid; 550 caching_ctl->progress = cache->key.objectid;
551 atomic_set(&caching_ctl->count, 1); 551 atomic_set(&caching_ctl->count, 1);
552 caching_ctl->work.func = caching_thread; 552 btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
553 553
554 spin_lock(&cache->lock); 554 spin_lock(&cache->lock);
555 /* 555 /*
@@ -640,7 +640,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
640 640
641 btrfs_get_block_group(cache); 641 btrfs_get_block_group(cache);
642 642
643 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); 643 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
644 644
645 return ret; 645 return ret;
646} 646}
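The caching hunks above show the shape of the new work API: btrfs_init_work() records the main function plus optional ordered-completion and ordered-free callbacks (both NULL here), and btrfs_queue_work() replaces btrfs_queue_worker(), taking a struct btrfs_workqueue pointer rather than the old embedded btrfs_workers. Sketched in isolation:

	/* main func, ordered_func, ordered_free; the latter two may be NULL */
	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);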
@@ -3971,7 +3971,7 @@ static int can_overcommit(struct btrfs_root *root,
3971} 3971}
3972 3972
3973static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 3973static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3974 unsigned long nr_pages) 3974 unsigned long nr_pages, int nr_items)
3975{ 3975{
3976 struct super_block *sb = root->fs_info->sb; 3976 struct super_block *sb = root->fs_info->sb;
3977 3977
@@ -3986,9 +3986,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 3986 * the filesystem is readonly (all dirty pages are written to 3986 * the filesystem is readonly (all dirty pages are written to
3987 * the disk). 3987 * the disk).
3988 */ 3988 */
3989 btrfs_start_delalloc_roots(root->fs_info, 0); 3989 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
3990 if (!current->journal_info) 3990 if (!current->journal_info)
3991 btrfs_wait_ordered_roots(root->fs_info, -1); 3991 btrfs_wait_ordered_roots(root->fs_info, nr_items);
3992 } 3992 }
3993} 3993}
3994 3994
@@ -4045,7 +4045,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4045 while (delalloc_bytes && loops < 3) { 4045 while (delalloc_bytes && loops < 3) {
4046 max_reclaim = min(delalloc_bytes, to_reclaim); 4046 max_reclaim = min(delalloc_bytes, to_reclaim);
4047 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4047 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4048 btrfs_writeback_inodes_sb_nr(root, nr_pages); 4048 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4049 /* 4049 /*
4050 * We need to wait for the async pages to actually start before 4050 * We need to wait for the async pages to actually start before
4051 * we do anything. 4051 * we do anything.
@@ -4112,13 +4112,9 @@ static int may_commit_transaction(struct btrfs_root *root,
4112 goto commit; 4112 goto commit;
4113 4113
4114 /* See if there is enough pinned space to make this reservation */ 4114 /* See if there is enough pinned space to make this reservation */
4115 spin_lock(&space_info->lock);
4116 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4115 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4117 bytes) >= 0) { 4116 bytes) >= 0)
4118 spin_unlock(&space_info->lock);
4119 goto commit; 4117 goto commit;
4120 }
4121 spin_unlock(&space_info->lock);
4122 4118
4123 /* 4119 /*
4124 * See if there is some space in the delayed insertion reservation for 4120 * See if there is some space in the delayed insertion reservation for
@@ -4127,16 +4123,13 @@ static int may_commit_transaction(struct btrfs_root *root,
4127 if (space_info != delayed_rsv->space_info) 4123 if (space_info != delayed_rsv->space_info)
4128 return -ENOSPC; 4124 return -ENOSPC;
4129 4125
4130 spin_lock(&space_info->lock);
4131 spin_lock(&delayed_rsv->lock); 4126 spin_lock(&delayed_rsv->lock);
4132 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4127 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4133 bytes - delayed_rsv->size) >= 0) { 4128 bytes - delayed_rsv->size) >= 0) {
4134 spin_unlock(&delayed_rsv->lock); 4129 spin_unlock(&delayed_rsv->lock);
4135 spin_unlock(&space_info->lock);
4136 return -ENOSPC; 4130 return -ENOSPC;
4137 } 4131 }
4138 spin_unlock(&delayed_rsv->lock); 4132 spin_unlock(&delayed_rsv->lock);
4139 spin_unlock(&space_info->lock);
4140 4133
4141commit: 4134commit:
4142 trans = btrfs_join_transaction(root); 4135 trans = btrfs_join_transaction(root);
@@ -4181,7 +4174,7 @@ static int flush_space(struct btrfs_root *root,
4181 break; 4174 break;
4182 case FLUSH_DELALLOC: 4175 case FLUSH_DELALLOC:
4183 case FLUSH_DELALLOC_WAIT: 4176 case FLUSH_DELALLOC_WAIT:
4184 shrink_delalloc(root, num_bytes, orig_bytes, 4177 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4185 state == FLUSH_DELALLOC_WAIT); 4178 state == FLUSH_DELALLOC_WAIT);
4186 break; 4179 break;
4187 case ALLOC_CHUNK: 4180 case ALLOC_CHUNK:
@@ -8938,3 +8931,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8938 range->len = trimmed; 8931 range->len = trimmed;
8939 return ret; 8932 return ret;
8940} 8933}
8934
8935/*
 8936 * btrfs_{start,end}_nocow_write() are similar to mnt_{want,drop}_write():
 8937 * they are used to prevent tasks from writing data into the page cache
 8938 * via nocow before the subvolume is snapshotted and then flushing it to
 8939 * disk only after the snapshot has been created.
8940 */
8941void btrfs_end_nocow_write(struct btrfs_root *root)
8942{
8943 percpu_counter_dec(&root->subv_writers->counter);
8944 /*
8945 * Make sure counter is updated before we wake up
8946 * waiters.
8947 */
8948 smp_mb();
8949 if (waitqueue_active(&root->subv_writers->wait))
8950 wake_up(&root->subv_writers->wait);
8951}
8952
8953int btrfs_start_nocow_write(struct btrfs_root *root)
8954{
8955 if (unlikely(atomic_read(&root->will_be_snapshoted)))
8956 return 0;
8957
8958 percpu_counter_inc(&root->subv_writers->counter);
8959 /*
8960 * Make sure counter is updated before we check for snapshot creation.
8961 */
8962 smp_mb();
8963 if (unlikely(atomic_read(&root->will_be_snapshoted))) {
8964 btrfs_end_nocow_write(root);
8965 return 0;
8966 }
8967 return 1;
8968}
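A hedged sketch of the intended caller pairing (do_nocow_write() is a hypothetical stand-in; check_can_nocow() in the file.c hunks below is the real user): the writer brackets its page-cache updates so a concurrent snapshot creation can fence new nocow writers and flush the pending ones first.

	if (!btrfs_start_nocow_write(root))
		return -ENOSPC;		/* snapshot pending; fall back to COW */

	do_nocow_write(inode, pos, len);	/* hypothetical nocow write */

	btrfs_end_nocow_write(root);	/* wakes a waiting snapshot creator */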
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f1271..ae69a00387e7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state)
229 } 229 }
230} 230}
231 231
232static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 232static struct rb_node *tree_insert(struct rb_root *root,
233 struct rb_node *search_start,
234 u64 offset,
233 struct rb_node *node, 235 struct rb_node *node,
234 struct rb_node ***p_in, 236 struct rb_node ***p_in,
235 struct rb_node **parent_in) 237 struct rb_node **parent_in)
236{ 238{
237 struct rb_node **p = &root->rb_node; 239 struct rb_node **p;
238 struct rb_node *parent = NULL; 240 struct rb_node *parent = NULL;
239 struct tree_entry *entry; 241 struct tree_entry *entry;
240 242
@@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
244 goto do_insert; 246 goto do_insert;
245 } 247 }
246 248
249 p = search_start ? &search_start : &root->rb_node;
247 while (*p) { 250 while (*p) {
248 parent = *p; 251 parent = *p;
249 entry = rb_entry(parent, struct tree_entry, rb_node); 252 entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree,
430 433
431 set_state_bits(tree, state, bits); 434 set_state_bits(tree, state, bits);
432 435
433 node = tree_insert(&tree->state, end, &state->rb_node, p, parent); 436 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
434 if (node) { 437 if (node) {
435 struct extent_state *found; 438 struct extent_state *found;
436 found = rb_entry(node, struct extent_state, rb_node); 439 found = rb_entry(node, struct extent_state, rb_node);
@@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
477 prealloc->state = orig->state; 480 prealloc->state = orig->state;
478 orig->start = split; 481 orig->start = split;
479 482
480 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, 483 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
481 NULL, NULL); 484 &prealloc->rb_node, NULL, NULL);
482 if (node) { 485 if (node) {
483 free_extent_state(prealloc); 486 free_extent_state(prealloc);
484 return -EEXIST; 487 return -EEXIST;
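split_state() can seed the descent with &orig->rb_node because the prealloc entry is known to land immediately next to the state being split, so the insertion walk can skip descending from the root. The hint logic in miniature, matching the tree_insert() hunk above (types as in extent_io.c):

	struct rb_node **p;

	/* begin at a caller-supplied neighbour if one is known */
	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);
		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;	/* caller turns this into -EEXIST */
	}
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);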
@@ -2757,7 +2760,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
2757 2760
2758 if (em_cached && *em_cached) { 2761 if (em_cached && *em_cached) {
2759 em = *em_cached; 2762 em = *em_cached;
2760 if (em->in_tree && start >= em->start && 2763 if (extent_map_in_tree(em) && start >= em->start &&
2761 start < extent_map_end(em)) { 2764 start < extent_map_end(em)) {
2762 atomic_inc(&em->refs); 2765 atomic_inc(&em->refs);
2763 return em; 2766 return em;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57db..1874aee69c86 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
51 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); 51 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
52 if (!em) 52 if (!em)
53 return NULL; 53 return NULL;
54 em->in_tree = 0; 54 RB_CLEAR_NODE(&em->rb_node);
55 em->flags = 0; 55 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE; 56 em->compress_type = BTRFS_COMPRESS_NONE;
57 em->generation = 0; 57 em->generation = 0;
@@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em)
73 return; 73 return;
74 WARN_ON(atomic_read(&em->refs) == 0); 74 WARN_ON(atomic_read(&em->refs) == 0);
75 if (atomic_dec_and_test(&em->refs)) { 75 if (atomic_dec_and_test(&em->refs)) {
76 WARN_ON(em->in_tree); 76 WARN_ON(extent_map_in_tree(em));
77 WARN_ON(!list_empty(&em->list)); 77 WARN_ON(!list_empty(&em->list));
78 kmem_cache_free(extent_map_cache, em); 78 kmem_cache_free(extent_map_cache, em);
79 } 79 }
@@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
99 parent = *p; 99 parent = *p;
100 entry = rb_entry(parent, struct extent_map, rb_node); 100 entry = rb_entry(parent, struct extent_map, rb_node);
101 101
102 WARN_ON(!entry->in_tree);
103
104 if (em->start < entry->start) 102 if (em->start < entry->start)
105 p = &(*p)->rb_left; 103 p = &(*p)->rb_left;
106 else if (em->start >= extent_map_end(entry)) 104 else if (em->start >= extent_map_end(entry))
@@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
128 if (end > entry->start && em->start < extent_map_end(entry)) 126 if (end > entry->start && em->start < extent_map_end(entry))
129 return -EEXIST; 127 return -EEXIST;
130 128
131 em->in_tree = 1;
132 rb_link_node(&em->rb_node, orig_parent, p); 129 rb_link_node(&em->rb_node, orig_parent, p);
133 rb_insert_color(&em->rb_node, root); 130 rb_insert_color(&em->rb_node, root);
134 return 0; 131 return 0;
@@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
153 prev = n; 150 prev = n;
154 prev_entry = entry; 151 prev_entry = entry;
155 152
156 WARN_ON(!entry->in_tree);
157
158 if (offset < entry->start) 153 if (offset < entry->start)
159 n = n->rb_left; 154 n = n->rb_left;
160 else if (offset >= extent_map_end(entry)) 155 else if (offset >= extent_map_end(entry))
@@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
240 em->len += merge->len; 235 em->len += merge->len;
241 em->block_len += merge->block_len; 236 em->block_len += merge->block_len;
242 em->block_start = merge->block_start; 237 em->block_start = merge->block_start;
243 merge->in_tree = 0;
244 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; 238 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
245 em->mod_start = merge->mod_start; 239 em->mod_start = merge->mod_start;
246 em->generation = max(em->generation, merge->generation); 240 em->generation = max(em->generation, merge->generation);
247 241
248 rb_erase(&merge->rb_node, &tree->map); 242 rb_erase(&merge->rb_node, &tree->map);
243 RB_CLEAR_NODE(&merge->rb_node);
249 free_extent_map(merge); 244 free_extent_map(merge);
250 } 245 }
251 } 246 }
@@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
257 em->len += merge->len; 252 em->len += merge->len;
258 em->block_len += merge->block_len; 253 em->block_len += merge->block_len;
259 rb_erase(&merge->rb_node, &tree->map); 254 rb_erase(&merge->rb_node, &tree->map);
260 merge->in_tree = 0; 255 RB_CLEAR_NODE(&merge->rb_node);
261 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; 256 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
262 em->generation = max(em->generation, merge->generation); 257 em->generation = max(em->generation, merge->generation);
263 free_extent_map(merge); 258 free_extent_map(merge);
@@ -319,7 +314,21 @@ out:
319void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) 314void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
320{ 315{
321 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 316 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
322 if (em->in_tree) 317 if (extent_map_in_tree(em))
318 try_merge_map(tree, em);
319}
320
321static inline void setup_extent_mapping(struct extent_map_tree *tree,
322 struct extent_map *em,
323 int modified)
324{
325 atomic_inc(&em->refs);
326 em->mod_start = em->start;
327 em->mod_len = em->len;
328
329 if (modified)
330 list_move(&em->list, &tree->modified_extents);
331 else
323 try_merge_map(tree, em); 332 try_merge_map(tree, em);
324} 333}
325 334
@@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
342 if (ret) 351 if (ret)
343 goto out; 352 goto out;
344 353
345 atomic_inc(&em->refs); 354 setup_extent_mapping(tree, em, modified);
346
347 em->mod_start = em->start;
348 em->mod_len = em->len;
349
350 if (modified)
351 list_move(&em->list, &tree->modified_extents);
352 else
353 try_merge_map(tree, em);
354out: 355out:
355 return ret; 356 return ret;
356} 357}
@@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
434 rb_erase(&em->rb_node, &tree->map); 435 rb_erase(&em->rb_node, &tree->map);
435 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) 436 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
436 list_del_init(&em->list); 437 list_del_init(&em->list);
437 em->in_tree = 0; 438 RB_CLEAR_NODE(&em->rb_node);
438 return ret; 439 return ret;
439} 440}
441
442void replace_extent_mapping(struct extent_map_tree *tree,
443 struct extent_map *cur,
444 struct extent_map *new,
445 int modified)
446{
447 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
448 ASSERT(extent_map_in_tree(cur));
449 if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
450 list_del_init(&cur->list);
451 rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
452 RB_CLEAR_NODE(&cur->rb_node);
453
454 setup_extent_mapping(tree, new, modified);
455}
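replace_extent_mapping() can use rb_replace_node() because the replacement occupies the same key-order slot as the node it displaces, so no rebalancing or fresh descent is needed; the PINNED warning and the in-tree assert guard that assumption. A hedged usage sketch, mirroring btrfs_drop_extent_cache() in the file.c hunks below:

	/* em is in the tree; split covers the same ordering position */
	replace_extent_mapping(em_tree, em, split, modified);
	free_extent_map(split);	/* drop the local ref; the tree holds one */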
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 93fba716d7f8..e7fd8a56a140 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -33,7 +33,6 @@ struct extent_map {
33 unsigned long flags; 33 unsigned long flags;
34 struct block_device *bdev; 34 struct block_device *bdev;
35 atomic_t refs; 35 atomic_t refs;
36 unsigned int in_tree;
37 unsigned int compress_type; 36 unsigned int compress_type;
38 struct list_head list; 37 struct list_head list;
39}; 38};
@@ -44,6 +43,11 @@ struct extent_map_tree {
44 rwlock_t lock; 43 rwlock_t lock;
45}; 44};
46 45
46static inline int extent_map_in_tree(const struct extent_map *em)
47{
48 return !RB_EMPTY_NODE(&em->rb_node);
49}
50
47static inline u64 extent_map_end(struct extent_map *em) 51static inline u64 extent_map_end(struct extent_map *em)
48{ 52{
49 if (em->start + em->len < em->start) 53 if (em->start + em->len < em->start)
@@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
64int add_extent_mapping(struct extent_map_tree *tree, 68int add_extent_mapping(struct extent_map_tree *tree,
65 struct extent_map *em, int modified); 69 struct extent_map *em, int modified);
66int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); 70int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
71void replace_extent_mapping(struct extent_map_tree *tree,
72 struct extent_map *cur,
73 struct extent_map *new,
74 int modified);
67 75
68struct extent_map *alloc_extent_map(void); 76struct extent_map *alloc_extent_map(void);
69void free_extent_map(struct extent_map *em); 77void free_extent_map(struct extent_map *em);
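With in_tree gone, membership is carried by the rb_node itself: RB_CLEAR_NODE() self-parents the node, and RB_EMPTY_NODE() (via extent_map_in_tree()) tests exactly that. The lifecycle the patch relies on, sketched with locking elided:

	struct extent_map *em = alloc_extent_map();	/* RB_CLEAR_NODE done */

	ASSERT(!extent_map_in_tree(em));
	add_extent_mapping(tree, em, 0);	/* rb_link_node + rb_insert_color */
	ASSERT(extent_map_in_tree(em));
	remove_extent_mapping(tree, em);	/* rb_erase + RB_CLEAR_NODE */
	ASSERT(!extent_map_in_tree(em));
	free_extent_map(em);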
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0165b8672f09..e1ffb1e22898 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -591,7 +591,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
591 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 591 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
592 clear_bit(EXTENT_FLAG_LOGGING, &flags); 592 clear_bit(EXTENT_FLAG_LOGGING, &flags);
593 modified = !list_empty(&em->list); 593 modified = !list_empty(&em->list);
594 remove_extent_mapping(em_tree, em);
595 if (no_splits) 594 if (no_splits)
596 goto next; 595 goto next;
597 596
@@ -622,8 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
622 split->bdev = em->bdev; 621 split->bdev = em->bdev;
623 split->flags = flags; 622 split->flags = flags;
624 split->compress_type = em->compress_type; 623 split->compress_type = em->compress_type;
625 ret = add_extent_mapping(em_tree, split, modified); 624 replace_extent_mapping(em_tree, em, split, modified);
626 BUG_ON(ret); /* Logic error */
627 free_extent_map(split); 625 free_extent_map(split);
628 split = split2; 626 split = split2;
629 split2 = NULL; 627 split2 = NULL;
@@ -661,12 +659,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
661 split->orig_block_len = 0; 659 split->orig_block_len = 0;
662 } 660 }
663 661
664 ret = add_extent_mapping(em_tree, split, modified); 662 if (extent_map_in_tree(em)) {
665 BUG_ON(ret); /* Logic error */ 663 replace_extent_mapping(em_tree, em, split,
664 modified);
665 } else {
666 ret = add_extent_mapping(em_tree, split,
667 modified);
668 ASSERT(ret == 0); /* Logic error */
669 }
666 free_extent_map(split); 670 free_extent_map(split);
667 split = NULL; 671 split = NULL;
668 } 672 }
669next: 673next:
674 if (extent_map_in_tree(em))
675 remove_extent_mapping(em_tree, em);
670 write_unlock(&em_tree->lock); 676 write_unlock(&em_tree->lock);
671 677
672 /* once for us */ 678 /* once for us */
@@ -720,7 +726,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
720 if (drop_cache) 726 if (drop_cache)
721 btrfs_drop_extent_cache(inode, start, end - 1, 0); 727 btrfs_drop_extent_cache(inode, start, end - 1, 0);
722 728
723 if (start >= BTRFS_I(inode)->disk_i_size) 729 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
724 modify_tree = 0; 730 modify_tree = 0;
725 731
726 while (1) { 732 while (1) {
@@ -798,7 +804,10 @@ next_slot:
798 */ 804 */
799 if (start > key.offset && end < extent_end) { 805 if (start > key.offset && end < extent_end) {
800 BUG_ON(del_nr > 0); 806 BUG_ON(del_nr > 0);
801 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 807 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
808 ret = -EINVAL;
809 break;
810 }
802 811
803 memcpy(&new_key, &key, sizeof(new_key)); 812 memcpy(&new_key, &key, sizeof(new_key));
804 new_key.offset = start; 813 new_key.offset = start;
@@ -841,7 +850,10 @@ next_slot:
841 * | -------- extent -------- | 850 * | -------- extent -------- |
842 */ 851 */
843 if (start <= key.offset && end < extent_end) { 852 if (start <= key.offset && end < extent_end) {
844 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 853 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
854 ret = -EINVAL;
855 break;
856 }
845 857
846 memcpy(&new_key, &key, sizeof(new_key)); 858 memcpy(&new_key, &key, sizeof(new_key));
847 new_key.offset = end; 859 new_key.offset = end;
@@ -864,7 +876,10 @@ next_slot:
864 */ 876 */
865 if (start > key.offset && end >= extent_end) { 877 if (start > key.offset && end >= extent_end) {
866 BUG_ON(del_nr > 0); 878 BUG_ON(del_nr > 0);
867 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 879 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
880 ret = -EINVAL;
881 break;
882 }
868 883
869 btrfs_set_file_extent_num_bytes(leaf, fi, 884 btrfs_set_file_extent_num_bytes(leaf, fi,
870 start - key.offset); 885 start - key.offset);
@@ -938,34 +953,42 @@ next_slot:
 938 * Set path->slots[0] to the first slot, so that after the delete, 953 * Set path->slots[0] to the first slot, so that after the delete,
 939 * if items are moved off from our leaf to its immediate left or 954 * if items are moved off from our leaf to its immediate left or
 940 * right neighbor leaves, we end up with a correct and adjusted 955 * right neighbor leaves, we end up with a correct and adjusted
941 * path->slots[0] for our insertion. 956 * path->slots[0] for our insertion (if replace_extent != 0).
942 */ 957 */
943 path->slots[0] = del_slot; 958 path->slots[0] = del_slot;
944 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 959 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
945 if (ret) 960 if (ret)
946 btrfs_abort_transaction(trans, root, ret); 961 btrfs_abort_transaction(trans, root, ret);
962 }
947 963
948 leaf = path->nodes[0]; 964 leaf = path->nodes[0];
949 /* 965 /*
950 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that 966 * If btrfs_del_items() was called, it might have deleted a leaf, in
951 * is, its contents got pushed to its neighbors), in which case 967 * which case it unlocked our path, so check path->locks[0] matches a
952 * it means path->locks[0] == 0 968 * write lock.
953 */ 969 */
954 if (!ret && replace_extent && leafs_visited == 1 && 970 if (!ret && replace_extent && leafs_visited == 1 &&
955 path->locks[0] && 971 (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
956 btrfs_leaf_free_space(root, leaf) >= 972 path->locks[0] == BTRFS_WRITE_LOCK) &&
957 sizeof(struct btrfs_item) + extent_item_size) { 973 btrfs_leaf_free_space(root, leaf) >=
958 974 sizeof(struct btrfs_item) + extent_item_size) {
959 key.objectid = ino; 975
960 key.type = BTRFS_EXTENT_DATA_KEY; 976 key.objectid = ino;
961 key.offset = start; 977 key.type = BTRFS_EXTENT_DATA_KEY;
962 setup_items_for_insert(root, path, &key, 978 key.offset = start;
963 &extent_item_size, 979 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
964 extent_item_size, 980 struct btrfs_key slot_key;
965 sizeof(struct btrfs_item) + 981
966 extent_item_size, 1); 982 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
967 *key_inserted = 1; 983 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
984 path->slots[0]++;
968 } 985 }
986 setup_items_for_insert(root, path, &key,
987 &extent_item_size,
988 extent_item_size,
989 sizeof(struct btrfs_item) +
990 extent_item_size, 1);
991 *key_inserted = 1;
969 } 992 }
970 993
971 if (!replace_extent || !(*key_inserted)) 994 if (!replace_extent || !(*key_inserted))
@@ -1346,11 +1369,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1346 struct btrfs_ordered_extent *ordered; 1369 struct btrfs_ordered_extent *ordered;
1347 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1370 lock_extent_bits(&BTRFS_I(inode)->io_tree,
1348 start_pos, last_pos, 0, cached_state); 1371 start_pos, last_pos, 0, cached_state);
1349 ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); 1372 ordered = btrfs_lookup_ordered_range(inode, start_pos,
1373 last_pos - start_pos + 1);
1350 if (ordered && 1374 if (ordered &&
1351 ordered->file_offset + ordered->len > start_pos && 1375 ordered->file_offset + ordered->len > start_pos &&
1352 ordered->file_offset <= last_pos) { 1376 ordered->file_offset <= last_pos) {
1353 btrfs_put_ordered_extent(ordered);
1354 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1377 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1355 start_pos, last_pos, 1378 start_pos, last_pos,
1356 cached_state, GFP_NOFS); 1379 cached_state, GFP_NOFS);
@@ -1358,12 +1381,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1358 unlock_page(pages[i]); 1381 unlock_page(pages[i]);
1359 page_cache_release(pages[i]); 1382 page_cache_release(pages[i]);
1360 } 1383 }
1361 ret = btrfs_wait_ordered_range(inode, start_pos, 1384 btrfs_start_ordered_extent(inode, ordered, 1);
1362 last_pos - start_pos + 1); 1385 btrfs_put_ordered_extent(ordered);
1363 if (ret) 1386 return -EAGAIN;
1364 return ret;
1365 else
1366 return -EAGAIN;
1367 } 1387 }
1368 if (ordered) 1388 if (ordered)
1369 btrfs_put_ordered_extent(ordered); 1389 btrfs_put_ordered_extent(ordered);
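Because the ordered extent is merely started here rather than waited to completion, callers must treat -EAGAIN as a retry and re-prepare their pages before re-locking the range. A simplified sketch of the expected caller loop (parameter list abbreviated from __btrfs_buffered_write):

again:
	ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, pos,
					      &lockstart, &lockend,
					      &cached_state);
	if (ret == -EAGAIN)
		goto again;	/* ordered extent started; pages were released */
	else if (ret < 0)
		return ret;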
@@ -1396,8 +1416,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1396 u64 num_bytes; 1416 u64 num_bytes;
1397 int ret; 1417 int ret;
1398 1418
1419 ret = btrfs_start_nocow_write(root);
1420 if (!ret)
1421 return -ENOSPC;
1422
1399 lockstart = round_down(pos, root->sectorsize); 1423 lockstart = round_down(pos, root->sectorsize);
1400 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; 1424 lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
1401 1425
1402 while (1) { 1426 while (1) {
1403 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1427 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1415,12 +1439,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1415 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); 1439 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1416 if (ret <= 0) { 1440 if (ret <= 0) {
1417 ret = 0; 1441 ret = 0;
1442 btrfs_end_nocow_write(root);
1418 } else { 1443 } else {
 1419 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 1444 *write_bytes = min_t(size_t, *write_bytes,
1420 EXTENT_DIRTY | EXTENT_DELALLOC | 1445 num_bytes - pos + lockstart);
1421 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1422 NULL, GFP_NOFS);
1423 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1424 } 1446 }
1425 1447
1426 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1448 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1510,6 +1532,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1510 if (!only_release_metadata) 1532 if (!only_release_metadata)
1511 btrfs_free_reserved_data_space(inode, 1533 btrfs_free_reserved_data_space(inode,
1512 reserve_bytes); 1534 reserve_bytes);
1535 else
1536 btrfs_end_nocow_write(root);
1513 break; 1537 break;
1514 } 1538 }
1515 1539
@@ -1598,6 +1622,9 @@ again:
1598 } 1622 }
1599 1623
1600 release_bytes = 0; 1624 release_bytes = 0;
1625 if (only_release_metadata)
1626 btrfs_end_nocow_write(root);
1627
1601 if (only_release_metadata && copied > 0) { 1628 if (only_release_metadata && copied > 0) {
1602 u64 lockstart = round_down(pos, root->sectorsize); 1629 u64 lockstart = round_down(pos, root->sectorsize);
1603 u64 lockend = lockstart + 1630 u64 lockend = lockstart +
@@ -1624,10 +1651,12 @@ again:
1624 kfree(pages); 1651 kfree(pages);
1625 1652
1626 if (release_bytes) { 1653 if (release_bytes) {
1627 if (only_release_metadata) 1654 if (only_release_metadata) {
1655 btrfs_end_nocow_write(root);
1628 btrfs_delalloc_release_metadata(inode, release_bytes); 1656 btrfs_delalloc_release_metadata(inode, release_bytes);
1629 else 1657 } else {
1630 btrfs_delalloc_release_space(inode, release_bytes); 1658 btrfs_delalloc_release_space(inode, release_bytes);
1659 }
1631 } 1660 }
1632 1661
1633 return num_written ? num_written : ret; 1662 return num_written ? num_written : ret;
@@ -1797,7 +1826,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1797 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1826 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1798 if (num_written > 0) { 1827 if (num_written > 0) {
1799 err = generic_write_sync(file, pos, num_written); 1828 err = generic_write_sync(file, pos, num_written);
1800 if (err < 0 && num_written > 0) 1829 if (err < 0)
1801 num_written = err; 1830 num_written = err;
1802 } 1831 }
1803 1832
@@ -1856,8 +1885,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1856 struct dentry *dentry = file->f_path.dentry; 1885 struct dentry *dentry = file->f_path.dentry;
1857 struct inode *inode = dentry->d_inode; 1886 struct inode *inode = dentry->d_inode;
1858 struct btrfs_root *root = BTRFS_I(inode)->root; 1887 struct btrfs_root *root = BTRFS_I(inode)->root;
1859 int ret = 0;
1860 struct btrfs_trans_handle *trans; 1888 struct btrfs_trans_handle *trans;
1889 struct btrfs_log_ctx ctx;
1890 int ret = 0;
1861 bool full_sync = 0; 1891 bool full_sync = 0;
1862 1892
1863 trace_btrfs_sync_file(file, datasync); 1893 trace_btrfs_sync_file(file, datasync);
@@ -1951,7 +1981,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1951 } 1981 }
1952 trans->sync = true; 1982 trans->sync = true;
1953 1983
1954 ret = btrfs_log_dentry_safe(trans, root, dentry); 1984 btrfs_init_log_ctx(&ctx);
1985
1986 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
1955 if (ret < 0) { 1987 if (ret < 0) {
1956 /* Fallthrough and commit/free transaction. */ 1988 /* Fallthrough and commit/free transaction. */
1957 ret = 1; 1989 ret = 1;
@@ -1971,7 +2003,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1971 2003
1972 if (ret != BTRFS_NO_LOG_SYNC) { 2004 if (ret != BTRFS_NO_LOG_SYNC) {
1973 if (!ret) { 2005 if (!ret) {
1974 ret = btrfs_sync_log(trans, root); 2006 ret = btrfs_sync_log(trans, root, &ctx);
1975 if (!ret) { 2007 if (!ret) {
1976 ret = btrfs_end_transaction(trans, root); 2008 ret = btrfs_end_transaction(trans, root);
1977 goto out; 2009 goto out;
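btrfs_log_ctx lets concurrent fsync() callers attach to a single log transaction and ride one log commit instead of serializing on it. The new call sequence in outline, with the error paths from the hunks above omitted:

	struct btrfs_log_ctx ctx;
	int ret;

	btrfs_init_log_ctx(&ctx);
	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
	if (ret == 0)
		/* one commit wakes every ctx joined to this log transaction */
		ret = btrfs_sync_log(trans, root, &ctx);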
@@ -2157,6 +2189,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2157 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2189 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2158 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2190 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2159 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2191 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2192 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2160 2193
2161 ret = btrfs_wait_ordered_range(inode, offset, len); 2194 ret = btrfs_wait_ordered_range(inode, offset, len);
2162 if (ret) 2195 if (ret)
@@ -2172,14 +2205,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2172 * entire page. 2205 * entire page.
2173 */ 2206 */
2174 if (same_page && len < PAGE_CACHE_SIZE) { 2207 if (same_page && len < PAGE_CACHE_SIZE) {
2175 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) 2208 if (offset < ino_size)
2176 ret = btrfs_truncate_page(inode, offset, len, 0); 2209 ret = btrfs_truncate_page(inode, offset, len, 0);
2177 mutex_unlock(&inode->i_mutex); 2210 mutex_unlock(&inode->i_mutex);
2178 return ret; 2211 return ret;
2179 } 2212 }
2180 2213
2181 /* zero back part of the first page */ 2214 /* zero back part of the first page */
2182 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2215 if (offset < ino_size) {
2183 ret = btrfs_truncate_page(inode, offset, 0, 0); 2216 ret = btrfs_truncate_page(inode, offset, 0, 0);
2184 if (ret) { 2217 if (ret) {
2185 mutex_unlock(&inode->i_mutex); 2218 mutex_unlock(&inode->i_mutex);
@@ -2188,7 +2221,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2188 } 2221 }
2189 2222
2190 /* zero the front end of the last page */ 2223 /* zero the front end of the last page */
2191 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2224 if (offset + len < ino_size) {
2192 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2225 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
2193 if (ret) { 2226 if (ret) {
2194 mutex_unlock(&inode->i_mutex); 2227 mutex_unlock(&inode->i_mutex);
@@ -2277,10 +2310,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2277 2310
2278 trans->block_rsv = &root->fs_info->trans_block_rsv; 2311 trans->block_rsv = &root->fs_info->trans_block_rsv;
2279 2312
2280 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2313 if (cur_offset < ino_size) {
2281 if (ret) { 2314 ret = fill_holes(trans, inode, path, cur_offset,
2282 err = ret; 2315 drop_end);
2283 break; 2316 if (ret) {
2317 err = ret;
2318 break;
2319 }
2284 } 2320 }
2285 2321
2286 cur_offset = drop_end; 2322 cur_offset = drop_end;
@@ -2313,10 +2349,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2313 } 2349 }
2314 2350
2315 trans->block_rsv = &root->fs_info->trans_block_rsv; 2351 trans->block_rsv = &root->fs_info->trans_block_rsv;
2316 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2352 if (cur_offset < ino_size) {
2317 if (ret) { 2353 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2318 err = ret; 2354 if (ret) {
2319 goto out_trans; 2355 err = ret;
2356 goto out_trans;
2357 }
2320 } 2358 }
2321 2359
2322out_trans: 2360out_trans:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3d44486290b..06e9a4152b14 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -864,7 +864,8 @@ static noinline int cow_file_range(struct inode *inode,
864 864
865 if (btrfs_is_free_space_inode(inode)) { 865 if (btrfs_is_free_space_inode(inode)) {
866 WARN_ON_ONCE(1); 866 WARN_ON_ONCE(1);
867 return -EINVAL; 867 ret = -EINVAL;
868 goto out_unlock;
868 } 869 }
869 870
870 num_bytes = ALIGN(end - start + 1, blocksize); 871 num_bytes = ALIGN(end - start + 1, blocksize);
@@ -1075,17 +1076,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1075 async_cow->end = cur_end; 1076 async_cow->end = cur_end;
1076 INIT_LIST_HEAD(&async_cow->extents); 1077 INIT_LIST_HEAD(&async_cow->extents);
1077 1078
1078 async_cow->work.func = async_cow_start; 1079 btrfs_init_work(&async_cow->work, async_cow_start,
1079 async_cow->work.ordered_func = async_cow_submit; 1080 async_cow_submit, async_cow_free);
1080 async_cow->work.ordered_free = async_cow_free;
1081 async_cow->work.flags = 0;
1082 1081
1083 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1082 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1084 PAGE_CACHE_SHIFT; 1083 PAGE_CACHE_SHIFT;
1085 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1084 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1086 1085
1087 btrfs_queue_worker(&root->fs_info->delalloc_workers, 1086 btrfs_queue_work(root->fs_info->delalloc_workers,
1088 &async_cow->work); 1087 &async_cow->work);
1089 1088
1090 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1089 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1091 wait_event(root->fs_info->async_submit_wait, 1090 wait_event(root->fs_info->async_submit_wait,
@@ -1843,9 +1842,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1843 1842
1844 SetPageChecked(page); 1843 SetPageChecked(page);
1845 page_cache_get(page); 1844 page_cache_get(page);
1846 fixup->work.func = btrfs_writepage_fixup_worker; 1845 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
1847 fixup->page = page; 1846 fixup->page = page;
1848 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); 1847 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1849 return -EBUSY; 1848 return -EBUSY;
1850} 1849}
1851 1850
@@ -2239,6 +2238,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2239 return PTR_ERR(root); 2238 return PTR_ERR(root);
2240 } 2239 }
2241 2240
2241 if (btrfs_root_readonly(root)) {
2242 srcu_read_unlock(&fs_info->subvol_srcu, index);
2243 return 0;
2244 }
2245
2242 /* step 2: get inode */ 2246 /* step 2: get inode */
2243 key.objectid = backref->inum; 2247 key.objectid = backref->inum;
2244 key.type = BTRFS_INODE_ITEM_KEY; 2248 key.type = BTRFS_INODE_ITEM_KEY;
@@ -2759,7 +2763,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2759 struct inode *inode = page->mapping->host; 2763 struct inode *inode = page->mapping->host;
2760 struct btrfs_root *root = BTRFS_I(inode)->root; 2764 struct btrfs_root *root = BTRFS_I(inode)->root;
2761 struct btrfs_ordered_extent *ordered_extent = NULL; 2765 struct btrfs_ordered_extent *ordered_extent = NULL;
2762 struct btrfs_workers *workers; 2766 struct btrfs_workqueue *workers;
2763 2767
2764 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2768 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2765 2769
@@ -2768,14 +2772,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2768 end - start + 1, uptodate)) 2772 end - start + 1, uptodate))
2769 return 0; 2773 return 0;
2770 2774
2771 ordered_extent->work.func = finish_ordered_fn; 2775 btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
2772 ordered_extent->work.flags = 0;
2773 2776
2774 if (btrfs_is_free_space_inode(inode)) 2777 if (btrfs_is_free_space_inode(inode))
2775 workers = &root->fs_info->endio_freespace_worker; 2778 workers = root->fs_info->endio_freespace_worker;
2776 else 2779 else
2777 workers = &root->fs_info->endio_write_workers; 2780 workers = root->fs_info->endio_write_workers;
2778 btrfs_queue_worker(workers, &ordered_extent->work); 2781 btrfs_queue_work(workers, &ordered_extent->work);
2779 2782
2780 return 0; 2783 return 0;
2781} 2784}
@@ -4593,7 +4596,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
4593 struct rb_node *node; 4596 struct rb_node *node;
4594 4597
4595 ASSERT(inode->i_state & I_FREEING); 4598 ASSERT(inode->i_state & I_FREEING);
4596 truncate_inode_pages(&inode->i_data, 0); 4599 truncate_inode_pages_final(&inode->i_data);
4597 4600
4598 write_lock(&map_tree->lock); 4601 write_lock(&map_tree->lock);
4599 while (!RB_EMPTY_ROOT(&map_tree->map)) { 4602 while (!RB_EMPTY_ROOT(&map_tree->map)) {
@@ -4924,7 +4927,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
4924 struct inode *inode; 4927 struct inode *inode;
4925 u64 objectid = 0; 4928 u64 objectid = 0;
4926 4929
4927 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4930 if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
4931 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4928 4932
4929 spin_lock(&root->inode_lock); 4933 spin_lock(&root->inode_lock);
4930again: 4934again:
@@ -5799,6 +5803,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5799 } 5803 }
5800out_unlock: 5804out_unlock:
5801 btrfs_end_transaction(trans, root); 5805 btrfs_end_transaction(trans, root);
5806 btrfs_balance_delayed_items(root);
5802 btrfs_btree_balance_dirty(root); 5807 btrfs_btree_balance_dirty(root);
5803 if (drop_inode) { 5808 if (drop_inode) {
5804 inode_dec_link_count(inode); 5809 inode_dec_link_count(inode);
@@ -5872,6 +5877,7 @@ out_unlock:
5872 inode_dec_link_count(inode); 5877 inode_dec_link_count(inode);
5873 iput(inode); 5878 iput(inode);
5874 } 5879 }
5880 btrfs_balance_delayed_items(root);
5875 btrfs_btree_balance_dirty(root); 5881 btrfs_btree_balance_dirty(root);
5876 return err; 5882 return err;
5877} 5883}
@@ -5930,6 +5936,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5930 } 5936 }
5931 5937
5932 btrfs_end_transaction(trans, root); 5938 btrfs_end_transaction(trans, root);
5939 btrfs_balance_delayed_items(root);
5933fail: 5940fail:
5934 if (drop_inode) { 5941 if (drop_inode) {
5935 inode_dec_link_count(inode); 5942 inode_dec_link_count(inode);
@@ -5996,6 +6003,7 @@ out_fail:
5996 btrfs_end_transaction(trans, root); 6003 btrfs_end_transaction(trans, root);
5997 if (drop_on_err) 6004 if (drop_on_err)
5998 iput(inode); 6005 iput(inode);
6006 btrfs_balance_delayed_items(root);
5999 btrfs_btree_balance_dirty(root); 6007 btrfs_btree_balance_dirty(root);
6000 return err; 6008 return err;
6001} 6009}
@@ -6550,6 +6558,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6550 int ret; 6558 int ret;
6551 struct extent_buffer *leaf; 6559 struct extent_buffer *leaf;
6552 struct btrfs_root *root = BTRFS_I(inode)->root; 6560 struct btrfs_root *root = BTRFS_I(inode)->root;
6561 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6553 struct btrfs_file_extent_item *fi; 6562 struct btrfs_file_extent_item *fi;
6554 struct btrfs_key key; 6563 struct btrfs_key key;
6555 u64 disk_bytenr; 6564 u64 disk_bytenr;
@@ -6626,6 +6635,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6626 6635
6627 if (btrfs_extent_readonly(root, disk_bytenr)) 6636 if (btrfs_extent_readonly(root, disk_bytenr))
6628 goto out; 6637 goto out;
6638
6639 num_bytes = min(offset + *len, extent_end) - offset;
6640 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6641 u64 range_end;
6642
6643 range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6644 ret = test_range_bit(io_tree, offset, range_end,
6645 EXTENT_DELALLOC, 0, NULL);
6646 if (ret) {
6647 ret = -EAGAIN;
6648 goto out;
6649 }
6650 }
6651
6629 btrfs_release_path(path); 6652 btrfs_release_path(path);
6630 6653
6631 /* 6654 /*
@@ -6654,7 +6677,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6654 */ 6677 */
6655 disk_bytenr += backref_offset; 6678 disk_bytenr += backref_offset;
6656 disk_bytenr += offset - key.offset; 6679 disk_bytenr += offset - key.offset;
6657 num_bytes = min(offset + *len, extent_end) - offset;
6658 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 6680 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6659 goto out; 6681 goto out;
6660 /* 6682 /*
@@ -7024,10 +7046,9 @@ again:
7024 if (!ret) 7046 if (!ret)
7025 goto out_test; 7047 goto out_test;
7026 7048
7027 ordered->work.func = finish_ordered_fn; 7049 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
7028 ordered->work.flags = 0; 7050 btrfs_queue_work(root->fs_info->endio_write_workers,
7029 btrfs_queue_worker(&root->fs_info->endio_write_workers, 7051 &ordered->work);
7030 &ordered->work);
7031out_test: 7052out_test:
7032 /* 7053 /*
7033 * our bio might span multiple ordered extents. If we haven't 7054 * our bio might span multiple ordered extents. If we haven't
@@ -7404,15 +7425,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7404 smp_mb__after_atomic_inc(); 7425 smp_mb__after_atomic_inc();
7405 7426
7406 /* 7427 /*
7407 * The generic stuff only does filemap_write_and_wait_range, which isn't 7428 * The generic stuff only does filemap_write_and_wait_range, which
7408 * enough if we've written compressed pages to this area, so we need to 7429 * isn't enough if we've written compressed pages to this area, so
7409 * call btrfs_wait_ordered_range to make absolutely sure that any 7430 * we need to flush the dirty pages again to make absolutely sure
7410 * outstanding dirty pages are on disk. 7431 * that any outstanding dirty pages are on disk.
7411 */ 7432 */
7412 count = iov_length(iov, nr_segs); 7433 count = iov_length(iov, nr_segs);
7413 ret = btrfs_wait_ordered_range(inode, offset, count); 7434 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7414 if (ret) 7435 &BTRFS_I(inode)->runtime_flags))
7415 return ret; 7436 filemap_fdatawrite_range(inode->i_mapping, offset, count);
7416 7437
7417 if (rw & WRITE) { 7438 if (rw & WRITE) {
7418 /* 7439 /*
@@ -8404,7 +8425,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8404 work->inode = inode; 8425 work->inode = inode;
8405 work->wait = wait; 8426 work->wait = wait;
8406 work->delay_iput = delay_iput; 8427 work->delay_iput = delay_iput;
8407 work->work.func = btrfs_run_delalloc_work; 8428 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
8408 8429
8409 return work; 8430 return work;
8410} 8431}
@@ -8419,7 +8440,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8419 * some fairly slow code that needs optimization. This walks the list 8440 * some fairly slow code that needs optimization. This walks the list
8420 * of all the inodes with pending delalloc and forces them to disk. 8441 * of all the inodes with pending delalloc and forces them to disk.
8421 */ 8442 */
8422static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8443static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8444 int nr)
8423{ 8445{
8424 struct btrfs_inode *binode; 8446 struct btrfs_inode *binode;
8425 struct inode *inode; 8447 struct inode *inode;
@@ -8431,6 +8453,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8431 INIT_LIST_HEAD(&works); 8453 INIT_LIST_HEAD(&works);
8432 INIT_LIST_HEAD(&splice); 8454 INIT_LIST_HEAD(&splice);
8433 8455
8456 mutex_lock(&root->delalloc_mutex);
8434 spin_lock(&root->delalloc_lock); 8457 spin_lock(&root->delalloc_lock);
8435 list_splice_init(&root->delalloc_inodes, &splice); 8458 list_splice_init(&root->delalloc_inodes, &splice);
8436 while (!list_empty(&splice)) { 8459 while (!list_empty(&splice)) {
@@ -8453,12 +8476,14 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8453 else 8476 else
8454 iput(inode); 8477 iput(inode);
8455 ret = -ENOMEM; 8478 ret = -ENOMEM;
8456 goto out; 8479 break;
8457 } 8480 }
8458 list_add_tail(&work->list, &works); 8481 list_add_tail(&work->list, &works);
8459 btrfs_queue_worker(&root->fs_info->flush_workers, 8482 btrfs_queue_work(root->fs_info->flush_workers,
8460 &work->work); 8483 &work->work);
8461 8484 ret++;
8485 if (nr != -1 && ret >= nr)
8486 break;
8462 cond_resched(); 8487 cond_resched();
8463 spin_lock(&root->delalloc_lock); 8488 spin_lock(&root->delalloc_lock);
8464 } 8489 }
@@ -8468,18 +8493,13 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8468 list_del_init(&work->list); 8493 list_del_init(&work->list);
8469 btrfs_wait_and_free_delalloc_work(work); 8494 btrfs_wait_and_free_delalloc_work(work);
8470 } 8495 }
8471 return 0;
8472out:
8473 list_for_each_entry_safe(work, next, &works, list) {
8474 list_del_init(&work->list);
8475 btrfs_wait_and_free_delalloc_work(work);
8476 }
8477 8496
8478 if (!list_empty_careful(&splice)) { 8497 if (!list_empty_careful(&splice)) {
8479 spin_lock(&root->delalloc_lock); 8498 spin_lock(&root->delalloc_lock);
8480 list_splice_tail(&splice, &root->delalloc_inodes); 8499 list_splice_tail(&splice, &root->delalloc_inodes);
8481 spin_unlock(&root->delalloc_lock); 8500 spin_unlock(&root->delalloc_lock);
8482 } 8501 }
8502 mutex_unlock(&root->delalloc_mutex);
8483 return ret; 8503 return ret;
8484} 8504}
8485 8505
@@ -8490,7 +8510,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8490 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 8510 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8491 return -EROFS; 8511 return -EROFS;
8492 8512
8493 ret = __start_delalloc_inodes(root, delay_iput); 8513 ret = __start_delalloc_inodes(root, delay_iput, -1);
8514 if (ret > 0)
8515 ret = 0;
8494 /* 8516 /*
8495 * the filemap_flush will queue IO into the worker threads, but 8517 * the filemap_flush will queue IO into the worker threads, but
8496 * we have to make sure the IO is actually started and that 8518 * we have to make sure the IO is actually started and that
@@ -8507,7 +8529,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8507 return ret; 8529 return ret;
8508} 8530}
8509 8531
8510int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) 8532int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
8533 int nr)
8511{ 8534{
8512 struct btrfs_root *root; 8535 struct btrfs_root *root;
8513 struct list_head splice; 8536 struct list_head splice;
@@ -8518,9 +8541,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8518 8541
8519 INIT_LIST_HEAD(&splice); 8542 INIT_LIST_HEAD(&splice);
8520 8543
8544 mutex_lock(&fs_info->delalloc_root_mutex);
8521 spin_lock(&fs_info->delalloc_root_lock); 8545 spin_lock(&fs_info->delalloc_root_lock);
8522 list_splice_init(&fs_info->delalloc_roots, &splice); 8546 list_splice_init(&fs_info->delalloc_roots, &splice);
8523 while (!list_empty(&splice)) { 8547 while (!list_empty(&splice) && nr) {
8524 root = list_first_entry(&splice, struct btrfs_root, 8548 root = list_first_entry(&splice, struct btrfs_root,
8525 delalloc_root); 8549 delalloc_root);
8526 root = btrfs_grab_fs_root(root); 8550 root = btrfs_grab_fs_root(root);
@@ -8529,15 +8553,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8529 &fs_info->delalloc_roots); 8553 &fs_info->delalloc_roots);
8530 spin_unlock(&fs_info->delalloc_root_lock); 8554 spin_unlock(&fs_info->delalloc_root_lock);
8531 8555
8532 ret = __start_delalloc_inodes(root, delay_iput); 8556 ret = __start_delalloc_inodes(root, delay_iput, nr);
8533 btrfs_put_fs_root(root); 8557 btrfs_put_fs_root(root);
8534 if (ret) 8558 if (ret < 0)
8535 goto out; 8559 goto out;
8536 8560
8561 if (nr != -1) {
8562 nr -= ret;
8563 WARN_ON(nr < 0);
8564 }
8537 spin_lock(&fs_info->delalloc_root_lock); 8565 spin_lock(&fs_info->delalloc_root_lock);
8538 } 8566 }
8539 spin_unlock(&fs_info->delalloc_root_lock); 8567 spin_unlock(&fs_info->delalloc_root_lock);
8540 8568
8569 ret = 0;
8541 atomic_inc(&fs_info->async_submit_draining); 8570 atomic_inc(&fs_info->async_submit_draining);
8542 while (atomic_read(&fs_info->nr_async_submits) || 8571 while (atomic_read(&fs_info->nr_async_submits) ||
8543 atomic_read(&fs_info->async_delalloc_pages)) { 8572 atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8546,13 +8575,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8546 atomic_read(&fs_info->async_delalloc_pages) == 0)); 8575 atomic_read(&fs_info->async_delalloc_pages) == 0));
8547 } 8576 }
8548 atomic_dec(&fs_info->async_submit_draining); 8577 atomic_dec(&fs_info->async_submit_draining);
8549 return 0;
8550out: 8578out:
8551 if (!list_empty_careful(&splice)) { 8579 if (!list_empty_careful(&splice)) {
8552 spin_lock(&fs_info->delalloc_root_lock); 8580 spin_lock(&fs_info->delalloc_root_lock);
8553 list_splice_tail(&splice, &fs_info->delalloc_roots); 8581 list_splice_tail(&splice, &fs_info->delalloc_roots);
8554 spin_unlock(&fs_info->delalloc_root_lock); 8582 spin_unlock(&fs_info->delalloc_root_lock);
8555 } 8583 }
8584 mutex_unlock(&fs_info->delalloc_root_mutex);
8556 return ret; 8585 return ret;
8557} 8586}
8558 8587
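The nr parameter bounds how many delalloc inodes get flushed (__start_delalloc_inodes() now returns the number of inodes it queued), while -1 preserves the old flush-everything behaviour. Example calls, matching the earlier extent-tree.c hunk:

	/* bound the flush while reclaiming space for nr_items reservations */
	btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);

	/* flush everything, e.g. before snapshot creation */
	btrfs_start_delalloc_roots(root->fs_info, 0, -1);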
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a6d8efa46bfe..0401397b5c92 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,32 @@
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61 61
62#ifdef CONFIG_64BIT
63/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
64 * structures are incorrect, as the timespec structure from userspace
65 * is 4 bytes too small. We define these alternatives here to teach
66 * the kernel about the 32-bit struct packing.
67 */
68struct btrfs_ioctl_timespec_32 {
69 __u64 sec;
70 __u32 nsec;
71} __attribute__ ((__packed__));
72
73struct btrfs_ioctl_received_subvol_args_32 {
74 char uuid[BTRFS_UUID_SIZE]; /* in */
75 __u64 stransid; /* in */
76 __u64 rtransid; /* out */
77 struct btrfs_ioctl_timespec_32 stime; /* in */
78 struct btrfs_ioctl_timespec_32 rtime; /* out */
79 __u64 flags; /* in */
80 __u64 reserved[16]; /* in */
81} __attribute__ ((__packed__));
82
83#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
84 struct btrfs_ioctl_received_subvol_args_32)
85#endif
86
87
62static int btrfs_clone(struct inode *src, struct inode *inode, 88static int btrfs_clone(struct inode *src, struct inode *inode,
63 u64 off, u64 olen, u64 olen_aligned, u64 destoff); 89 u64 off, u64 olen, u64 olen_aligned, u64 destoff);
64 90
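Nothing in this hunk wires the new ioctl number into the dispatch switch; a plausible sketch of that handler follows (the argp variable and the native-args helper name are assumptions, not shown in this patch):

	case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: {
		struct btrfs_ioctl_received_subvol_args_32 args32;
		struct btrfs_ioctl_received_subvol_args args;

		/* copy the packed 32-bit layout, then widen the timespecs */
		if (copy_from_user(&args32, argp, sizeof(args32)))
			return -EFAULT;
		memcpy(args.uuid, args32.uuid, BTRFS_UUID_SIZE);
		args.stransid = args32.stransid;
		args.stime.sec = args32.stime.sec;
		args.stime.nsec = args32.stime.nsec;
		args.flags = args32.flags;
		/* assumed helper taking the native layout */
		return _btrfs_ioctl_set_received_subvol(file, &args);
	}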
@@ -585,6 +611,23 @@ fail:
585 return ret; 611 return ret;
586} 612}
587 613
614static void btrfs_wait_nocow_write(struct btrfs_root *root)
615{
616 s64 writers;
617 DEFINE_WAIT(wait);
618
619 do {
620 prepare_to_wait(&root->subv_writers->wait, &wait,
621 TASK_UNINTERRUPTIBLE);
622
623 writers = percpu_counter_sum(&root->subv_writers->counter);
624 if (writers)
625 schedule();
626
627 finish_wait(&root->subv_writers->wait, &wait);
628 } while (writers);
629}
630
588static int create_snapshot(struct btrfs_root *root, struct inode *dir, 631static int create_snapshot(struct btrfs_root *root, struct inode *dir,
589 struct dentry *dentry, char *name, int namelen, 632 struct dentry *dentry, char *name, int namelen,
590 u64 *async_transid, bool readonly, 633 u64 *async_transid, bool readonly,
@@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	if (!root->ref_cows)
 		return -EINVAL;
 
+	atomic_inc(&root->will_be_snapshoted);
+	smp_mb__after_atomic_inc();
+	btrfs_wait_nocow_write(root);
+
 	ret = btrfs_start_delalloc_inodes(root, 0);
 	if (ret)
-		return ret;
+		goto out;
 
 	btrfs_wait_ordered_extents(root, -1);
 
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-	if (!pending_snapshot)
-		return -ENOMEM;
+	if (!pending_snapshot) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
 			     BTRFS_BLOCK_RSV_TEMP);
@@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 			      &pending_snapshot->qgroup_reserved,
 			      false);
 	if (ret)
-		goto out;
+		goto free;
 
 	pending_snapshot->dentry = dentry;
 	pending_snapshot->root = root;
@@ -674,8 +723,10 @@ fail:
 	btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
 					 &pending_snapshot->block_rsv,
 					 pending_snapshot->qgroup_reserved);
-out:
+free:
 	kfree(pending_snapshot);
+out:
+	atomic_dec(&root->will_be_snapshoted);
 	return ret;
 }
 
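The atomic_inc(&root->will_be_snapshoted) / smp_mb__after_atomic_inc() / btrfs_wait_nocow_write() sequence above is one half of a handshake; the other half lives on the nocow write path, which is not part of this hunk. A sketch of what that writer side plausibly looks like (the helper name is hypothetical):

	/* Hypothetical writer-side pairing for btrfs_wait_nocow_write();
	 * not part of this hunk. Returns 0 if a snapshot is pending and the
	 * nocow write must fall back to COW, 1 if the write may proceed.
	 */
	static int start_nocow_write(struct btrfs_root *root)
	{
		if (atomic_read(&root->will_be_snapshoted))
			return 0;

		percpu_counter_inc(&root->subv_writers->counter);
		/*
		 * Pairs with smp_mb__after_atomic_inc() in create_snapshot():
		 * either we see will_be_snapshoted set, or the snapshotter
		 * sees our counter increment and waits for us.
		 */
		smp_mb();
		if (atomic_read(&root->will_be_snapshoted)) {
			percpu_counter_dec(&root->subv_writers->counter);
			if (waitqueue_active(&root->subv_writers->wait))
				wake_up(&root->subv_writers->wait);
			return 0;
		}
		return 1;
	}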
@@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root,
 	min_key.type = BTRFS_EXTENT_DATA_KEY;
 	min_key.offset = *off;
 
-	path->keep_locks = 1;
-
 	while (1) {
+		path->keep_locks = 1;
 		ret = btrfs_search_forward(root, &min_key, path, newer_than);
 		if (ret != 0)
 			goto none;
+		path->keep_locks = 0;
+		btrfs_unlock_up_safe(path, 1);
+process_slot:
 		if (min_key.objectid != ino)
 			goto none;
 		if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root,
 			return 0;
 		}
 
+		path->slots[0]++;
+		if (path->slots[0] < btrfs_header_nritems(leaf)) {
+			btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
+			goto process_slot;
+		}
+
 		if (min_key.offset == (u64)-1)
 			goto none;
 
@@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
 	read_unlock(&em_tree->lock);
 
 	if (!em) {
+		struct extent_state *cached = NULL;
+		u64 end = start + len - 1;
+
 		/* get the big lock and read metadata off disk */
-		lock_extent(io_tree, start, start + len - 1);
+		lock_extent_bits(io_tree, start, end, 0, &cached);
 		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
-		unlock_extent(io_tree, start, start + len - 1);
+		unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
 
 		if (IS_ERR(em))
 			return NULL;
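The lock_extent_bits()/unlock_extent_cached() pairs introduced here (and again below) follow the usual io-tree idiom: the lock call hands back the extent_state it found, and the unlock call consumes it instead of searching the tree a second time. In outline (io_tree, start and end stand for any locked range):

	/* Sketch of the cached extent_state idiom, not specific to defrag. */
	struct extent_state *cached = NULL;

	lock_extent_bits(io_tree, start, end, 0, &cached);	/* remembers the state */
	/* ... read or modify metadata for [start, end] ... */
	unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
	/* unlock reuses 'cached' instead of searching the tree again */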
@@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
 		return false;
 
 	next = defrag_lookup_extent(inode, em->start + em->len);
-	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
+	    (em->block_start + em->block_len == next->block_start))
 		ret = false;
 
 	free_extent_map(next);
@@ -1076,10 +1139,12 @@ again:
 	page_start = page_offset(page);
 	page_end = page_start + PAGE_CACHE_SIZE - 1;
 	while (1) {
-		lock_extent(tree, page_start, page_end);
+		lock_extent_bits(tree, page_start, page_end,
+				 0, &cached_state);
 		ordered = btrfs_lookup_ordered_extent(inode,
 						      page_start);
-		unlock_extent(tree, page_start, page_end);
+		unlock_extent_cached(tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		if (!ordered)
 			break;
 
@@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		}
 	}
 
-	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
 		filemap_flush(inode->i_mapping);
+		if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+			filemap_flush(inode->i_mapping);
+	}
 
 	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
 		/* the filemap_flush will queue IO into the worker threads, but
@@ -1573,7 +1642,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 	if (src_inode->i_sb != file_inode(file)->i_sb) {
 		btrfs_info(BTRFS_I(src_inode)->root->fs_info,
 			   "Snapshot src from another FS");
-		ret = -EINVAL;
+		ret = -EXDEV;
 	} else if (!inode_owner_or_capable(src_inode)) {
 		/*
 		 * Subvolume creation is not restricted, but snapshots
@@ -1797,7 +1866,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
 	if (di && !IS_ERR(di)) {
 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
 		if (key.objectid == root->root_key.objectid) {
-			ret = -ENOTEMPTY;
+			ret = -EPERM;
+			btrfs_err(root->fs_info, "deleting default subvolume "
+				  "%llu is not allowed", key.objectid);
 			goto out;
 		}
 		btrfs_release_path(path);
@@ -2994,8 +3065,9 @@ process_slot:
 						 new_key.offset + datal,
 						 1);
 			if (ret) {
-				btrfs_abort_transaction(trans, root,
-							ret);
+				if (ret != -EINVAL)
+					btrfs_abort_transaction(trans,
+								root, ret);
 				btrfs_end_transaction(trans, root);
 				goto out;
 			}
@@ -3153,8 +3225,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	 *   decompress into destination's address_space (the file offset
 	 *   may change, so source mapping won't do), then recompress (or
 	 *   otherwise reinsert) a subrange.
-	 * - allow ranges within the same file to be cloned (provided
-	 *   they don't overlap)?
+	 *
+	 * - split destination inode's inline extents. The inline extents can
+	 *   be either compressed or non-compressed.
 	 */
 
 	/* the destination must be opened for writing */
@@ -4353,10 +4426,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
 	return btrfs_qgroup_wait_for_completion(root->fs_info);
 }
 
-static long btrfs_ioctl_set_received_subvol(struct file *file,
-					    void __user *arg)
+static long _btrfs_ioctl_set_received_subvol(struct file *file,
+				struct btrfs_ioctl_received_subvol_args *sa)
 {
-	struct btrfs_ioctl_received_subvol_args *sa = NULL;
 	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_root_item *root_item = &root->root_item;
@@ -4384,13 +4456,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
 		goto out;
 	}
 
-	sa = memdup_user(arg, sizeof(*sa));
-	if (IS_ERR(sa)) {
-		ret = PTR_ERR(sa);
-		sa = NULL;
-		goto out;
-	}
-
 	/*
 	 * 1 - root item
 	 * 2 - uuid items (received uuid + subvol uuid)
@@ -4444,14 +4509,91 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
 		goto out;
 	}
 
+out:
+	up_write(&root->fs_info->subvol_sem);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+#ifdef CONFIG_64BIT
+static long btrfs_ioctl_set_received_subvol_32(struct file *file,
+						void __user *arg)
+{
+	struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
+	struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+	int ret = 0;
+
+	args32 = memdup_user(arg, sizeof(*args32));
+	if (IS_ERR(args32)) {
+		ret = PTR_ERR(args32);
+		args32 = NULL;
+		goto out;
+	}
+
+	args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+	if (!args64) {
+		/* kmalloc() returns NULL on failure, not an ERR_PTR */
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
+	args64->stransid = args32->stransid;
+	args64->rtransid = args32->rtransid;
+	args64->stime.sec = args32->stime.sec;
+	args64->stime.nsec = args32->stime.nsec;
+	args64->rtime.sec = args32->rtime.sec;
+	args64->rtime.nsec = args32->rtime.nsec;
+	args64->flags = args32->flags;
+
+	ret = _btrfs_ioctl_set_received_subvol(file, args64);
+	if (ret)
+		goto out;
+
+	memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
+	args32->stransid = args64->stransid;
+	args32->rtransid = args64->rtransid;
+	args32->stime.sec = args64->stime.sec;
+	args32->stime.nsec = args64->stime.nsec;
+	args32->rtime.sec = args64->rtime.sec;
+	args32->rtime.nsec = args64->rtime.nsec;
+	args32->flags = args64->flags;
+
+	ret = copy_to_user(arg, args32, sizeof(*args32));
+	if (ret)
+		ret = -EFAULT;
+
+out:
+	kfree(args32);
+	kfree(args64);
+	return ret;
+}
+#endif
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+					    void __user *arg)
+{
+	struct btrfs_ioctl_received_subvol_args *sa = NULL;
+	int ret = 0;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		sa = NULL;
+		goto out;
+	}
+
+	ret = _btrfs_ioctl_set_received_subvol(file, sa);
+
+	if (ret)
+		goto out;
+
 	ret = copy_to_user(arg, sa, sizeof(*sa));
 	if (ret)
 		ret = -EFAULT;
 
 out:
 	kfree(sa);
-	up_write(&root->fs_info->subvol_sem);
-	mnt_drop_write_file(file);
 	return ret;
 }
 
@@ -4746,7 +4888,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_SYNC: {
 		int ret;
 
-		ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+		ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
 		if (ret)
 			return ret;
 		ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
@@ -4770,6 +4912,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_balance_progress(root, argp);
 	case BTRFS_IOC_SET_RECEIVED_SUBVOL:
 		return btrfs_ioctl_set_received_subvol(file, argp);
+#ifdef CONFIG_64BIT
+	case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
+		return btrfs_ioctl_set_received_subvol_32(file, argp);
+#endif
 	case BTRFS_IOC_SEND:
 		return btrfs_ioctl_send(file, argp);
 	case BTRFS_IOC_GET_DEV_STATS:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b16450b840e7..a94b05f72869 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 	if (!uptodate)
 		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
 
-	if (entry->bytes_left == 0)
+	if (entry->bytes_left == 0) {
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
-	else
+		if (waitqueue_active(&entry->wait))
+			wake_up(&entry->wait);
+	} else {
 		ret = 1;
+	}
 out:
 	if (!ret && cached && entry) {
 		*cached = entry;
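A side note on the waitqueue_active() checks added here and below: the bare check is only safe when something already orders the condition update against the waiter's registration (the tree->lock critical section plausibly provides that here). The generic waker-side shape of the idiom, with an explicit barrier, is:

	/* Generic waker-side sketch; DONE_BIT, flags and wq are illustrative.
	 * The smp_mb() pairs with the barrier implied by prepare_to_wait()
	 * on the sleeping side; without some such ordering a wakeup can be
	 * missed when waitqueue_active() reads a stale emptiness.
	 */
	set_bit(DONE_BIT, &flags);	/* publish the condition */
	smp_mb();			/* order it before the queue check */
	if (waitqueue_active(&wq))
		wake_up(&wq);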
@@ -410,10 +413,13 @@ have_entry:
 	if (!uptodate)
 		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
 
-	if (entry->bytes_left == 0)
+	if (entry->bytes_left == 0) {
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
-	else
+		if (waitqueue_active(&entry->wait))
+			wake_up(&entry->wait);
+	} else {
 		ret = 1;
+	}
 out:
 	if (!ret && cached && entry) {
 		*cached = entry;
@@ -424,27 +430,48 @@ out:
 }
 
 /* Needs to either be called under a log transaction or the log_mutex */
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+void btrfs_get_logged_extents(struct inode *inode,
+			      struct list_head *logged_list)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct btrfs_ordered_extent *ordered;
 	struct rb_node *n;
-	int index = log->log_transid % 2;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
 	spin_lock_irq(&tree->lock);
 	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
 		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
-		spin_lock(&log->log_extents_lock[index]);
-		if (list_empty(&ordered->log_list)) {
-			list_add_tail(&ordered->log_list, &log->logged_list[index]);
-			atomic_inc(&ordered->refs);
-		}
-		spin_unlock(&log->log_extents_lock[index]);
+		if (!list_empty(&ordered->log_list))
+			continue;
+		list_add_tail(&ordered->log_list, logged_list);
+		atomic_inc(&ordered->refs);
 	}
 	spin_unlock_irq(&tree->lock);
 }
 
+void btrfs_put_logged_extents(struct list_head *logged_list)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	while (!list_empty(logged_list)) {
+		ordered = list_first_entry(logged_list,
+					   struct btrfs_ordered_extent,
+					   log_list);
+		list_del_init(&ordered->log_list);
+		btrfs_put_ordered_extent(ordered);
+	}
+}
+
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+				 struct btrfs_root *log)
+{
+	int index = log->log_transid % 2;
+
+	spin_lock_irq(&log->log_extents_lock[index]);
+	list_splice_tail(logged_list, &log->logged_list[index]);
+	spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
 void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
 {
 	struct btrfs_ordered_extent *ordered;
@@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
 	INIT_LIST_HEAD(&splice);
 	INIT_LIST_HEAD(&works);
 
-	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	mutex_lock(&root->ordered_extent_mutex);
 	spin_lock(&root->ordered_extent_lock);
 	list_splice_init(&root->ordered_extents, &splice);
 	while (!list_empty(&splice) && nr) {
@@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
 		atomic_inc(&ordered->refs);
 		spin_unlock(&root->ordered_extent_lock);
 
-		ordered->flush_work.func = btrfs_run_ordered_extent_work;
+		btrfs_init_work(&ordered->flush_work,
+				btrfs_run_ordered_extent_work, NULL, NULL);
 		list_add_tail(&ordered->work_list, &works);
-		btrfs_queue_worker(&root->fs_info->flush_workers,
-				   &ordered->flush_work);
+		btrfs_queue_work(root->fs_info->flush_workers,
+				 &ordered->flush_work);
 
 		cond_resched();
 		spin_lock(&root->ordered_extent_lock);
@@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
 		btrfs_put_ordered_extent(ordered);
 		cond_resched();
 	}
-	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+	mutex_unlock(&root->ordered_extent_mutex);
 
 	return count;
 }
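This file, and several files below (qgroup.c, raid56.c, reada.c, scrub.c), apply the same mechanical conversion from btrfs's old internal thread pools to the new btrfs_workqueue API. The recipe, with placeholder names, is:

	/* Conversion recipe, illustrative names only. */

	/* old API: assign the callback, then queue on an embedded pool */
	work->func = my_worker_fn;
	btrfs_queue_worker(&fs_info->some_workers, work);

	/* new API: one init call taking normal/ordered/free callbacks,
	 * then queue on a pointer to a btrfs_workqueue
	 */
	btrfs_init_work(work, my_worker_fn, NULL, NULL);
	btrfs_queue_work(fs_info->some_workers, work);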
@@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
 
 	INIT_LIST_HEAD(&splice);
 
+	mutex_lock(&fs_info->ordered_operations_mutex);
 	spin_lock(&fs_info->ordered_root_lock);
 	list_splice_init(&fs_info->ordered_roots, &splice);
 	while (!list_empty(&splice) && nr) {
@@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
 	}
 	list_splice_tail(&splice, &fs_info->ordered_roots);
 	spin_unlock(&fs_info->ordered_root_lock);
+	mutex_unlock(&fs_info->ordered_operations_mutex);
 }
 
 /*
@@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
 			goto out;
 		}
 		list_add_tail(&work->list, &works);
-		btrfs_queue_worker(&root->fs_info->flush_workers,
-				   &work->work);
+		btrfs_queue_work(root->fs_info->flush_workers,
+				 &work->work);
 
 		cond_resched();
 		spin_lock(&root->fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f7ac20..246897058efb 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct inode *inode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
 void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_get_logged_extents(struct inode *inode,
+			      struct list_head *logged_list);
+void btrfs_put_logged_extents(struct list_head *logged_list);
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+				 struct btrfs_root *log);
 void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
 int __init ordered_data_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 472302a2d745..2cf905877aaf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 		ret = qgroup_rescan_init(fs_info, 0, 1);
 		if (!ret) {
 			qgroup_rescan_zero_tracking(fs_info);
-			btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
-					   &fs_info->qgroup_rescan_work);
+			btrfs_queue_work(fs_info->qgroup_rescan_workers,
+					 &fs_info->qgroup_rescan_work);
 		}
 		ret = 0;
 	}
@@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 
 	memset(&fs_info->qgroup_rescan_work, 0,
 	       sizeof(fs_info->qgroup_rescan_work));
-	fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
+	btrfs_init_work(&fs_info->qgroup_rescan_work,
+			btrfs_qgroup_rescan_worker, NULL, NULL);
 
 	if (ret) {
 err:
@@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
 
 	qgroup_rescan_zero_tracking(fs_info);
 
-	btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
-			   &fs_info->qgroup_rescan_work);
+	btrfs_queue_work(fs_info->qgroup_rescan_workers,
+			 &fs_info->qgroup_rescan_work);
 
 	return 0;
 }
@@ -2190,6 +2191,6 @@ void
 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
 {
 	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
-		btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
-				   &fs_info->qgroup_rescan_work);
+		btrfs_queue_work(fs_info->qgroup_rescan_workers,
+				 &fs_info->qgroup_rescan_work);
 }
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9af0b25d991a..4055291a523e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,20 +1416,18 @@ cleanup:
 
 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
 {
-	rbio->work.flags = 0;
-	rbio->work.func = rmw_work;
+	btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
 
-	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
-			   &rbio->work);
+	btrfs_queue_work(rbio->fs_info->rmw_workers,
+			 &rbio->work);
 }
 
 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
 {
-	rbio->work.flags = 0;
-	rbio->work.func = read_rebuild_work;
+	btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
 
-	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
-			   &rbio->work);
+	btrfs_queue_work(rbio->fs_info->rmw_workers,
+			 &rbio->work);
 }
 
 /*
@@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	plug = container_of(cb, struct btrfs_plug_cb, cb);
 
 	if (from_schedule) {
-		plug->work.flags = 0;
-		plug->work.func = unplug_work;
-		btrfs_queue_worker(&plug->info->rmw_workers,
-				   &plug->work);
+		btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
+		btrfs_queue_work(plug->info->rmw_workers,
+				 &plug->work);
 		return;
 	}
 	run_plug(plug);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 31c797c48c3e..30947f923620 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
 		/* FIXME we cannot handle this properly right now */
 		BUG();
 	}
-	rmw->work.func = reada_start_machine_worker;
+	btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
 	rmw->fs_info = fs_info;
 
-	btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
+	btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
 }
 
 #ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40ee..def428a25b2a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4248,7 +4248,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
 		   rc->block_group->key.objectid, rc->block_group->flags);
 
-	ret = btrfs_start_delalloc_roots(fs_info, 0);
+	ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
 	if (ret < 0) {
 		err = ret;
 		goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1389b69059de..38bb47e7d6b1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/err.h>
 #include <linux/uuid.h>
 #include "ctree.h"
 #include "transaction.h"
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 		key.offset++;
 
 		root = btrfs_read_fs_root(tree_root, &root_key);
-		err = PTR_RET(root);
+		err = PTR_ERR_OR_ZERO(root);
 		if (err && err != -ENOENT) {
 			break;
 		} else if (err == -ENOENT) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efba5d1282ee..93e6d7172844 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 	atomic_inc(&fs_info->scrubs_running);
 	atomic_inc(&fs_info->scrubs_paused);
 	mutex_unlock(&fs_info->scrub_lock);
+
+	/*
+	 * Checking the @scrubs_running == @scrubs_paused condition inside
+	 * wait_event() is not an atomic operation, which means we may
+	 * inc/dec @scrubs_running/paused at any time. Wake up
+	 * @scrub_pause_wait as often as we can so that transaction commit
+	 * is blocked for as little time as possible.
+	 */
+	wake_up(&fs_info->scrub_pause_wait);
+
 	atomic_inc(&sctx->workers_pending);
 }
 
@@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 		sbio->index = i;
 		sbio->sctx = sctx;
 		sbio->page_count = 0;
-		sbio->work.func = scrub_bio_end_io_worker;
+		btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
+				NULL, NULL);
 
 		if (i != SCRUB_BIOS_PER_SCTX - 1)
 			sctx->bios[i]->next_free = i + 1;
@@ -987,9 +998,10 @@ nodatasum_case:
 		fixup_nodatasum->root = fs_info->extent_root;
 		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
 		scrub_pending_trans_workers_inc(sctx);
-		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
-		btrfs_queue_worker(&fs_info->scrub_workers,
-				   &fixup_nodatasum->work);
+		btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
+				NULL, NULL);
+		btrfs_queue_work(fs_info->scrub_workers,
+				 &fixup_nodatasum->work);
 		goto out;
 	}
 
@@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
 	sbio->err = err;
 	sbio->bio = bio;
 
-	sbio->work.func = scrub_wr_bio_end_io_worker;
-	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+	btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
 }
 
 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
 	sbio->err = err;
 	sbio->bio = bio;
 
-	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+	btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
 }
 
 static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2686,10 +2698,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 
 		wait_event(sctx->list_wait,
 			   atomic_read(&sctx->bios_in_flight) == 0);
-		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+		atomic_inc(&fs_info->scrubs_paused);
+		wake_up(&fs_info->scrub_pause_wait);
+
+		/*
+		 * This must be done before we decrease @scrubs_paused.
+		 * It makes sure we don't block transaction commit while
+		 * we are waiting for pending workers to finish.
+		 */
 		wait_event(sctx->list_wait,
 			   atomic_read(&sctx->workers_pending) == 0);
-		scrub_blocked_if_needed(fs_info);
+		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+
+		mutex_lock(&fs_info->scrub_lock);
+		__scrub_blocked_if_needed(fs_info);
+		atomic_dec(&fs_info->scrubs_paused);
+		mutex_unlock(&fs_info->scrub_lock);
+		wake_up(&fs_info->scrub_pause_wait);
 
 		btrfs_put_block_group(cache);
 		if (ret)
@@ -2757,33 +2782,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 						int is_dev_replace)
 {
 	int ret = 0;
+	int flags = WQ_FREEZABLE | WQ_UNBOUND;
+	int max_active = fs_info->thread_pool_size;
 
 	if (fs_info->scrub_workers_refcnt == 0) {
 		if (is_dev_replace)
-			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
-					&fs_info->generic_worker);
+			fs_info->scrub_workers =
+				btrfs_alloc_workqueue("btrfs-scrub", flags,
+						      1, 4);
 		else
-			btrfs_init_workers(&fs_info->scrub_workers, "scrub",
-					fs_info->thread_pool_size,
-					&fs_info->generic_worker);
-		fs_info->scrub_workers.idle_thresh = 4;
-		ret = btrfs_start_workers(&fs_info->scrub_workers);
-		if (ret)
+			fs_info->scrub_workers =
+				btrfs_alloc_workqueue("btrfs-scrub", flags,
+						      max_active, 4);
+		if (!fs_info->scrub_workers) {
+			ret = -ENOMEM;
 			goto out;
-		btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
-				   "scrubwrc",
-				   fs_info->thread_pool_size,
-				   &fs_info->generic_worker);
-		fs_info->scrub_wr_completion_workers.idle_thresh = 2;
-		ret = btrfs_start_workers(
-				&fs_info->scrub_wr_completion_workers);
-		if (ret)
+		}
+		fs_info->scrub_wr_completion_workers =
+			btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+					      max_active, 2);
+		if (!fs_info->scrub_wr_completion_workers) {
+			ret = -ENOMEM;
 			goto out;
-		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
-				   &fs_info->generic_worker);
-		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
-		if (ret)
+		}
+		fs_info->scrub_nocow_workers =
+			btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+		if (!fs_info->scrub_nocow_workers) {
+			ret = -ENOMEM;
 			goto out;
+		}
 	}
 	++fs_info->scrub_workers_refcnt;
 out:
@@ -2793,9 +2820,9 @@ out:
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
 	if (--fs_info->scrub_workers_refcnt == 0) {
-		btrfs_stop_workers(&fs_info->scrub_workers);
-		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
-		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+		btrfs_destroy_workqueue(fs_info->scrub_workers);
+		btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+		btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
 	}
 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
 }
@@ -3106,10 +3133,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 	nocow_ctx->len = len;
 	nocow_ctx->mirror_num = mirror_num;
 	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
-	nocow_ctx->work.func = copy_nocow_pages_worker;
+	btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
 	INIT_LIST_HEAD(&nocow_ctx->inodes);
-	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
-			   &nocow_ctx->work);
+	btrfs_queue_work(fs_info->scrub_nocow_workers,
+			 &nocow_ctx->work);
 
 	return 0;
 }
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9dde9717c1b9..9b6da9d55f9a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -51,15 +51,18 @@ struct fs_path {
 	struct {
 		char *start;
 		char *end;
-		char *prepared;
 
 		char *buf;
-		int buf_len;
-		unsigned int reversed:1;
-		unsigned int virtual_mem:1;
+		unsigned short buf_len:15;
+		unsigned short reversed:1;
 		char inline_buf[];
 	};
-	char pad[PAGE_SIZE];
+	/*
+	 * Average path length does not exceed 200 bytes, so we'll get
+	 * better packing in the slab and a higher chance of satisfying
+	 * an allocation later during send.
+	 */
+	char pad[256];
 };
 #define FS_PATH_INLINE_SIZE \
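A stand-alone sketch (user space, GCC/Clang extensions) of what the repacked fs_path buys: the whole object now fits a 256-byte slab allocation, and the inline buffer is whatever remains of those 256 bytes after the header fields:

	/* Illustration of the fs_path packing; field names mirror the kernel
	 * struct, but this is an independent user-space program.
	 */
	#include <stdio.h>
	#include <stddef.h>

	struct fs_path {
		union {
			struct {
				char *start;
				char *end;
				char *buf;
				unsigned short buf_len:15;
				unsigned short reversed:1;
				char inline_buf[];
			};
			char pad[256];
		};
	};

	#define FS_PATH_INLINE_SIZE \
		(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))

	int main(void)
	{
		/* 256 total, minus 3 pointers plus the bitfield word */
		printf("struct size: %zu, inline buffer: %zu bytes\n",
		       sizeof(struct fs_path), (size_t)FS_PATH_INLINE_SIZE);
		return 0;
	}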
@@ -109,6 +112,7 @@ struct send_ctx {
 	int cur_inode_deleted;
 	u64 cur_inode_size;
 	u64 cur_inode_mode;
+	u64 cur_inode_rdev;
 	u64 cur_inode_last_extent;
 
 	u64 send_progress;
@@ -120,6 +124,8 @@ struct send_ctx {
 	struct list_head name_cache_list;
 	int name_cache_size;
 
+	struct file_ra_state ra;
+
 	char *read_buf;
 
 	/*
@@ -175,6 +181,47 @@ struct send_ctx {
 	 * own move/rename can be performed.
 	 */
 	struct rb_root waiting_dir_moves;
+
+	/*
+	 * A directory that is going to be rm'ed might have a child directory
+	 * which is in the pending directory moves index above. In this case,
+	 * the directory can only be removed after the move/rename of its child
+	 * is performed. Example:
+	 *
+	 * Parent snapshot:
+	 *
+	 * .                        (ino 256)
+	 * |-- a/                   (ino 257)
+	 *     |-- b/               (ino 258)
+	 *         |-- c/           (ino 259)
+	 *         |   |-- x/       (ino 260)
+	 *         |
+	 *         |-- y/           (ino 261)
+	 *
+	 * Send snapshot:
+	 *
+	 * .                        (ino 256)
+	 * |-- a/                   (ino 257)
+	 *     |-- b/               (ino 258)
+	 *         |-- YY/          (ino 261)
+	 *              |-- x/      (ino 260)
+	 *
+	 * Sequence of steps that lead to the send snapshot:
+	 * rm -f /a/b/c/foo.txt
+	 * mv /a/b/y /a/b/YY
+	 * mv /a/b/c/x /a/b/YY
+	 * rmdir /a/b/c
+	 *
+	 * When the child is processed, its move/rename is delayed until its
+	 * parent is processed (as explained above), but all other operations
+	 * like update utimes, chown, chgrp, etc, are performed and the paths
+	 * that it uses for those operations must use the orphanized name of
+	 * its parent (the directory we're going to rm later), so we need to
+	 * memorize that name.
+	 *
+	 * Indexed by the inode number of the directory to be deleted.
+	 */
+	struct rb_root orphan_dirs;
 };
 
 struct pending_dir_move {
@@ -189,6 +236,18 @@ struct pending_dir_move {
 struct waiting_dir_move {
 	struct rb_node node;
 	u64 ino;
+	/*
+	 * There might be some directory that could not be removed because it
+	 * was waiting for this directory inode to be moved first. Therefore
+	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
+	 */
+	u64 rmdir_ino;
+};
+
+struct orphan_dir_info {
+	struct rb_node node;
+	u64 ino;
+	u64 gen;
 };
 
 struct name_cache_entry {
@@ -214,6 +273,11 @@ struct name_cache_entry {
 
 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
 
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+
 static int need_send_hole(struct send_ctx *sctx)
 {
 	return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
 	if (!p)
 		return NULL;
 	p->reversed = 0;
-	p->virtual_mem = 0;
 	p->buf = p->inline_buf;
 	p->buf_len = FS_PATH_INLINE_SIZE;
 	fs_path_reset(p);
@@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
 {
 	if (!p)
 		return;
-	if (p->buf != p->inline_buf) {
-		if (p->virtual_mem)
-			vfree(p->buf);
-		else
-			kfree(p->buf);
-	}
+	if (p->buf != p->inline_buf)
+		kfree(p->buf);
 	kfree(p);
 }
 
@@ -292,40 +351,23 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
 
 	path_len = p->end - p->start;
 	old_buf_len = p->buf_len;
-	len = PAGE_ALIGN(len);
-
-	if (p->buf == p->inline_buf) {
-		tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN);
-		if (!tmp_buf) {
-			tmp_buf = vmalloc(len);
-			if (!tmp_buf)
-				return -ENOMEM;
-			p->virtual_mem = 1;
-		}
-		memcpy(tmp_buf, p->buf, p->buf_len);
-		p->buf = tmp_buf;
-		p->buf_len = len;
-	} else {
-		if (p->virtual_mem) {
-			tmp_buf = vmalloc(len);
-			if (!tmp_buf)
-				return -ENOMEM;
-			memcpy(tmp_buf, p->buf, p->buf_len);
-			vfree(p->buf);
-		} else {
-			tmp_buf = krealloc(p->buf, len, GFP_NOFS);
-			if (!tmp_buf) {
-				tmp_buf = vmalloc(len);
-				if (!tmp_buf)
-					return -ENOMEM;
-				memcpy(tmp_buf, p->buf, p->buf_len);
-				kfree(p->buf);
-				p->virtual_mem = 1;
-			}
-		}
-		p->buf = tmp_buf;
-		p->buf_len = len;
-	}
+
+	/*
+	 * First time the inline_buf does not suffice
+	 */
+	if (p->buf == p->inline_buf) {
+		tmp_buf = kmalloc(len, GFP_NOFS);
+		/* preserve the current path when leaving the inline buffer */
+		if (tmp_buf)
+			memcpy(tmp_buf, p->buf, old_buf_len);
+	} else {
+		tmp_buf = krealloc(p->buf, len, GFP_NOFS);
+	}
+	if (!tmp_buf)
+		return -ENOMEM;
+	p->buf = tmp_buf;
+	/*
+	 * The real size of the buffer is bigger, this will let the fast path
+	 * happen most of the time
+	 */
+	p->buf_len = ksize(p->buf);
+
 	if (p->reversed) {
 		tmp_buf = p->buf + old_buf_len - path_len - 1;
 		p->end = p->buf + p->buf_len - 1;
@@ -338,7 +380,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
 	return 0;
 }
 
-static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+				   char **prepared)
 {
 	int ret;
 	int new_len;
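The p->buf_len = ksize(p->buf) assignment records the slab object's real size rather than the requested length, so subsequent appends usually fit without reallocating. The same trick in user space, with glibc's malloc_usable_size() standing in for ksize():

	/* User-space analogue of the ksize() trick; illustrative only. */
	#include <stdio.h>
	#include <stdlib.h>
	#include <malloc.h>	/* malloc_usable_size(), glibc */

	int main(void)
	{
		size_t want = 100;
		char *buf = malloc(want);

		if (!buf)
			return 1;
		/* record what the allocator really gave, not what we asked for */
		size_t real = malloc_usable_size(buf);
		printf("asked for %zu, can actually use %zu\n", want, real);
		/* appends that stay within 'real' bytes need no realloc */
		free(buf);
		return 0;
	}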
@@ -354,11 +397,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
 		if (p->start != p->end)
 			*--p->start = '/';
 		p->start -= name_len;
-		p->prepared = p->start;
+		*prepared = p->start;
 	} else {
 		if (p->start != p->end)
 			*p->end++ = '/';
-		p->prepared = p->end;
+		*prepared = p->end;
 		p->end += name_len;
 		*p->end = 0;
 	}
@@ -370,12 +413,12 @@ out:
 static int fs_path_add(struct fs_path *p, const char *name, int name_len)
 {
 	int ret;
+	char *prepared;
 
-	ret = fs_path_prepare_for_add(p, name_len);
+	ret = fs_path_prepare_for_add(p, name_len, &prepared);
 	if (ret < 0)
 		goto out;
-	memcpy(p->prepared, name, name_len);
-	p->prepared = NULL;
+	memcpy(prepared, name, name_len);
 
 out:
 	return ret;
@@ -384,12 +427,12 @@ out:
 static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
 {
 	int ret;
+	char *prepared;
 
-	ret = fs_path_prepare_for_add(p, p2->end - p2->start);
+	ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
 	if (ret < 0)
 		goto out;
-	memcpy(p->prepared, p2->start, p2->end - p2->start);
-	p->prepared = NULL;
+	memcpy(prepared, p2->start, p2->end - p2->start);
 
 out:
 	return ret;
@@ -400,13 +443,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
 					  unsigned long off, int len)
 {
 	int ret;
+	char *prepared;
 
-	ret = fs_path_prepare_for_add(p, len);
+	ret = fs_path_prepare_for_add(p, len, &prepared);
 	if (ret < 0)
 		goto out;
 
-	read_extent_buffer(eb, p->prepared, off, len);
-	p->prepared = NULL;
+	read_extent_buffer(eb, prepared, off, len);
 
 out:
 	return ret;
@@ -915,9 +958,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 	struct btrfs_dir_item *di;
 	struct btrfs_key di_key;
 	char *buf = NULL;
-	char *buf2 = NULL;
-	int buf_len;
-	int buf_virtual = 0;
+	const int buf_len = PATH_MAX;
 	u32 name_len;
 	u32 data_len;
 	u32 cur;
@@ -927,7 +968,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 	int num;
 	u8 type;
 
-	buf_len = PAGE_SIZE;
 	buf = kmalloc(buf_len, GFP_NOFS);
 	if (!buf) {
 		ret = -ENOMEM;
@@ -949,30 +989,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 		type = btrfs_dir_type(eb, di);
 		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
 
+		/*
+		 * Path too long
+		 */
 		if (name_len + data_len > buf_len) {
-			buf_len = PAGE_ALIGN(name_len + data_len);
-			if (buf_virtual) {
-				buf2 = vmalloc(buf_len);
-				if (!buf2) {
-					ret = -ENOMEM;
-					goto out;
-				}
-				vfree(buf);
-			} else {
-				buf2 = krealloc(buf, buf_len, GFP_NOFS);
-				if (!buf2) {
-					buf2 = vmalloc(buf_len);
-					if (!buf2) {
-						ret = -ENOMEM;
-						goto out;
-					}
-					kfree(buf);
-					buf_virtual = 1;
-				}
-			}
-
-			buf = buf2;
-			buf2 = NULL;
+			ret = -ENAMETOOLONG;
+			goto out;
 		}
 
 		read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -995,10 +1017,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 	}
 
 out:
-	if (buf_virtual)
-		vfree(buf);
-	else
-		kfree(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -1292,8 +1311,6 @@ static int find_extent_clone(struct send_ctx *sctx,
 		extent_item_pos = logical - found_key.objectid;
 	else
 		extent_item_pos = 0;
-
-	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(sctx->send_root->fs_info,
 					found_key.objectid, extent_item_pos, 1,
 					__iterate_backrefs, backref_ctx);
@@ -1418,11 +1435,7 @@ static int gen_unique_name(struct send_ctx *sctx,
 	while (1) {
 		len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
 			       ino, gen, idx);
-		if (len >= sizeof(tmp)) {
-			/* should really not happen */
-			ret = -EOVERFLOW;
-			goto out;
-		}
+		ASSERT(len < sizeof(tmp));
 
 		di = btrfs_lookup_dir_item(NULL, sctx->send_root,
 					   path, BTRFS_FIRST_FREE_OBJECTID,
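The ASSERT above holds by arithmetic: a u64 prints as at most 20 decimal digits, so "o%llu-%llu-%llu" is at most 1 + 20 + 1 + 20 + 1 + 20 = 63 characters plus the terminating NUL, which fits a 64-byte buffer (the buffer size here is inferred from that worst case). A quick stand-alone check:

	/* Worst-case length check for "o%llu-%llu-%llu". */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		char tmp[64];		/* assumed size of the on-stack buffer */
		uint64_t max = UINT64_MAX;	/* 18446744073709551615: 20 digits */
		int len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
				   (unsigned long long)max,
				   (unsigned long long)max,
				   (unsigned long long)max);

		printf("worst case: %d chars (buffer holds %zu)\n",
		       len, sizeof(tmp));
		return len < (int)sizeof(tmp) ? 0 : 1;
	}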
@@ -1898,13 +1911,20 @@ static void name_cache_delete(struct send_ctx *sctx,
 
 	nce_head = radix_tree_lookup(&sctx->name_cache,
 				     (unsigned long)nce->ino);
-	BUG_ON(!nce_head);
+	if (!nce_head) {
+		btrfs_err(sctx->send_root->fs_info,
+	      "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+			  nce->ino, sctx->name_cache_size);
+	}
 
 	list_del(&nce->radix_list);
 	list_del(&nce->list);
 	sctx->name_cache_size--;
 
-	if (list_empty(nce_head)) {
+	/*
+	 * We may not get to the final release of nce_head if the lookup fails
+	 */
+	if (nce_head && list_empty(nce_head)) {
 		radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
 		kfree(nce_head);
 	}
@@ -1977,7 +1997,6 @@ static void name_cache_free(struct send_ctx *sctx)
  */
 static int __get_cur_name_and_parent(struct send_ctx *sctx,
 				     u64 ino, u64 gen,
-				     int skip_name_cache,
 				     u64 *parent_ino,
 				     u64 *parent_gen,
 				     struct fs_path *dest)
@@ -1987,8 +2006,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	struct btrfs_path *path = NULL;
 	struct name_cache_entry *nce = NULL;
 
-	if (skip_name_cache)
-		goto get_ref;
 	/*
 	 * First check if we already did a call to this function with the same
 	 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -2033,12 +2050,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 		goto out_cache;
 	}
 
-get_ref:
 	/*
 	 * Depending on whether the inode was already processed or not, use
 	 * send_root or parent_root for ref lookup.
 	 */
-	if (ino < sctx->send_progress && !skip_name_cache)
+	if (ino < sctx->send_progress)
 		ret = get_first_ref(sctx->send_root, ino,
 				    parent_ino, parent_gen, dest);
 	else
@@ -2062,8 +2078,6 @@ get_ref:
 			goto out;
 		ret = 1;
 	}
-	if (skip_name_cache)
-		goto out;
 
 out_cache:
 	/*
@@ -2131,9 +2145,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 	u64 parent_inode = 0;
 	u64 parent_gen = 0;
 	int stop = 0;
-	u64 start_ino = ino;
-	u64 start_gen = gen;
-	int skip_name_cache = 0;
 
 	name = fs_path_alloc();
 	if (!name) {
@@ -2141,31 +2152,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 		goto out;
 	}
 
-	if (is_waiting_for_move(sctx, ino))
-		skip_name_cache = 1;
-
-again:
 	dest->reversed = 1;
 	fs_path_reset(dest);
 
 	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
 		fs_path_reset(name);
 
-		ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache,
-				&parent_inode, &parent_gen, name);
+		if (is_waiting_for_rm(sctx, ino)) {
+			ret = gen_unique_name(sctx, ino, gen, name);
+			if (ret < 0)
+				goto out;
+			ret = fs_path_add_path(dest, name);
+			break;
+		}
+
+		if (is_waiting_for_move(sctx, ino)) {
+			ret = get_first_ref(sctx->parent_root, ino,
+					    &parent_inode, &parent_gen, name);
+		} else {
+			ret = __get_cur_name_and_parent(sctx, ino, gen,
+							&parent_inode,
+							&parent_gen, name);
+			if (ret)
+				stop = 1;
+		}
+
 		if (ret < 0)
 			goto out;
-		if (ret)
-			stop = 1;
-
-		if (!skip_name_cache &&
-		    is_waiting_for_move(sctx, parent_inode)) {
-			ino = start_ino;
-			gen = start_gen;
-			stop = 0;
-			skip_name_cache = 1;
-			goto again;
-		}
 
 		ret = fs_path_add_path(dest, name);
 		if (ret < 0)
@@ -2429,10 +2442,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 	if (!p)
 		return -ENOMEM;
 
-	ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
-			NULL, &rdev);
-	if (ret < 0)
-		goto out;
+	if (ino != sctx->cur_ino) {
+		ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
+				     NULL, NULL, &rdev);
+		if (ret < 0)
+			goto out;
+	} else {
+		gen = sctx->cur_inode_gen;
+		mode = sctx->cur_inode_mode;
+		rdev = sctx->cur_inode_rdev;
+	}
 
 	if (S_ISREG(mode)) {
 		cmd = BTRFS_SEND_C_MKFILE;
@@ -2512,17 +2531,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
 	key.objectid = dir;
 	key.type = BTRFS_DIR_INDEX_KEY;
 	key.offset = 0;
+	ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
 	while (1) {
-		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
-				1, 0);
-		if (ret < 0)
-			goto out;
-		if (!ret) {
-			eb = path->nodes[0];
-			slot = path->slots[0];
-			btrfs_item_key_to_cpu(eb, &found_key, slot);
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(sctx->send_root, path);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = 0;
+				break;
+			}
+			continue;
 		}
-		if (ret || found_key.objectid != key.objectid ||
+
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+		if (found_key.objectid != key.objectid ||
 		    found_key.type != key.type) {
 			ret = 0;
 			goto out;
@@ -2537,8 +2565,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
 			goto out;
 		}
 
-		key.offset = found_key.offset + 1;
-		btrfs_release_path(path);
+		path->slots[0]++;
 	}
 
 out:
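The rewrite above, and the matching one in can_rmdir() further down, replace a full btree re-search per item with the canonical leaf walk: search once, then advance slot by slot and hop leaves with btrfs_next_leaf(). Stripped of the function-specific checks, the skeleton is (root, key and the per-item work are placeholders; declarations elided):

	/* Canonical btrfs leaf-walk skeleton; illustrative, not from this patch. */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);	/* hop to next leaf */
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;				/* no more leaves */
			continue;				/* re-read leaf/slot */
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type)
			break;					/* left our range */

		/* ... per-item processing goes here ... */

		path->slots[0]++;		/* next item, no re-search */
	}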
@@ -2590,7 +2617,7 @@ struct recorded_ref {
  * everything mixed. So we first record all refs and later process them.
  * This function is a helper to record one ref.
  */
-static int record_ref(struct list_head *head, u64 dir,
+static int __record_ref(struct list_head *head, u64 dir,
 		      u64 dir_gen, struct fs_path *path)
 {
 	struct recorded_ref *ref;
@@ -2676,12 +2703,78 @@ out:
 	return ret;
 }
 
+static struct orphan_dir_info *
+add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+	struct rb_node **p = &sctx->orphan_dirs.rb_node;
+	struct rb_node *parent = NULL;
+	struct orphan_dir_info *entry, *odi;
+
+	odi = kmalloc(sizeof(*odi), GFP_NOFS);
+	if (!odi)
+		return ERR_PTR(-ENOMEM);
+	odi->ino = dir_ino;
+	odi->gen = 0;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct orphan_dir_info, node);
+		if (dir_ino < entry->ino) {
+			p = &(*p)->rb_left;
+		} else if (dir_ino > entry->ino) {
+			p = &(*p)->rb_right;
+		} else {
+			kfree(odi);
+			return entry;
+		}
+	}
+
+	rb_link_node(&odi->node, parent, p);
+	rb_insert_color(&odi->node, &sctx->orphan_dirs);
+	return odi;
+}
+
+static struct orphan_dir_info *
+get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+	struct rb_node *n = sctx->orphan_dirs.rb_node;
+	struct orphan_dir_info *entry;
+
+	while (n) {
+		entry = rb_entry(n, struct orphan_dir_info, node);
+		if (dir_ino < entry->ino)
+			n = n->rb_left;
+		else if (dir_ino > entry->ino)
+			n = n->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+{
+	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+
+	return odi != NULL;
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+				 struct orphan_dir_info *odi)
+{
+	if (!odi)
+		return;
+	rb_erase(&odi->node, &sctx->orphan_dirs);
+	kfree(odi);
+}
+
 /*
  * Returns 1 if a directory can be removed at this point in time.
  * We check this by iterating all dir items and checking if the inode behind
  * the dir item was already processed.
  */
-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+		     u64 send_progress)
 {
 	int ret = 0;
 	struct btrfs_root *root = sctx->parent_root;
@@ -2704,31 +2797,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2704 key.objectid = dir; 2797 key.objectid = dir;
2705 key.type = BTRFS_DIR_INDEX_KEY; 2798 key.type = BTRFS_DIR_INDEX_KEY;
2706 key.offset = 0; 2799 key.offset = 0;
2800 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2801 if (ret < 0)
2802 goto out;
2707 2803
2708 while (1) { 2804 while (1) {
2709 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 2805 struct waiting_dir_move *dm;
2710 if (ret < 0) 2806
2711 goto out; 2807 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2712 if (!ret) { 2808 ret = btrfs_next_leaf(root, path);
2713 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2809 if (ret < 0)
2714 path->slots[0]); 2810 goto out;
2811 else if (ret > 0)
2812 break;
2813 continue;
2715 } 2814 }
2716 if (ret || found_key.objectid != key.objectid || 2815 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2717 found_key.type != key.type) { 2816 path->slots[0]);
2817 if (found_key.objectid != key.objectid ||
2818 found_key.type != key.type)
2718 break; 2819 break;
2719 }
2720 2820
2721 di = btrfs_item_ptr(path->nodes[0], path->slots[0], 2821 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2722 struct btrfs_dir_item); 2822 struct btrfs_dir_item);
2723 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); 2823 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2724 2824
2825 dm = get_waiting_dir_move(sctx, loc.objectid);
2826 if (dm) {
2827 struct orphan_dir_info *odi;
2828
2829 odi = add_orphan_dir_info(sctx, dir);
2830 if (IS_ERR(odi)) {
2831 ret = PTR_ERR(odi);
2832 goto out;
2833 }
2834 odi->gen = dir_gen;
2835 dm->rmdir_ino = dir;
2836 ret = 0;
2837 goto out;
2838 }
2839
2725 if (loc.objectid > send_progress) { 2840 if (loc.objectid > send_progress) {
2726 ret = 0; 2841 ret = 0;
2727 goto out; 2842 goto out;
2728 } 2843 }
2729 2844
2730 btrfs_release_path(path); 2845 path->slots[0]++;
2731 key.offset = found_key.offset + 1;
2732 } 2846 }
2733 2847
2734 ret = 1; 2848 ret = 1;
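The reworked can_rmdir() also handles a directory that still contains a child which is itself waiting to be moved out: rather than polling, it records the directory as an orphan dir (keyed by inode number, with the generation attached) and stores its inode number in the child's rmdir_ino, so the removal can be retried once the child's move is applied. The per-child decision, condensed from the hunk (the out-label error paths are written as returns here):

	/* Inside the dir-index walk of can_rmdir(), for each child inode: */
	dm = get_waiting_dir_move(sctx, loc.objectid);
	if (dm) {
		/*
		 * A child is still waiting to be moved out of 'dir'.
		 * Remember the directory and its generation so that
		 * apply_dir_move() can retry the rmdir after the move,
		 * then report "not removable yet".
		 */
		odi = add_orphan_dir_info(sctx, dir);
		if (IS_ERR(odi))
			return PTR_ERR(odi);
		odi->gen = dir_gen;
		dm->rmdir_ino = dir;
		return 0;
	}
	if (loc.objectid > send_progress)
		return 0;	/* child not processed yet, can't rmdir */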
@@ -2740,19 +2854,9 @@ out:
2740 2854
2741static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) 2855static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
2742{ 2856{
2743 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2857 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
2744 struct waiting_dir_move *entry;
2745 2858
2746 while (n) { 2859 return entry != NULL;
2747 entry = rb_entry(n, struct waiting_dir_move, node);
2748 if (ino < entry->ino)
2749 n = n->rb_left;
2750 else if (ino > entry->ino)
2751 n = n->rb_right;
2752 else
2753 return 1;
2754 }
2755 return 0;
2756} 2860}
2757 2861
2758static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2862static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
@@ -2765,6 +2869,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2765 if (!dm) 2869 if (!dm)
2766 return -ENOMEM; 2870 return -ENOMEM;
2767 dm->ino = ino; 2871 dm->ino = ino;
2872 dm->rmdir_ino = 0;
2768 2873
2769 while (*p) { 2874 while (*p) {
2770 parent = *p; 2875 parent = *p;
@@ -2784,31 +2889,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2784 return 0; 2889 return 0;
2785} 2890}
2786 2891
2787static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2892static struct waiting_dir_move *
2893get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2788{ 2894{
2789 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2895 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2790 struct waiting_dir_move *entry; 2896 struct waiting_dir_move *entry;
2791 2897
2792 while (n) { 2898 while (n) {
2793 entry = rb_entry(n, struct waiting_dir_move, node); 2899 entry = rb_entry(n, struct waiting_dir_move, node);
2794 if (ino < entry->ino) { 2900 if (ino < entry->ino)
2795 n = n->rb_left; 2901 n = n->rb_left;
2796 } else if (ino > entry->ino) { 2902 else if (ino > entry->ino)
2797 n = n->rb_right; 2903 n = n->rb_right;
2798 } else { 2904 else
2799 rb_erase(&entry->node, &sctx->waiting_dir_moves); 2905 return entry;
2800 kfree(entry);
2801 return 0;
2802 }
2803 } 2906 }
2804 return -ENOENT; 2907 return NULL;
2908}
2909
2910static void free_waiting_dir_move(struct send_ctx *sctx,
2911 struct waiting_dir_move *dm)
2912{
2913 if (!dm)
2914 return;
2915 rb_erase(&dm->node, &sctx->waiting_dir_moves);
2916 kfree(dm);
2805} 2917}
2806 2918
2807static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) 2919static int add_pending_dir_move(struct send_ctx *sctx,
2920 u64 ino,
2921 u64 ino_gen,
2922 u64 parent_ino)
2808{ 2923{
2809 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 2924 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2810 struct rb_node *parent = NULL; 2925 struct rb_node *parent = NULL;
2811 struct pending_dir_move *entry, *pm; 2926 struct pending_dir_move *entry = NULL, *pm;
2812 struct recorded_ref *cur; 2927 struct recorded_ref *cur;
2813 int exists = 0; 2928 int exists = 0;
2814 int ret; 2929 int ret;
@@ -2817,8 +2932,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
2817 if (!pm) 2932 if (!pm)
2818 return -ENOMEM; 2933 return -ENOMEM;
2819 pm->parent_ino = parent_ino; 2934 pm->parent_ino = parent_ino;
2820 pm->ino = sctx->cur_ino; 2935 pm->ino = ino;
2821 pm->gen = sctx->cur_inode_gen; 2936 pm->gen = ino_gen;
2822 INIT_LIST_HEAD(&pm->list); 2937 INIT_LIST_HEAD(&pm->list);
2823 INIT_LIST_HEAD(&pm->update_refs); 2938 INIT_LIST_HEAD(&pm->update_refs);
2824 RB_CLEAR_NODE(&pm->node); 2939 RB_CLEAR_NODE(&pm->node);
@@ -2888,19 +3003,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2888{ 3003{
2889 struct fs_path *from_path = NULL; 3004 struct fs_path *from_path = NULL;
2890 struct fs_path *to_path = NULL; 3005 struct fs_path *to_path = NULL;
3006 struct fs_path *name = NULL;
2891 u64 orig_progress = sctx->send_progress; 3007 u64 orig_progress = sctx->send_progress;
2892 struct recorded_ref *cur; 3008 struct recorded_ref *cur;
3009 u64 parent_ino, parent_gen;
3010 struct waiting_dir_move *dm = NULL;
3011 u64 rmdir_ino = 0;
2893 int ret; 3012 int ret;
2894 3013
3014 name = fs_path_alloc();
2895 from_path = fs_path_alloc(); 3015 from_path = fs_path_alloc();
2896 if (!from_path) 3016 if (!name || !from_path) {
2897 return -ENOMEM; 3017 ret = -ENOMEM;
3018 goto out;
3019 }
2898 3020
2899 sctx->send_progress = pm->ino; 3021 dm = get_waiting_dir_move(sctx, pm->ino);
2900 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); 3022 ASSERT(dm);
3023 rmdir_ino = dm->rmdir_ino;
3024 free_waiting_dir_move(sctx, dm);
3025
3026 ret = get_first_ref(sctx->parent_root, pm->ino,
3027 &parent_ino, &parent_gen, name);
2901 if (ret < 0) 3028 if (ret < 0)
2902 goto out; 3029 goto out;
2903 3030
3031 if (parent_ino == sctx->cur_ino) {
3032 /* child only renamed, not moved */
3033 ASSERT(parent_gen == sctx->cur_inode_gen);
3034 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3035 from_path);
3036 if (ret < 0)
3037 goto out;
3038 ret = fs_path_add_path(from_path, name);
3039 if (ret < 0)
3040 goto out;
3041 } else {
3042 /* child moved and maybe renamed too */
3043 sctx->send_progress = pm->ino;
3044 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
3045 if (ret < 0)
3046 goto out;
3047 }
3048
3049 fs_path_free(name);
3050 name = NULL;
3051
2904 to_path = fs_path_alloc(); 3052 to_path = fs_path_alloc();
2905 if (!to_path) { 3053 if (!to_path) {
2906 ret = -ENOMEM; 3054 ret = -ENOMEM;
@@ -2908,9 +3056,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2908 } 3056 }
2909 3057
2910 sctx->send_progress = sctx->cur_ino + 1; 3058 sctx->send_progress = sctx->cur_ino + 1;
2911 ret = del_waiting_dir_move(sctx, pm->ino);
2912 ASSERT(ret == 0);
2913
2914 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3059 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
2915 if (ret < 0) 3060 if (ret < 0)
2916 goto out; 3061 goto out;
@@ -2919,6 +3064,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2919 if (ret < 0) 3064 if (ret < 0)
2920 goto out; 3065 goto out;
2921 3066
3067 if (rmdir_ino) {
3068 struct orphan_dir_info *odi;
3069
3070 odi = get_orphan_dir_info(sctx, rmdir_ino);
3071 if (!odi) {
3072 /* already deleted */
3073 goto finish;
3074 }
3075 ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
3076 if (ret < 0)
3077 goto out;
3078 if (!ret)
3079 goto finish;
3080
3081 name = fs_path_alloc();
3082 if (!name) {
3083 ret = -ENOMEM;
3084 goto out;
3085 }
3086 ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
3087 if (ret < 0)
3088 goto out;
3089 ret = send_rmdir(sctx, name);
3090 if (ret < 0)
3091 goto out;
3092 free_orphan_dir_info(sctx, odi);
3093 }
3094
3095finish:
2922 ret = send_utimes(sctx, pm->ino, pm->gen); 3096 ret = send_utimes(sctx, pm->ino, pm->gen);
2923 if (ret < 0) 3097 if (ret < 0)
2924 goto out; 3098 goto out;
@@ -2928,12 +3102,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2928 * and old parent(s). 3102 * and old parent(s).
2929 */ 3103 */
2930 list_for_each_entry(cur, &pm->update_refs, list) { 3104 list_for_each_entry(cur, &pm->update_refs, list) {
3105 if (cur->dir == rmdir_ino)
3106 continue;
2931 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3107 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
2932 if (ret < 0) 3108 if (ret < 0)
2933 goto out; 3109 goto out;
2934 } 3110 }
2935 3111
2936out: 3112out:
3113 fs_path_free(name);
2937 fs_path_free(from_path); 3114 fs_path_free(from_path);
2938 fs_path_free(to_path); 3115 fs_path_free(to_path);
2939 sctx->send_progress = orig_progress; 3116 sctx->send_progress = orig_progress;
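Putting the apply_dir_move() changes together: the function now consumes the waiting_dir_move entry up front (taking its rmdir_ino), computes from_path either as parent path plus old name (rename only) or via get_cur_path() (a real move), and after the rename retries any rmdir that can_rmdir() had to defer. The deferred-rmdir tail, condensed (error handling trimmed):

	if (rmdir_ino) {
		struct orphan_dir_info *odi = get_orphan_dir_info(sctx, rmdir_ino);

		/* NULL here means the directory was already deleted. */
		if (odi && can_rmdir(sctx, rmdir_ino, odi->gen,
				     sctx->cur_ino + 1) > 0) {
			ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
			if (ret < 0)
				goto out;
			ret = send_rmdir(sctx, name);
			if (ret < 0)
				goto out;
			free_orphan_dir_info(sctx, odi);
		}
	}
	/* finally, utimes for the moved inode and its parent dirs */

Note the matching change in the update_refs loop right below: a parent that was just rmdir'd (cur->dir == rmdir_ino) is skipped, since sending utimes for a removed directory would fail.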
@@ -3005,17 +3182,19 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3005 int ret; 3182 int ret;
3006 u64 ino = parent_ref->dir; 3183 u64 ino = parent_ref->dir;
3007 u64 parent_ino_before, parent_ino_after; 3184 u64 parent_ino_before, parent_ino_after;
3008 u64 new_gen, old_gen; 3185 u64 old_gen;
3009 struct fs_path *path_before = NULL; 3186 struct fs_path *path_before = NULL;
3010 struct fs_path *path_after = NULL; 3187 struct fs_path *path_after = NULL;
3011 int len1, len2; 3188 int len1, len2;
3012 3189 int register_upper_dirs;
3013 if (parent_ref->dir <= sctx->cur_ino) 3190 u64 gen;
3014 return 0;
3015 3191
3016 if (is_waiting_for_move(sctx, ino)) 3192 if (is_waiting_for_move(sctx, ino))
3017 return 1; 3193 return 1;
3018 3194
3195 if (parent_ref->dir <= sctx->cur_ino)
3196 return 0;
3197
3019 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, 3198 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3020 NULL, NULL, NULL, NULL); 3199 NULL, NULL, NULL, NULL);
3021 if (ret == -ENOENT) 3200 if (ret == -ENOENT)
@@ -3023,12 +3202,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3023 else if (ret < 0) 3202 else if (ret < 0)
3024 return ret; 3203 return ret;
3025 3204
3026 ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, 3205 if (parent_ref->dir_gen != old_gen)
3027 NULL, NULL, NULL, NULL);
3028 if (ret < 0)
3029 return ret;
3030
3031 if (new_gen != old_gen)
3032 return 0; 3206 return 0;
3033 3207
3034 path_before = fs_path_alloc(); 3208 path_before = fs_path_alloc();
@@ -3051,7 +3225,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3051 } 3225 }
3052 3226
3053 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3227 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3054 NULL, path_after); 3228 &gen, path_after);
3055 if (ret == -ENOENT) { 3229 if (ret == -ENOENT) {
3056 ret = 0; 3230 ret = 0;
3057 goto out; 3231 goto out;
@@ -3061,13 +3235,67 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3061 3235
3062 len1 = fs_path_len(path_before); 3236 len1 = fs_path_len(path_before);
3063 len2 = fs_path_len(path_after); 3237 len2 = fs_path_len(path_after);
3064 if ((parent_ino_before != parent_ino_after) && (len1 != len2 || 3238 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3065 memcmp(path_before->start, path_after->start, len1))) { 3239 memcmp(path_before->start, path_after->start, len1)) {
3066 ret = 1; 3240 ret = 1;
3067 goto out; 3241 goto out;
3068 } 3242 }
3069 ret = 0; 3243 ret = 0;
3070 3244
3245 /*
3246 * Ok, our new most direct ancestor has a higher inode number but
3247 * wasn't moved/renamed. So maybe some of the new ancestors higher in
 3248 * the hierarchy have a higher inode number too *and* were renamed
3249 * or moved - in this case we need to wait for the ancestor's rename
3250 * or move operation before we can do the move/rename for the current
3251 * inode.
3252 */
3253 register_upper_dirs = 0;
3254 ino = parent_ino_after;
3255again:
3256 while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
3257 u64 parent_gen;
3258
3259 fs_path_reset(path_before);
3260 fs_path_reset(path_after);
3261
3262 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3263 &parent_gen, path_after);
3264 if (ret < 0)
3265 goto out;
3266 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3267 NULL, path_before);
3268 if (ret == -ENOENT) {
3269 ret = 0;
3270 break;
3271 } else if (ret < 0) {
3272 goto out;
3273 }
3274
3275 len1 = fs_path_len(path_before);
3276 len2 = fs_path_len(path_after);
3277 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3278 memcmp(path_before->start, path_after->start, len1)) {
3279 ret = 1;
3280 if (register_upper_dirs) {
3281 break;
3282 } else {
3283 register_upper_dirs = 1;
3284 ino = parent_ref->dir;
3285 gen = parent_ref->dir_gen;
3286 goto again;
3287 }
3288 } else if (register_upper_dirs) {
3289 ret = add_pending_dir_move(sctx, ino, gen,
3290 parent_ino_after);
3291 if (ret < 0 && ret != -EEXIST)
3292 goto out;
3293 }
3294
3295 ino = parent_ino_after;
3296 gen = parent_gen;
3297 }
3298
3071out: 3299out:
3072 fs_path_free(path_before); 3300 fs_path_free(path_before);
3073 fs_path_free(path_after); 3301 fs_path_free(path_after);
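The loop added here generalizes the old single-level check: starting at the inode's new parent, it climbs the new hierarchy comparing each ancestor's first ref between send_root and parent_root; a mismatch means some ancestor still has to be moved or renamed first. On the second pass (register_upper_dirs set), the climb restarts from parent_ref->dir and queues a pending dir move for every intermediate directory. Since the side-by-side rendering is dense, here is the added loop in condensed form (fs_path_reset() calls and -ENOENT handling trimmed):

	register_upper_dirs = 0;
	ino = parent_ino_after;
	again:
	while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
		u64 parent_gen;

		get_first_ref(sctx->send_root, ino, &parent_ino_after,
			      &parent_gen, path_after);
		get_first_ref(sctx->parent_root, ino, &parent_ino_before,
			      NULL, path_before);

		if (parent_ino_before != parent_ino_after ||
		    fs_path_len(path_before) != fs_path_len(path_after) ||
		    memcmp(path_before->start, path_after->start,
			   fs_path_len(path_before))) {
			ret = 1;		/* this ancestor moved/renamed */
			if (register_upper_dirs)
				break;
			register_upper_dirs = 1;	/* second pass */
			ino = parent_ref->dir;
			gen = parent_ref->dir_gen;
			goto again;
		} else if (register_upper_dirs) {
			add_pending_dir_move(sctx, ino, gen, parent_ino_after);
		}

		ino = parent_ino_after;		/* climb one level */
		gen = parent_gen;
	}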
@@ -3089,6 +3317,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3089 u64 ow_gen; 3317 u64 ow_gen;
3090 int did_overwrite = 0; 3318 int did_overwrite = 0;
3091 int is_orphan = 0; 3319 int is_orphan = 0;
3320 u64 last_dir_ino_rm = 0;
3092 3321
3093verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); 3322verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3094 3323
@@ -3227,9 +3456,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3227 * dirs, we always have one new and one deleted 3456 * dirs, we always have one new and one deleted
3228 * ref. The deleted ref is ignored later. 3457 * ref. The deleted ref is ignored later.
3229 */ 3458 */
3230 if (wait_for_parent_move(sctx, cur)) { 3459 ret = wait_for_parent_move(sctx, cur);
3460 if (ret < 0)
3461 goto out;
3462 if (ret) {
3231 ret = add_pending_dir_move(sctx, 3463 ret = add_pending_dir_move(sctx,
3232 cur->dir); 3464 sctx->cur_ino,
3465 sctx->cur_inode_gen,
3466 cur->dir);
3233 *pending_move = 1; 3467 *pending_move = 1;
3234 } else { 3468 } else {
3235 ret = send_rename(sctx, valid_path, 3469 ret = send_rename(sctx, valid_path,
@@ -3259,7 +3493,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3259 * later, we do this check again and rmdir it then if possible. 3493 * later, we do this check again and rmdir it then if possible.
3260 * See the use of check_dirs for more details. 3494 * See the use of check_dirs for more details.
3261 */ 3495 */
3262 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); 3496 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3497 sctx->cur_ino);
3263 if (ret < 0) 3498 if (ret < 0)
3264 goto out; 3499 goto out;
3265 if (ret) { 3500 if (ret) {
@@ -3350,8 +3585,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3350 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3585 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3351 if (ret < 0) 3586 if (ret < 0)
3352 goto out; 3587 goto out;
3353 } else if (ret == inode_state_did_delete) { 3588 } else if (ret == inode_state_did_delete &&
3354 ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); 3589 cur->dir != last_dir_ino_rm) {
3590 ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
3591 sctx->cur_ino);
3355 if (ret < 0) 3592 if (ret < 0)
3356 goto out; 3593 goto out;
3357 if (ret) { 3594 if (ret) {
@@ -3362,6 +3599,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3362 ret = send_rmdir(sctx, valid_path); 3599 ret = send_rmdir(sctx, valid_path);
3363 if (ret < 0) 3600 if (ret < 0)
3364 goto out; 3601 goto out;
3602 last_dir_ino_rm = cur->dir;
3365 } 3603 }
3366 } 3604 }
3367 } 3605 }
@@ -3375,9 +3613,8 @@ out:
3375 return ret; 3613 return ret;
3376} 3614}
3377 3615
3378static int __record_new_ref(int num, u64 dir, int index, 3616static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
3379 struct fs_path *name, 3617 struct fs_path *name, void *ctx, struct list_head *refs)
3380 void *ctx)
3381{ 3618{
3382 int ret = 0; 3619 int ret = 0;
3383 struct send_ctx *sctx = ctx; 3620 struct send_ctx *sctx = ctx;
@@ -3388,7 +3625,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3388 if (!p) 3625 if (!p)
3389 return -ENOMEM; 3626 return -ENOMEM;
3390 3627
3391 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, 3628 ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
3392 NULL, NULL); 3629 NULL, NULL);
3393 if (ret < 0) 3630 if (ret < 0)
3394 goto out; 3631 goto out;
@@ -3400,7 +3637,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3400 if (ret < 0) 3637 if (ret < 0)
3401 goto out; 3638 goto out;
3402 3639
3403 ret = record_ref(&sctx->new_refs, dir, gen, p); 3640 ret = __record_ref(refs, dir, gen, p);
3404 3641
3405out: 3642out:
3406 if (ret) 3643 if (ret)
@@ -3408,37 +3645,23 @@ out:
3408 return ret; 3645 return ret;
3409} 3646}
3410 3647
3648static int __record_new_ref(int num, u64 dir, int index,
3649 struct fs_path *name,
3650 void *ctx)
3651{
3652 struct send_ctx *sctx = ctx;
3653 return record_ref(sctx->send_root, num, dir, index, name,
3654 ctx, &sctx->new_refs);
3655}
3656
3657
3411static int __record_deleted_ref(int num, u64 dir, int index, 3658static int __record_deleted_ref(int num, u64 dir, int index,
3412 struct fs_path *name, 3659 struct fs_path *name,
3413 void *ctx) 3660 void *ctx)
3414{ 3661{
3415 int ret = 0;
3416 struct send_ctx *sctx = ctx; 3662 struct send_ctx *sctx = ctx;
3417 struct fs_path *p; 3663 return record_ref(sctx->parent_root, num, dir, index, name,
3418 u64 gen; 3664 ctx, &sctx->deleted_refs);
3419
3420 p = fs_path_alloc();
3421 if (!p)
3422 return -ENOMEM;
3423
3424 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3425 NULL, NULL);
3426 if (ret < 0)
3427 goto out;
3428
3429 ret = get_cur_path(sctx, dir, gen, p);
3430 if (ret < 0)
3431 goto out;
3432 ret = fs_path_add_path(p, name);
3433 if (ret < 0)
3434 goto out;
3435
3436 ret = record_ref(&sctx->deleted_refs, dir, gen, p);
3437
3438out:
3439 if (ret)
3440 fs_path_free(p);
3441 return ret;
3442} 3665}
3443 3666
3444static int record_new_ref(struct send_ctx *sctx) 3667static int record_new_ref(struct send_ctx *sctx)
@@ -3619,21 +3842,31 @@ static int process_all_refs(struct send_ctx *sctx,
3619 root = sctx->parent_root; 3842 root = sctx->parent_root;
3620 cb = __record_deleted_ref; 3843 cb = __record_deleted_ref;
3621 } else { 3844 } else {
3622 BUG(); 3845 btrfs_err(sctx->send_root->fs_info,
3846 "Wrong command %d in process_all_refs", cmd);
3847 ret = -EINVAL;
3848 goto out;
3623 } 3849 }
3624 3850
3625 key.objectid = sctx->cmp_key->objectid; 3851 key.objectid = sctx->cmp_key->objectid;
3626 key.type = BTRFS_INODE_REF_KEY; 3852 key.type = BTRFS_INODE_REF_KEY;
3627 key.offset = 0; 3853 key.offset = 0;
3628 while (1) { 3854 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3629 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 3855 if (ret < 0)
3630 if (ret < 0) 3856 goto out;
3631 goto out;
3632 if (ret)
3633 break;
3634 3857
3858 while (1) {
3635 eb = path->nodes[0]; 3859 eb = path->nodes[0];
3636 slot = path->slots[0]; 3860 slot = path->slots[0];
3861 if (slot >= btrfs_header_nritems(eb)) {
3862 ret = btrfs_next_leaf(root, path);
3863 if (ret < 0)
3864 goto out;
3865 else if (ret > 0)
3866 break;
3867 continue;
3868 }
3869
3637 btrfs_item_key_to_cpu(eb, &found_key, slot); 3870 btrfs_item_key_to_cpu(eb, &found_key, slot);
3638 3871
3639 if (found_key.objectid != key.objectid || 3872 if (found_key.objectid != key.objectid ||
@@ -3642,11 +3875,10 @@ static int process_all_refs(struct send_ctx *sctx,
3642 break; 3875 break;
3643 3876
3644 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 3877 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3645 btrfs_release_path(path);
3646 if (ret < 0) 3878 if (ret < 0)
3647 goto out; 3879 goto out;
3648 3880
3649 key.offset = found_key.offset + 1; 3881 path->slots[0]++;
3650 } 3882 }
3651 btrfs_release_path(path); 3883 btrfs_release_path(path);
3652 3884
@@ -3927,19 +4159,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3927 key.objectid = sctx->cmp_key->objectid; 4159 key.objectid = sctx->cmp_key->objectid;
3928 key.type = BTRFS_XATTR_ITEM_KEY; 4160 key.type = BTRFS_XATTR_ITEM_KEY;
3929 key.offset = 0; 4161 key.offset = 0;
3930 while (1) { 4162 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3931 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 4163 if (ret < 0)
3932 if (ret < 0) 4164 goto out;
3933 goto out;
3934 if (ret) {
3935 ret = 0;
3936 goto out;
3937 }
3938 4165
4166 while (1) {
3939 eb = path->nodes[0]; 4167 eb = path->nodes[0];
3940 slot = path->slots[0]; 4168 slot = path->slots[0];
3941 btrfs_item_key_to_cpu(eb, &found_key, slot); 4169 if (slot >= btrfs_header_nritems(eb)) {
4170 ret = btrfs_next_leaf(root, path);
4171 if (ret < 0) {
4172 goto out;
4173 } else if (ret > 0) {
4174 ret = 0;
4175 break;
4176 }
4177 continue;
4178 }
3942 4179
4180 btrfs_item_key_to_cpu(eb, &found_key, slot);
3943 if (found_key.objectid != key.objectid || 4181 if (found_key.objectid != key.objectid ||
3944 found_key.type != key.type) { 4182 found_key.type != key.type) {
3945 ret = 0; 4183 ret = 0;
@@ -3951,8 +4189,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3951 if (ret < 0) 4189 if (ret < 0)
3952 goto out; 4190 goto out;
3953 4191
3954 btrfs_release_path(path); 4192 path->slots[0]++;
3955 key.offset = found_key.offset + 1;
3956 } 4193 }
3957 4194
3958out: 4195out:
@@ -3991,6 +4228,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
3991 goto out; 4228 goto out;
3992 4229
3993 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; 4230 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
4231
4232 /* initial readahead */
4233 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
4234 file_ra_state_init(&sctx->ra, inode->i_mapping);
4235 btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
4236 last_index - index + 1);
4237
3994 while (index <= last_index) { 4238 while (index <= last_index) {
3995 unsigned cur_len = min_t(unsigned, len, 4239 unsigned cur_len = min_t(unsigned, len,
3996 PAGE_CACHE_SIZE - pg_offset); 4240 PAGE_CACHE_SIZE - pg_offset);
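fill_read_buf() now primes the page cache for the whole [index, last_index] range before the copy loop, instead of letting each page be faulted in one at a time. btrfs_force_ra() is, at this point in the tree, a thin wrapper over the generic readahead entry point, so the equivalent generic-API form is roughly (signature as of this kernel era):

	struct file_ra_state ra;

	/* Reset the per-range readahead state, then kick off one sync
	 * readahead covering every page we are about to copy out. */
	memset(&ra, 0, sizeof(ra));
	file_ra_state_init(&ra, inode->i_mapping);
	page_cache_sync_readahead(inode->i_mapping, &ra, NULL /* no file */,
				  index, last_index - index + 1);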
@@ -4763,18 +5007,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4763 ret = apply_children_dir_moves(sctx); 5007 ret = apply_children_dir_moves(sctx);
4764 if (ret) 5008 if (ret)
4765 goto out; 5009 goto out;
5010 /*
 5011 * Need to send this every time, whether or not it actually
 5012 * changed between the two trees, as we have made changes to
5013 * the inode before. If our inode is a directory and it's
5014 * waiting to be moved/renamed, we will send its utimes when
5015 * it's moved/renamed, therefore we don't need to do it here.
5016 */
5017 sctx->send_progress = sctx->cur_ino + 1;
5018 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
5019 if (ret < 0)
5020 goto out;
4766 } 5021 }
4767 5022
4768 /*
4769 * Need to send that every time, no matter if it actually
4770 * changed between the two trees as we have done changes to
4771 * the inode before.
4772 */
4773 sctx->send_progress = sctx->cur_ino + 1;
4774 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4775 if (ret < 0)
4776 goto out;
4777
4778out: 5023out:
4779 return ret; 5024 return ret;
4780} 5025}
@@ -4840,6 +5085,8 @@ static int changed_inode(struct send_ctx *sctx,
4840 sctx->left_path->nodes[0], left_ii); 5085 sctx->left_path->nodes[0], left_ii);
4841 sctx->cur_inode_mode = btrfs_inode_mode( 5086 sctx->cur_inode_mode = btrfs_inode_mode(
4842 sctx->left_path->nodes[0], left_ii); 5087 sctx->left_path->nodes[0], left_ii);
5088 sctx->cur_inode_rdev = btrfs_inode_rdev(
5089 sctx->left_path->nodes[0], left_ii);
4843 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 5090 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4844 ret = send_create_inode_if_needed(sctx); 5091 ret = send_create_inode_if_needed(sctx);
4845 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 5092 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4884,6 +5131,8 @@ static int changed_inode(struct send_ctx *sctx,
4884 sctx->left_path->nodes[0], left_ii); 5131 sctx->left_path->nodes[0], left_ii);
4885 sctx->cur_inode_mode = btrfs_inode_mode( 5132 sctx->cur_inode_mode = btrfs_inode_mode(
4886 sctx->left_path->nodes[0], left_ii); 5133 sctx->left_path->nodes[0], left_ii);
5134 sctx->cur_inode_rdev = btrfs_inode_rdev(
5135 sctx->left_path->nodes[0], left_ii);
4887 ret = send_create_inode_if_needed(sctx); 5136 ret = send_create_inode_if_needed(sctx);
4888 if (ret < 0) 5137 if (ret < 0)
4889 goto out; 5138 goto out;
@@ -5118,6 +5367,7 @@ out:
5118static int full_send_tree(struct send_ctx *sctx) 5367static int full_send_tree(struct send_ctx *sctx)
5119{ 5368{
5120 int ret; 5369 int ret;
5370 struct btrfs_trans_handle *trans = NULL;
5121 struct btrfs_root *send_root = sctx->send_root; 5371 struct btrfs_root *send_root = sctx->send_root;
5122 struct btrfs_key key; 5372 struct btrfs_key key;
5123 struct btrfs_key found_key; 5373 struct btrfs_key found_key;
@@ -5139,6 +5389,19 @@ static int full_send_tree(struct send_ctx *sctx)
5139 key.type = BTRFS_INODE_ITEM_KEY; 5389 key.type = BTRFS_INODE_ITEM_KEY;
5140 key.offset = 0; 5390 key.offset = 0;
5141 5391
5392join_trans:
5393 /*
5394 * We need to make sure the transaction does not get committed
5395 * while we do anything on commit roots. Join a transaction to prevent
5396 * this.
5397 */
5398 trans = btrfs_join_transaction(send_root);
5399 if (IS_ERR(trans)) {
5400 ret = PTR_ERR(trans);
5401 trans = NULL;
5402 goto out;
5403 }
5404
5142 /* 5405 /*
5143 * Make sure the tree has not changed after re-joining. We detect this 5406 * Make sure the tree has not changed after re-joining. We detect this
5144 * by comparing start_ctransid and ctransid. They should always match. 5407 * by comparing start_ctransid and ctransid. They should always match.
@@ -5162,6 +5425,19 @@ static int full_send_tree(struct send_ctx *sctx)
5162 goto out_finish; 5425 goto out_finish;
5163 5426
5164 while (1) { 5427 while (1) {
5428 /*
 5429 * When someone wants to commit while we iterate, end the
5430 * joined transaction and rejoin.
5431 */
5432 if (btrfs_should_end_transaction(trans, send_root)) {
5433 ret = btrfs_end_transaction(trans, send_root);
5434 trans = NULL;
5435 if (ret < 0)
5436 goto out;
5437 btrfs_release_path(path);
5438 goto join_trans;
5439 }
5440
5165 eb = path->nodes[0]; 5441 eb = path->nodes[0];
5166 slot = path->slots[0]; 5442 slot = path->slots[0];
5167 btrfs_item_key_to_cpu(eb, &found_key, slot); 5443 btrfs_item_key_to_cpu(eb, &found_key, slot);
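full_send_tree() previously walked commit roots with nothing pinning the transaction, so a concurrent commit could reclaim blocks under the iterator. The new code joins a transaction for the duration of the walk and, whenever a commit is wanted, ends the handle, drops the path and rejoins. The skeleton of that pattern, condensed from the hunks (error handling trimmed):

	join_trans:
		trans = btrfs_join_transaction(send_root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		while (1) {
			/* Let a pending commit proceed, then re-join. */
			if (btrfs_should_end_transaction(trans, send_root)) {
				ret = btrfs_end_transaction(trans, send_root);
				trans = NULL;
				if (ret < 0)
					goto out;
				btrfs_release_path(path);
				goto join_trans;
			}
			/* ... process one item, advance the path ... */
		}
	out:
		if (trans)
			btrfs_end_transaction(trans, send_root);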
@@ -5189,6 +5465,12 @@ out_finish:
5189 5465
5190out: 5466out:
5191 btrfs_free_path(path); 5467 btrfs_free_path(path);
5468 if (trans) {
5469 if (!ret)
5470 ret = btrfs_end_transaction(trans, send_root);
5471 else
5472 btrfs_end_transaction(trans, send_root);
5473 }
5192 return ret; 5474 return ret;
5193} 5475}
5194 5476
@@ -5340,6 +5622,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5340 5622
5341 sctx->pending_dir_moves = RB_ROOT; 5623 sctx->pending_dir_moves = RB_ROOT;
5342 sctx->waiting_dir_moves = RB_ROOT; 5624 sctx->waiting_dir_moves = RB_ROOT;
5625 sctx->orphan_dirs = RB_ROOT;
5343 5626
5344 sctx->clone_roots = vzalloc(sizeof(struct clone_root) * 5627 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
5345 (arg->clone_sources_count + 1)); 5628 (arg->clone_sources_count + 1));
@@ -5477,6 +5760,16 @@ out:
5477 kfree(dm); 5760 kfree(dm);
5478 } 5761 }
5479 5762
5763 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
5764 while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
5765 struct rb_node *n;
5766 struct orphan_dir_info *odi;
5767
5768 n = rb_first(&sctx->orphan_dirs);
5769 odi = rb_entry(n, struct orphan_dir_info, node);
5770 free_orphan_dir_info(sctx, odi);
5771 }
5772
5480 if (sort_clone_roots) { 5773 if (sort_clone_roots) {
5481 for (i = 0; i < sctx->clone_roots_cnt; i++) 5774 for (i = 0; i < sctx->clone_roots_cnt; i++)
5482 btrfs_root_dec_send_in_progress( 5775 btrfs_root_dec_send_in_progress(
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d04db817be5c..9dbf42395153 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1305,13 +1305,6 @@ error_fs_info:
1305 return ERR_PTR(error); 1305 return ERR_PTR(error);
1306} 1306}
1307 1307
1308static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
1309{
1310 spin_lock_irq(&workers->lock);
1311 workers->max_workers = new_limit;
1312 spin_unlock_irq(&workers->lock);
1313}
1314
1315static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, 1308static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1316 int new_pool_size, int old_pool_size) 1309 int new_pool_size, int old_pool_size)
1317{ 1310{
@@ -1323,21 +1316,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1323 btrfs_info(fs_info, "resize thread pool %d -> %d", 1316 btrfs_info(fs_info, "resize thread pool %d -> %d",
1324 old_pool_size, new_pool_size); 1317 old_pool_size, new_pool_size);
1325 1318
1326 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); 1319 btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
1327 btrfs_set_max_workers(&fs_info->workers, new_pool_size); 1320 btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
1328 btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); 1321 btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
1329 btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); 1322 btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
1330 btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); 1323 btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
1331 btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); 1324 btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
1332 btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); 1325 btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
1333 btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); 1326 new_pool_size);
1334 btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); 1327 btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
1335 btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); 1328 btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
1336 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1329 btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
1337 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1330 btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
1338 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1331 btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
1339 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, 1332 new_pool_size);
1340 new_pool_size);
1341} 1333}
1342 1334
1343static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) 1335static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
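btrfs_workqueue_set_max() comes from the new btrfs_workqueue layer (the async-thread.c rewrite visible in the diffstat), which backs the old btrfs_workers pools with kernel workqueues; resizing therefore forwards to the workqueue core instead of updating max_workers under a spinlock. A plausible shape of the helper, with the struct field names assumed for illustration only (the workqueue_set_max_active() call is the stock kernel API):

	/* Sketch only: the field names below are assumptions, not taken
	 * from this patch. */
	void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
	{
		wq->max_active = max;
		workqueue_set_max_active(wq->normal_wq, max);
	}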
@@ -1388,6 +1380,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1388 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1380 unsigned int old_metadata_ratio = fs_info->metadata_ratio;
1389 int ret; 1381 int ret;
1390 1382
1383 sync_filesystem(sb);
1391 btrfs_remount_prepare(fs_info); 1384 btrfs_remount_prepare(fs_info);
1392 1385
1393 ret = btrfs_parse_options(root, data); 1386 ret = btrfs_parse_options(root, data);
@@ -1479,6 +1472,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1479 sb->s_flags &= ~MS_RDONLY; 1472 sb->s_flags &= ~MS_RDONLY;
1480 } 1473 }
1481out: 1474out:
1475 wake_up_process(fs_info->transaction_kthread);
1482 btrfs_remount_cleanup(fs_info, old_opts); 1476 btrfs_remount_cleanup(fs_info, old_opts);
1483 return 0; 1477 return 0;
1484 1478
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 865f4cf9a769..c5eb2143dc66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -24,6 +24,7 @@
24#include <linux/kobject.h> 24#include <linux/kobject.h>
25#include <linux/bug.h> 25#include <linux/bug.h>
26#include <linux/genhd.h> 26#include <linux/genhd.h>
27#include <linux/debugfs.h>
27 28
28#include "ctree.h" 29#include "ctree.h"
29#include "disk-io.h" 30#include "disk-io.h"
@@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
599/* /sys/fs/btrfs/ entry */ 600/* /sys/fs/btrfs/ entry */
600static struct kset *btrfs_kset; 601static struct kset *btrfs_kset;
601 602
603/* /sys/kernel/debug/btrfs */
604static struct dentry *btrfs_debugfs_root_dentry;
605
606/* Debugging tunables and exported data */
607u64 btrfs_debugfs_test;
608
602int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 609int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
603{ 610{
604 int error; 611 int error;
@@ -642,27 +649,41 @@ failure:
642 return error; 649 return error;
643} 650}
644 651
652static int btrfs_init_debugfs(void)
653{
654#ifdef CONFIG_DEBUG_FS
655 btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
656 if (!btrfs_debugfs_root_dentry)
657 return -ENOMEM;
658
659 debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
660 &btrfs_debugfs_test);
661#endif
662 return 0;
663}
664
645int btrfs_init_sysfs(void) 665int btrfs_init_sysfs(void)
646{ 666{
647 int ret; 667 int ret;
668
648 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 669 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
649 if (!btrfs_kset) 670 if (!btrfs_kset)
650 return -ENOMEM; 671 return -ENOMEM;
651 672
652 init_feature_attrs(); 673 ret = btrfs_init_debugfs();
674 if (ret)
675 return ret;
653 676
677 init_feature_attrs();
654 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 678 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
655 if (ret) {
656 kset_unregister(btrfs_kset);
657 return ret;
658 }
659 679
660 return 0; 680 return ret;
661} 681}
662 682
663void btrfs_exit_sysfs(void) 683void btrfs_exit_sysfs(void)
664{ 684{
665 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 685 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
666 kset_unregister(btrfs_kset); 686 kset_unregister(btrfs_kset);
687 debugfs_remove_recursive(btrfs_debugfs_root_dentry);
667} 688}
668 689
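The debugfs additions above use only the stock <linux/debugfs.h> API: create a directory under /sys/kernel/debug, expose one u64 as a decimal file, and tear the whole tree down with a single recursive remove. The same pattern in a self-contained, hypothetical module, for reference:

	#include <linux/debugfs.h>
	#include <linux/module.h>

	static struct dentry *dbg_dir;
	static u64 dbg_test;

	static int __init dbg_init(void)
	{
		/* /sys/kernel/debug/example */
		dbg_dir = debugfs_create_dir("example", NULL);
		if (!dbg_dir)
			return -ENOMEM;
		/* read/write u64, exposed as a decimal file */
		debugfs_create_u64("test", 0644, dbg_dir, &dbg_test);
		return 0;
	}

	static void __exit dbg_exit(void)
	{
		debugfs_remove_recursive(dbg_dir);
	}

	module_init(dbg_init);
	module_exit(dbg_exit);
	MODULE_LICENSE("GPL");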
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f3cea3710d44..9ab576318a84 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,11 @@
1#ifndef _BTRFS_SYSFS_H_ 1#ifndef _BTRFS_SYSFS_H_
2#define _BTRFS_SYSFS_H_ 2#define _BTRFS_SYSFS_H_
3 3
4/*
5 * Data exported through sysfs
6 */
7extern u64 btrfs_debugfs_test;
8
4enum btrfs_feature_set { 9enum btrfs_feature_set {
5 FEAT_COMPAT, 10 FEAT_COMPAT,
6 FEAT_COMPAT_RO, 11 FEAT_COMPAT_RO,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 34cd83184c4a..a04707f740d6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -683,7 +683,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
683 int lock = (trans->type != TRANS_JOIN_NOLOCK); 683 int lock = (trans->type != TRANS_JOIN_NOLOCK);
684 int err = 0; 684 int err = 0;
685 685
686 if (--trans->use_count) { 686 if (trans->use_count > 1) {
687 trans->use_count--;
687 trans->block_rsv = trans->orig_rsv; 688 trans->block_rsv = trans->orig_rsv;
688 return 0; 689 return 0;
689 } 690 }
@@ -731,17 +732,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
731 } 732 }
732 733
733 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { 734 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
734 if (throttle) { 735 if (throttle)
735 /*
736 * We may race with somebody else here so end up having
737 * to call end_transaction on ourselves again, so inc
738 * our use_count.
739 */
740 trans->use_count++;
741 return btrfs_commit_transaction(trans, root); 736 return btrfs_commit_transaction(trans, root);
742 } else { 737 else
743 wake_up_process(info->transaction_kthread); 738 wake_up_process(info->transaction_kthread);
744 }
745 } 739 }
746 740
747 if (trans->type & __TRANS_FREEZABLE) 741 if (trans->type & __TRANS_FREEZABLE)
@@ -1578,10 +1572,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1578 1572
1579 trace_btrfs_transaction_commit(root); 1573 trace_btrfs_transaction_commit(root);
1580 1574
1581 btrfs_scrub_continue(root);
1582
1583 if (current->journal_info == trans) 1575 if (current->journal_info == trans)
1584 current->journal_info = NULL; 1576 current->journal_info = NULL;
1577 btrfs_scrub_cancel(root->fs_info);
1585 1578
1586 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1579 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1587} 1580}
@@ -1621,7 +1614,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1621static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1614static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1622{ 1615{
1623 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1616 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1624 return btrfs_start_delalloc_roots(fs_info, 1); 1617 return btrfs_start_delalloc_roots(fs_info, 1, -1);
1625 return 0; 1618 return 0;
1626} 1619}
1627 1620
@@ -1754,7 +1747,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1754 /* ->aborted might be set after the previous check, so check it */ 1747 /* ->aborted might be set after the previous check, so check it */
1755 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1748 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1756 ret = cur_trans->aborted; 1749 ret = cur_trans->aborted;
1757 goto cleanup_transaction; 1750 goto scrub_continue;
1758 } 1751 }
1759 /* 1752 /*
1760 * the reloc mutex makes sure that we stop 1753 * the reloc mutex makes sure that we stop
@@ -1771,7 +1764,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1771 ret = create_pending_snapshots(trans, root->fs_info); 1764 ret = create_pending_snapshots(trans, root->fs_info);
1772 if (ret) { 1765 if (ret) {
1773 mutex_unlock(&root->fs_info->reloc_mutex); 1766 mutex_unlock(&root->fs_info->reloc_mutex);
1774 goto cleanup_transaction; 1767 goto scrub_continue;
1775 } 1768 }
1776 1769
1777 /* 1770 /*
@@ -1787,13 +1780,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1787 ret = btrfs_run_delayed_items(trans, root); 1780 ret = btrfs_run_delayed_items(trans, root);
1788 if (ret) { 1781 if (ret) {
1789 mutex_unlock(&root->fs_info->reloc_mutex); 1782 mutex_unlock(&root->fs_info->reloc_mutex);
1790 goto cleanup_transaction; 1783 goto scrub_continue;
1791 } 1784 }
1792 1785
1793 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1786 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1794 if (ret) { 1787 if (ret) {
1795 mutex_unlock(&root->fs_info->reloc_mutex); 1788 mutex_unlock(&root->fs_info->reloc_mutex);
1796 goto cleanup_transaction; 1789 goto scrub_continue;
1797 } 1790 }
1798 1791
1799 /* 1792 /*
@@ -1823,7 +1816,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1823 if (ret) { 1816 if (ret) {
1824 mutex_unlock(&root->fs_info->tree_log_mutex); 1817 mutex_unlock(&root->fs_info->tree_log_mutex);
1825 mutex_unlock(&root->fs_info->reloc_mutex); 1818 mutex_unlock(&root->fs_info->reloc_mutex);
1826 goto cleanup_transaction; 1819 goto scrub_continue;
1827 } 1820 }
1828 1821
1829 /* 1822 /*
@@ -1844,7 +1837,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1844 if (ret) { 1837 if (ret) {
1845 mutex_unlock(&root->fs_info->tree_log_mutex); 1838 mutex_unlock(&root->fs_info->tree_log_mutex);
1846 mutex_unlock(&root->fs_info->reloc_mutex); 1839 mutex_unlock(&root->fs_info->reloc_mutex);
1847 goto cleanup_transaction; 1840 goto scrub_continue;
1848 } 1841 }
1849 1842
1850 /* 1843 /*
@@ -1855,7 +1848,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1855 ret = cur_trans->aborted; 1848 ret = cur_trans->aborted;
1856 mutex_unlock(&root->fs_info->tree_log_mutex); 1849 mutex_unlock(&root->fs_info->tree_log_mutex);
1857 mutex_unlock(&root->fs_info->reloc_mutex); 1850 mutex_unlock(&root->fs_info->reloc_mutex);
1858 goto cleanup_transaction; 1851 goto scrub_continue;
1859 } 1852 }
1860 1853
1861 btrfs_prepare_extent_commit(trans, root); 1854 btrfs_prepare_extent_commit(trans, root);
@@ -1891,13 +1884,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1891 btrfs_error(root->fs_info, ret, 1884 btrfs_error(root->fs_info, ret,
1892 "Error while writing out transaction"); 1885 "Error while writing out transaction");
1893 mutex_unlock(&root->fs_info->tree_log_mutex); 1886 mutex_unlock(&root->fs_info->tree_log_mutex);
1894 goto cleanup_transaction; 1887 goto scrub_continue;
1895 } 1888 }
1896 1889
1897 ret = write_ctree_super(trans, root, 0); 1890 ret = write_ctree_super(trans, root, 0);
1898 if (ret) { 1891 if (ret) {
1899 mutex_unlock(&root->fs_info->tree_log_mutex); 1892 mutex_unlock(&root->fs_info->tree_log_mutex);
1900 goto cleanup_transaction; 1893 goto scrub_continue;
1901 } 1894 }
1902 1895
1903 /* 1896 /*
@@ -1940,6 +1933,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1940 1933
1941 return ret; 1934 return ret;
1942 1935
1936scrub_continue:
1937 btrfs_scrub_continue(root);
1943cleanup_transaction: 1938cleanup_transaction:
1944 btrfs_trans_release_metadata(trans, root); 1939 btrfs_trans_release_metadata(trans, root);
1945 trans->block_rsv = NULL; 1940 trans->block_rsv = NULL;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e03..e2f45fc02610 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
136 * syncing the tree wait for us to finish 136 * syncing the tree wait for us to finish
137 */ 137 */
138static int start_log_trans(struct btrfs_trans_handle *trans, 138static int start_log_trans(struct btrfs_trans_handle *trans,
139 struct btrfs_root *root) 139 struct btrfs_root *root,
140 struct btrfs_log_ctx *ctx)
140{ 141{
142 int index;
141 int ret; 143 int ret;
142 int err = 0;
143 144
144 mutex_lock(&root->log_mutex); 145 mutex_lock(&root->log_mutex);
145 if (root->log_root) { 146 if (root->log_root) {
147 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
148 trans->transid) {
149 ret = -EAGAIN;
150 goto out;
151 }
152
146 if (!root->log_start_pid) { 153 if (!root->log_start_pid) {
147 root->log_start_pid = current->pid; 154 root->log_start_pid = current->pid;
148 root->log_multiple_pids = false; 155 root->log_multiple_pids = false;
@@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
152 159
153 atomic_inc(&root->log_batch); 160 atomic_inc(&root->log_batch);
154 atomic_inc(&root->log_writers); 161 atomic_inc(&root->log_writers);
162 if (ctx) {
163 index = root->log_transid % 2;
164 list_add_tail(&ctx->list, &root->log_ctxs[index]);
165 ctx->log_transid = root->log_transid;
166 }
155 mutex_unlock(&root->log_mutex); 167 mutex_unlock(&root->log_mutex);
156 return 0; 168 return 0;
157 } 169 }
158 root->log_multiple_pids = false; 170
159 root->log_start_pid = current->pid; 171 ret = 0;
160 mutex_lock(&root->fs_info->tree_log_mutex); 172 mutex_lock(&root->fs_info->tree_log_mutex);
161 if (!root->fs_info->log_root_tree) { 173 if (!root->fs_info->log_root_tree)
162 ret = btrfs_init_log_root_tree(trans, root->fs_info); 174 ret = btrfs_init_log_root_tree(trans, root->fs_info);
163 if (ret) 175 mutex_unlock(&root->fs_info->tree_log_mutex);
164 err = ret; 176 if (ret)
165 } 177 goto out;
166 if (err == 0 && !root->log_root) { 178
179 if (!root->log_root) {
167 ret = btrfs_add_log_tree(trans, root); 180 ret = btrfs_add_log_tree(trans, root);
168 if (ret) 181 if (ret)
169 err = ret; 182 goto out;
170 } 183 }
171 mutex_unlock(&root->fs_info->tree_log_mutex); 184 root->log_multiple_pids = false;
185 root->log_start_pid = current->pid;
172 atomic_inc(&root->log_batch); 186 atomic_inc(&root->log_batch);
173 atomic_inc(&root->log_writers); 187 atomic_inc(&root->log_writers);
188 if (ctx) {
189 index = root->log_transid % 2;
190 list_add_tail(&ctx->list, &root->log_ctxs[index]);
191 ctx->log_transid = root->log_transid;
192 }
193out:
174 mutex_unlock(&root->log_mutex); 194 mutex_unlock(&root->log_mutex);
175 return err; 195 return ret;
176} 196}
177 197
178/* 198/*
@@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
2359 return ret; 2379 return ret;
2360} 2380}
2361 2381
2362static int wait_log_commit(struct btrfs_trans_handle *trans, 2382static void wait_log_commit(struct btrfs_trans_handle *trans,
2363 struct btrfs_root *root, unsigned long transid) 2383 struct btrfs_root *root, int transid)
2364{ 2384{
2365 DEFINE_WAIT(wait); 2385 DEFINE_WAIT(wait);
2366 int index = transid % 2; 2386 int index = transid % 2;
@@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
2375 &wait, TASK_UNINTERRUPTIBLE); 2395 &wait, TASK_UNINTERRUPTIBLE);
2376 mutex_unlock(&root->log_mutex); 2396 mutex_unlock(&root->log_mutex);
2377 2397
2378 if (root->fs_info->last_trans_log_full_commit != 2398 if (root->log_transid_committed < transid &&
2379 trans->transid && root->log_transid < transid + 2 &&
2380 atomic_read(&root->log_commit[index])) 2399 atomic_read(&root->log_commit[index]))
2381 schedule(); 2400 schedule();
2382 2401
2383 finish_wait(&root->log_commit_wait[index], &wait); 2402 finish_wait(&root->log_commit_wait[index], &wait);
2384 mutex_lock(&root->log_mutex); 2403 mutex_lock(&root->log_mutex);
2385 } while (root->fs_info->last_trans_log_full_commit != 2404 } while (root->log_transid_committed < transid &&
2386 trans->transid && root->log_transid < transid + 2 &&
2387 atomic_read(&root->log_commit[index])); 2405 atomic_read(&root->log_commit[index]));
2388 return 0;
2389} 2406}
2390 2407
2391static void wait_for_writer(struct btrfs_trans_handle *trans, 2408static void wait_for_writer(struct btrfs_trans_handle *trans,
2392 struct btrfs_root *root) 2409 struct btrfs_root *root)
2393{ 2410{
2394 DEFINE_WAIT(wait); 2411 DEFINE_WAIT(wait);
2395 while (root->fs_info->last_trans_log_full_commit != 2412
2396 trans->transid && atomic_read(&root->log_writers)) { 2413 while (atomic_read(&root->log_writers)) {
2397 prepare_to_wait(&root->log_writer_wait, 2414 prepare_to_wait(&root->log_writer_wait,
2398 &wait, TASK_UNINTERRUPTIBLE); 2415 &wait, TASK_UNINTERRUPTIBLE);
2399 mutex_unlock(&root->log_mutex); 2416 mutex_unlock(&root->log_mutex);
2400 if (root->fs_info->last_trans_log_full_commit != 2417 if (atomic_read(&root->log_writers))
2401 trans->transid && atomic_read(&root->log_writers))
2402 schedule(); 2418 schedule();
2403 mutex_lock(&root->log_mutex); 2419 mutex_lock(&root->log_mutex);
2404 finish_wait(&root->log_writer_wait, &wait); 2420 finish_wait(&root->log_writer_wait, &wait);
2405 } 2421 }
2406} 2422}
2407 2423
2424static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2425 struct btrfs_log_ctx *ctx)
2426{
2427 if (!ctx)
2428 return;
2429
2430 mutex_lock(&root->log_mutex);
2431 list_del_init(&ctx->list);
2432 mutex_unlock(&root->log_mutex);
2433}
2434
2435/*
 2436 * Invoked in log mutex context, or when the caller is sure no other
 2437 * task can access the list.
2438 */
2439static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2440 int index, int error)
2441{
2442 struct btrfs_log_ctx *ctx;
2443
2444 if (!error) {
2445 INIT_LIST_HEAD(&root->log_ctxs[index]);
2446 return;
2447 }
2448
2449 list_for_each_entry(ctx, &root->log_ctxs[index], list)
2450 ctx->log_ret = error;
2451
2452 INIT_LIST_HEAD(&root->log_ctxs[index]);
2453}
2454
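The btrfs_log_ctx additions turn tree-log commits into a waiter/committer protocol: each fsync registers a ctx on root->log_ctxs[log_transid % 2] under log_mutex; whichever task performs the commit for that sub-transaction propagates its result to every registered ctx via btrfs_remove_all_log_ctxs(), and late arrivals that find log_transid_committed already past their transid simply return the stored per-ctx result. The waiter side, condensed from the hunks (fields as used above):

	/* Registration, under root->log_mutex: */
	struct btrfs_log_ctx ctx;

	btrfs_init_log_ctx(&ctx);	/* zero log_ret/log_transid, init list */
	index = root->log_transid % 2;
	list_add_tail(&ctx.list, &root->log_ctxs[index]);
	ctx.log_transid = root->log_transid;

	/* Later, in btrfs_sync_log(): someone may have committed our
	 * sub-transaction already; the per-ctx result is the answer. */
	if (root->log_transid_committed >= ctx.log_transid) {
		mutex_unlock(&root->log_mutex);
		return ctx.log_ret;
	}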
2408/* 2455/*
 2409 * btrfs_sync_log sends a given tree log down to the disk and 2456 * btrfs_sync_log sends a given tree log down to the disk and
2410 * updates the super blocks to record it. When this call is done, 2457 * updates the super blocks to record it. When this call is done,
@@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
2418 * that has happened. 2465 * that has happened.
2419 */ 2466 */
2420int btrfs_sync_log(struct btrfs_trans_handle *trans, 2467int btrfs_sync_log(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root) 2468 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2422{ 2469{
2423 int index1; 2470 int index1;
2424 int index2; 2471 int index2;
@@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2426 int ret; 2473 int ret;
2427 struct btrfs_root *log = root->log_root; 2474 struct btrfs_root *log = root->log_root;
2428 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2475 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2429 unsigned long log_transid = 0; 2476 int log_transid = 0;
2477 struct btrfs_log_ctx root_log_ctx;
2430 struct blk_plug plug; 2478 struct blk_plug plug;
2431 2479
2432 mutex_lock(&root->log_mutex); 2480 mutex_lock(&root->log_mutex);
2433 log_transid = root->log_transid; 2481 log_transid = ctx->log_transid;
2434 index1 = root->log_transid % 2; 2482 if (root->log_transid_committed >= log_transid) {
2483 mutex_unlock(&root->log_mutex);
2484 return ctx->log_ret;
2485 }
2486
2487 index1 = log_transid % 2;
2435 if (atomic_read(&root->log_commit[index1])) { 2488 if (atomic_read(&root->log_commit[index1])) {
2436 wait_log_commit(trans, root, root->log_transid); 2489 wait_log_commit(trans, root, log_transid);
2437 mutex_unlock(&root->log_mutex); 2490 mutex_unlock(&root->log_mutex);
2438 return 0; 2491 return ctx->log_ret;
2439 } 2492 }
2493 ASSERT(log_transid == root->log_transid);
2440 atomic_set(&root->log_commit[index1], 1); 2494 atomic_set(&root->log_commit[index1], 1);
2441 2495
2442 /* wait for previous tree log sync to complete */ 2496 /* wait for previous tree log sync to complete */
2443 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2497 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2444 wait_log_commit(trans, root, root->log_transid - 1); 2498 wait_log_commit(trans, root, log_transid - 1);
2499
2445 while (1) { 2500 while (1) {
2446 int batch = atomic_read(&root->log_batch); 2501 int batch = atomic_read(&root->log_batch);
2447 /* when we're on an ssd, just kick the log commit out */ 2502 /* when we're on an ssd, just kick the log commit out */
@@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2456 } 2511 }
2457 2512
2458 /* bail out if we need to do a full commit */ 2513 /* bail out if we need to do a full commit */
2459 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2514 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
2515 trans->transid) {
2460 ret = -EAGAIN; 2516 ret = -EAGAIN;
2461 btrfs_free_logged_extents(log, log_transid); 2517 btrfs_free_logged_extents(log, log_transid);
2462 mutex_unlock(&root->log_mutex); 2518 mutex_unlock(&root->log_mutex);
@@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2477 blk_finish_plug(&plug); 2533 blk_finish_plug(&plug);
2478 btrfs_abort_transaction(trans, root, ret); 2534 btrfs_abort_transaction(trans, root, ret);
2479 btrfs_free_logged_extents(log, log_transid); 2535 btrfs_free_logged_extents(log, log_transid);
2536 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2537 trans->transid;
2480 mutex_unlock(&root->log_mutex); 2538 mutex_unlock(&root->log_mutex);
2481 goto out; 2539 goto out;
2482 } 2540 }
@@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2486 root->log_transid++; 2544 root->log_transid++;
2487 log->log_transid = root->log_transid; 2545 log->log_transid = root->log_transid;
2488 root->log_start_pid = 0; 2546 root->log_start_pid = 0;
2489 smp_mb();
2490 /* 2547 /*
2491 * IO has been started, blocks of the log tree have WRITTEN flag set 2548 * IO has been started, blocks of the log tree have WRITTEN flag set
2492 * in their headers. new modifications of the log will be written to 2549 * in their headers. new modifications of the log will be written to
@@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2494 */ 2551 */
2495 mutex_unlock(&root->log_mutex); 2552 mutex_unlock(&root->log_mutex);
2496 2553
2554 btrfs_init_log_ctx(&root_log_ctx);
2555
2497 mutex_lock(&log_root_tree->log_mutex); 2556 mutex_lock(&log_root_tree->log_mutex);
2498 atomic_inc(&log_root_tree->log_batch); 2557 atomic_inc(&log_root_tree->log_batch);
2499 atomic_inc(&log_root_tree->log_writers); 2558 atomic_inc(&log_root_tree->log_writers);
2559
2560 index2 = log_root_tree->log_transid % 2;
2561 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2562 root_log_ctx.log_transid = log_root_tree->log_transid;
2563
2500 mutex_unlock(&log_root_tree->log_mutex); 2564 mutex_unlock(&log_root_tree->log_mutex);
2501 2565
2502 ret = update_log_root(trans, log); 2566 ret = update_log_root(trans, log);
@@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2509 } 2573 }
2510 2574
2511 if (ret) { 2575 if (ret) {
2576 if (!list_empty(&root_log_ctx.list))
2577 list_del_init(&root_log_ctx.list);
2578
2512 blk_finish_plug(&plug); 2579 blk_finish_plug(&plug);
2580 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2581 trans->transid;
2513 if (ret != -ENOSPC) { 2582 if (ret != -ENOSPC) {
2514 btrfs_abort_transaction(trans, root, ret); 2583 btrfs_abort_transaction(trans, root, ret);
2515 mutex_unlock(&log_root_tree->log_mutex); 2584 mutex_unlock(&log_root_tree->log_mutex);
2516 goto out; 2585 goto out;
2517 } 2586 }
2518 root->fs_info->last_trans_log_full_commit = trans->transid;
2519 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2587 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2520 btrfs_free_logged_extents(log, log_transid); 2588 btrfs_free_logged_extents(log, log_transid);
2521 mutex_unlock(&log_root_tree->log_mutex); 2589 mutex_unlock(&log_root_tree->log_mutex);
@@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2523 goto out; 2591 goto out;
2524 } 2592 }
2525 2593
2526 index2 = log_root_tree->log_transid % 2; 2594 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2595 mutex_unlock(&log_root_tree->log_mutex);
2596 ret = root_log_ctx.log_ret;
2597 goto out;
2598 }
2599
2600 index2 = root_log_ctx.log_transid % 2;
2527 if (atomic_read(&log_root_tree->log_commit[index2])) { 2601 if (atomic_read(&log_root_tree->log_commit[index2])) {
2528 blk_finish_plug(&plug); 2602 blk_finish_plug(&plug);
2529 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2603 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2530 wait_log_commit(trans, log_root_tree, 2604 wait_log_commit(trans, log_root_tree,
2531 log_root_tree->log_transid); 2605 root_log_ctx.log_transid);
2532 btrfs_free_logged_extents(log, log_transid); 2606 btrfs_free_logged_extents(log, log_transid);
2533 mutex_unlock(&log_root_tree->log_mutex); 2607 mutex_unlock(&log_root_tree->log_mutex);
2534 ret = 0; 2608 ret = root_log_ctx.log_ret;
2535 goto out; 2609 goto out;
2536 } 2610 }
2611 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2537 atomic_set(&log_root_tree->log_commit[index2], 1); 2612 atomic_set(&log_root_tree->log_commit[index2], 1);
2538 2613
2539 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2614 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2540 wait_log_commit(trans, log_root_tree, 2615 wait_log_commit(trans, log_root_tree,
2541 log_root_tree->log_transid - 1); 2616 root_log_ctx.log_transid - 1);
2542 } 2617 }
2543 2618
2544 wait_for_writer(trans, log_root_tree); 2619 wait_for_writer(trans, log_root_tree);
@@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2547 * now that we've moved on to the tree of log tree roots, 2622 * now that we've moved on to the tree of log tree roots,
2548 * check the full commit flag again 2623 * check the full commit flag again
2549 */ 2624 */
2550 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2625 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
2626 trans->transid) {
2551 blk_finish_plug(&plug); 2627 blk_finish_plug(&plug);
2552 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2628 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2553 btrfs_free_logged_extents(log, log_transid); 2629 btrfs_free_logged_extents(log, log_transid);
@@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2561 EXTENT_DIRTY | EXTENT_NEW); 2637 EXTENT_DIRTY | EXTENT_NEW);
2562 blk_finish_plug(&plug); 2638 blk_finish_plug(&plug);
2563 if (ret) { 2639 if (ret) {
2640 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2641 trans->transid;
2564 btrfs_abort_transaction(trans, root, ret); 2642 btrfs_abort_transaction(trans, root, ret);
2565 btrfs_free_logged_extents(log, log_transid); 2643 btrfs_free_logged_extents(log, log_transid);
2566 mutex_unlock(&log_root_tree->log_mutex); 2644 mutex_unlock(&log_root_tree->log_mutex);
@@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2578 btrfs_header_level(log_root_tree->node)); 2656 btrfs_header_level(log_root_tree->node));
2579 2657
2580 log_root_tree->log_transid++; 2658 log_root_tree->log_transid++;
2581 smp_mb();
2582
2583 mutex_unlock(&log_root_tree->log_mutex); 2659 mutex_unlock(&log_root_tree->log_mutex);
2584 2660
2585 /* 2661 /*
@@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2591 */ 2667 */
2592 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2668 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2593 if (ret) { 2669 if (ret) {
2670 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2671 trans->transid;
2594 btrfs_abort_transaction(trans, root, ret); 2672 btrfs_abort_transaction(trans, root, ret);
2595 goto out_wake_log_root; 2673 goto out_wake_log_root;
2596 } 2674 }
@@ -2601,13 +2679,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2601 mutex_unlock(&root->log_mutex); 2679 mutex_unlock(&root->log_mutex);
2602 2680
2603out_wake_log_root: 2681out_wake_log_root:
2682 /*
2683 * We needn't get log_mutex here because we are sure all
2684 * the other tasks are blocked.
2685 */
2686 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2687
2688 mutex_lock(&log_root_tree->log_mutex);
2689 log_root_tree->log_transid_committed++;
2604 atomic_set(&log_root_tree->log_commit[index2], 0); 2690 atomic_set(&log_root_tree->log_commit[index2], 0);
2605 smp_mb(); 2691 mutex_unlock(&log_root_tree->log_mutex);
2692
2606 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2693 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2607 wake_up(&log_root_tree->log_commit_wait[index2]); 2694 wake_up(&log_root_tree->log_commit_wait[index2]);
2608out: 2695out:
2696 /* See above. */
2697 btrfs_remove_all_log_ctxs(root, index1, ret);
2698
2699 mutex_lock(&root->log_mutex);
2700 root->log_transid_committed++;
2609 atomic_set(&root->log_commit[index1], 0); 2701 atomic_set(&root->log_commit[index1], 0);
2610 smp_mb(); 2702 mutex_unlock(&root->log_mutex);
2703
2611 if (waitqueue_active(&root->log_commit_wait[index1])) 2704 if (waitqueue_active(&root->log_commit_wait[index1]))
2612 wake_up(&root->log_commit_wait[index1]); 2705 wake_up(&root->log_commit_wait[index1]);
2613 return ret; 2706 return ret;
@@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3479 3572
3480static int log_one_extent(struct btrfs_trans_handle *trans, 3573static int log_one_extent(struct btrfs_trans_handle *trans,
3481 struct inode *inode, struct btrfs_root *root, 3574 struct inode *inode, struct btrfs_root *root,
3482 struct extent_map *em, struct btrfs_path *path) 3575 struct extent_map *em, struct btrfs_path *path,
3576 struct list_head *logged_list)
3483{ 3577{
3484 struct btrfs_root *log = root->log_root; 3578 struct btrfs_root *log = root->log_root;
3485 struct btrfs_file_extent_item *fi; 3579 struct btrfs_file_extent_item *fi;
@@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3495 u64 extent_offset = em->start - em->orig_start; 3589 u64 extent_offset = em->start - em->orig_start;
3496 u64 block_len; 3590 u64 block_len;
3497 int ret; 3591 int ret;
3498 int index = log->log_transid % 2;
3499 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3592 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3500 int extent_inserted = 0; 3593 int extent_inserted = 0;
3501 3594
@@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3579 * First check and see if our csums are on our outstanding ordered 3672 * First check and see if our csums are on our outstanding ordered
3580 * extents. 3673 * extents.
3581 */ 3674 */
3582again: 3675 list_for_each_entry(ordered, logged_list, log_list) {
3583 spin_lock_irq(&log->log_extents_lock[index]);
3584 list_for_each_entry(ordered, &log->logged_list[index], log_list) {
3585 struct btrfs_ordered_sum *sum; 3676 struct btrfs_ordered_sum *sum;
3586 3677
3587 if (!mod_len) 3678 if (!mod_len)
3588 break; 3679 break;
3589 3680
3590 if (ordered->inode != inode)
3591 continue;
3592
3593 if (ordered->file_offset + ordered->len <= mod_start || 3681 if (ordered->file_offset + ordered->len <= mod_start ||
3594 mod_start + mod_len <= ordered->file_offset) 3682 mod_start + mod_len <= ordered->file_offset)
3595 continue; 3683 continue;
@@ -3632,12 +3720,6 @@ again:
3632 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3720 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3633 &ordered->flags)) 3721 &ordered->flags))
3634 continue; 3722 continue;
3635 atomic_inc(&ordered->refs);
3636 spin_unlock_irq(&log->log_extents_lock[index]);
3637 /*
3638 * we've dropped the lock, we must either break or
3639 * start over after this.
3640 */
3641 3723
3642 if (ordered->csum_bytes_left) { 3724 if (ordered->csum_bytes_left) {
3643 btrfs_start_ordered_extent(inode, ordered, 0); 3725 btrfs_start_ordered_extent(inode, ordered, 0);
@@ -3647,16 +3729,11 @@ again:
3647 3729
3648 list_for_each_entry(sum, &ordered->list, list) { 3730 list_for_each_entry(sum, &ordered->list, list) {
3649 ret = btrfs_csum_file_blocks(trans, log, sum); 3731 ret = btrfs_csum_file_blocks(trans, log, sum);
3650 if (ret) { 3732 if (ret)
3651 btrfs_put_ordered_extent(ordered);
3652 goto unlocked; 3733 goto unlocked;
3653 }
3654 } 3734 }
3655 btrfs_put_ordered_extent(ordered);
3656 goto again;
3657 3735
3658 } 3736 }
3659 spin_unlock_irq(&log->log_extents_lock[index]);
3660unlocked: 3737unlocked:
3661 3738
3662 if (!mod_len || ret) 3739 if (!mod_len || ret)
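
The hunk above drops the log_extents_lock/ref-count/goto-again dance: log_one_extent() no longer walks the shared per-log logged_list[index] under a spinlock, it walks a logged_list that the caller collected and owns, so nothing can change underneath it. A minimal userspace sketch of that snapshot-then-walk pattern (names and pthread locking are illustrative, not kernel API):

        /*
         * Sketch: instead of iterating a shared list under a lock (with a
         * ref-count/retry dance once the lock is dropped), the caller moves
         * the entries onto a private list in one short critical section and
         * then walks that list lock-free.
         */
        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct node {
                int payload;
                struct node *next;
        };

        static struct node *shared_head;        /* shared, lock-protected */
        static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;

        /* take everything off the shared list in one critical section */
        static struct node *snapshot_shared(void)
        {
                struct node *head;

                pthread_mutex_lock(&shared_lock);
                head = shared_head;
                shared_head = NULL;
                pthread_mutex_unlock(&shared_lock);
                return head;
        }

        int main(void)
        {
                for (int i = 0; i < 4; i++) {
                        struct node *n = malloc(sizeof(*n));
                        n->payload = i;
                        n->next = shared_head;
                        shared_head = n;
                }

                /* private copy: safe to walk, sleep on, and free entries */
                for (struct node *n = snapshot_shared(); n; ) {
                        struct node *next = n->next;
                        printf("processing %d\n", n->payload);
                        free(n);
                        n = next;
                }
                return 0;
        }

The refactor buys the same thing as the sketch: once the ordered extents sit on a private list, the consumer can sleep (btrfs_start_ordered_extent() above) without juggling references or restarting the scan.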
@@ -3694,7 +3771,8 @@ unlocked:
3694static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3771static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3695 struct btrfs_root *root, 3772 struct btrfs_root *root,
3696 struct inode *inode, 3773 struct inode *inode,
3697 struct btrfs_path *path) 3774 struct btrfs_path *path,
3775 struct list_head *logged_list)
3698{ 3776{
3699 struct extent_map *em, *n; 3777 struct extent_map *em, *n;
3700 struct list_head extents; 3778 struct list_head extents;
@@ -3752,7 +3830,7 @@ process:
3752 3830
3753 write_unlock(&tree->lock); 3831 write_unlock(&tree->lock);
3754 3832
3755 ret = log_one_extent(trans, inode, root, em, path); 3833 ret = log_one_extent(trans, inode, root, em, path, logged_list);
3756 write_lock(&tree->lock); 3834 write_lock(&tree->lock);
3757 clear_em_logging(tree, em); 3835 clear_em_logging(tree, em);
3758 free_extent_map(em); 3836 free_extent_map(em);
@@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3788 struct btrfs_key max_key; 3866 struct btrfs_key max_key;
3789 struct btrfs_root *log = root->log_root; 3867 struct btrfs_root *log = root->log_root;
3790 struct extent_buffer *src = NULL; 3868 struct extent_buffer *src = NULL;
3869 LIST_HEAD(logged_list);
3791 u64 last_extent = 0; 3870 u64 last_extent = 0;
3792 int err = 0; 3871 int err = 0;
3793 int ret; 3872 int ret;
@@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3836 3915
3837 mutex_lock(&BTRFS_I(inode)->log_mutex); 3916 mutex_lock(&BTRFS_I(inode)->log_mutex);
3838 3917
3839 btrfs_get_logged_extents(log, inode); 3918 btrfs_get_logged_extents(inode, &logged_list);
3840 3919
3841 /* 3920 /*
3842 * a brute force approach to making sure we get the most uptodate 3921 * a brute force approach to making sure we get the most uptodate
@@ -3962,7 +4041,8 @@ log_extents:
3962 btrfs_release_path(path); 4041 btrfs_release_path(path);
3963 btrfs_release_path(dst_path); 4042 btrfs_release_path(dst_path);
3964 if (fast_search) { 4043 if (fast_search) {
3965 ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 4044 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4045 &logged_list);
3966 if (ret) { 4046 if (ret) {
3967 err = ret; 4047 err = ret;
3968 goto out_unlock; 4048 goto out_unlock;
@@ -3987,8 +4067,10 @@ log_extents:
3987 BTRFS_I(inode)->logged_trans = trans->transid; 4067 BTRFS_I(inode)->logged_trans = trans->transid;
3988 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4068 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3989out_unlock: 4069out_unlock:
3990 if (err) 4070 if (unlikely(err))
3991 btrfs_free_logged_extents(log, log->log_transid); 4071 btrfs_put_logged_extents(&logged_list);
4072 else
4073 btrfs_submit_logged_extents(&logged_list, log);
3992 mutex_unlock(&BTRFS_I(inode)->log_mutex); 4074 mutex_unlock(&BTRFS_I(inode)->log_mutex);
3993 4075
3994 btrfs_free_path(path); 4076 btrfs_free_path(path);
@@ -4079,7 +4161,8 @@ out:
4079 */ 4161 */
4080static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 4162static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4081 struct btrfs_root *root, struct inode *inode, 4163 struct btrfs_root *root, struct inode *inode,
4082 struct dentry *parent, int exists_only) 4164 struct dentry *parent, int exists_only,
4165 struct btrfs_log_ctx *ctx)
4083{ 4166{
4084 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 4167 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
4085 struct super_block *sb; 4168 struct super_block *sb;
@@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4116 goto end_no_trans; 4199 goto end_no_trans;
4117 } 4200 }
4118 4201
4119 ret = start_log_trans(trans, root); 4202 ret = start_log_trans(trans, root, ctx);
4120 if (ret) 4203 if (ret)
4121 goto end_trans; 4204 goto end_no_trans;
4122 4205
4123 ret = btrfs_log_inode(trans, root, inode, inode_only); 4206 ret = btrfs_log_inode(trans, root, inode, inode_only);
4124 if (ret) 4207 if (ret)
@@ -4166,6 +4249,9 @@ end_trans:
4166 root->fs_info->last_trans_log_full_commit = trans->transid; 4249 root->fs_info->last_trans_log_full_commit = trans->transid;
4167 ret = 1; 4250 ret = 1;
4168 } 4251 }
4252
4253 if (ret)
4254 btrfs_remove_log_ctx(root, ctx);
4169 btrfs_end_log_trans(root); 4255 btrfs_end_log_trans(root);
4170end_no_trans: 4256end_no_trans:
4171 return ret; 4257 return ret;
@@ -4178,12 +4264,14 @@ end_no_trans:
4178 * data on disk. 4264 * data on disk.
4179 */ 4265 */
4180int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 4266int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
4181 struct btrfs_root *root, struct dentry *dentry) 4267 struct btrfs_root *root, struct dentry *dentry,
4268 struct btrfs_log_ctx *ctx)
4182{ 4269{
4183 struct dentry *parent = dget_parent(dentry); 4270 struct dentry *parent = dget_parent(dentry);
4184 int ret; 4271 int ret;
4185 4272
4186 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); 4273 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
4274 0, ctx);
4187 dput(parent); 4275 dput(parent);
4188 4276
4189 return ret; 4277 return ret;
@@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4420 root->fs_info->last_trans_committed)) 4508 root->fs_info->last_trans_committed))
4421 return 0; 4509 return 0;
4422 4510
4423 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 4511 return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
4424} 4512}
4425 4513
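
A pattern that recurs through the tree-log.c hunks above: bare writes to fs_info->last_trans_log_full_commit (plus standalone smp_mb() calls) become ACCESS_ONCE() loads and stores. As a rough standalone model — this is a sketch of the idiom, not the kernel header — ACCESS_ONCE() of this era boils down to a volatile cast:

        /*
         * Sketch of the ACCESS_ONCE() idiom: the volatile cast forces the
         * compiler to emit exactly one untorn load or store, and keeps it
         * from caching the value in a register or fusing accesses.
         */
        #include <stdio.h>

        #define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

        static unsigned long last_full_commit;

        static void mark_full_commit(unsigned long transid)
        {
                /* single untorn store, visible to concurrent readers */
                ACCESS_ONCE(last_full_commit) = transid;
        }

        static int is_full_commit(unsigned long transid)
        {
                /* single untorn load */
                return ACCESS_ONCE(last_full_commit) == transid;
        }

        int main(void)
        {
                mark_full_commit(42);
                printf("full commit for 42? %d\n", is_full_commit(42));
                return 0;
        }

It keeps the compiler from tearing or caching the flag access; any stronger ordering still has to come from the surrounding log mutexes, which is presumably why the standalone smp_mb() calls could go.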
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a70..91b145fce333 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,28 @@
22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ 22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
23#define BTRFS_NO_LOG_SYNC 256 23#define BTRFS_NO_LOG_SYNC 256
24 24
25struct btrfs_log_ctx {
26 int log_ret;
27 int log_transid;
28 struct list_head list;
29};
30
31static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
32{
33 ctx->log_ret = 0;
34 ctx->log_transid = 0;
35 INIT_LIST_HEAD(&ctx->list);
36}
37
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 38int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 39 struct btrfs_root *root, struct btrfs_log_ctx *ctx);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 40int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 41int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info); 42 struct btrfs_fs_info *fs_info);
30int btrfs_recover_log_trees(struct btrfs_root *tree_root); 43int btrfs_recover_log_trees(struct btrfs_root *tree_root);
31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 44int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct dentry *dentry); 45 struct btrfs_root *root, struct dentry *dentry,
46 struct btrfs_log_ctx *ctx);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 47int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 48 struct btrfs_root *root,
35 const char *name, int name_len, 49 const char *name, int name_len,
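
The struct btrfs_log_ctx added above is the heart of this series: every fsync waiter queues a context on the root being committed, the one task that actually performs the commit stores its return value into every queued context, and latecomers whose transid has already committed just pick up the saved log_ret (the log_transid_committed checks in the tree-log.c hunks). A compressed userspace sketch of that protocol, with pthreads standing in for the kernel's mutex/waitqueue and all names illustrative:

        #include <pthread.h>
        #include <stdio.h>
        #include <unistd.h>

        struct log_ctx {
                int log_ret;
                int log_transid;
                struct log_ctx *next;
        };

        static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t log_done = PTHREAD_COND_INITIALIZER;
        static struct log_ctx *ctx_list;
        static int log_transid;
        static int log_transid_committed = -1;

        /* waiter: queue a ctx on the current transid, sleep until it commits */
        static int sync_log(void)
        {
                struct log_ctx ctx = { .log_ret = 0 };

                pthread_mutex_lock(&log_mutex);
                ctx.log_transid = log_transid;
                ctx.next = ctx_list;
                ctx_list = &ctx;
                while (log_transid_committed < ctx.log_transid)
                        pthread_cond_wait(&log_done, &log_mutex);
                pthread_mutex_unlock(&log_mutex);
                return ctx.log_ret;     /* result stored by the committer */
        }

        /* committer: do the work once, fan the result out to every ctx */
        static void commit_log(int result)
        {
                pthread_mutex_lock(&log_mutex);
                for (struct log_ctx *c = ctx_list; c; c = c->next)
                        c->log_ret = result;
                ctx_list = NULL;
                log_transid_committed = log_transid++;
                pthread_cond_broadcast(&log_done);
                pthread_mutex_unlock(&log_mutex);
        }

        static void *waiter(void *unused)
        {
                (void)unused;
                printf("sync_log() -> %d\n", sync_log());
                return NULL;
        }

        int main(void)
        {
                pthread_t t;

                pthread_create(&t, NULL, waiter, NULL);
                usleep(100 * 1000);     /* crude: let the waiter queue first */
                commit_log(0);
                pthread_join(t, NULL);
                return 0;
        }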
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bab0b84d8f80..d241130a32fd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,7 +415,8 @@ loop_lock:
415 device->running_pending = 1; 415 device->running_pending = 1;
416 416
417 spin_unlock(&device->io_lock); 417 spin_unlock(&device->io_lock);
418 btrfs_requeue_work(&device->work); 418 btrfs_queue_work(fs_info->submit_workers,
419 &device->work);
419 goto done; 420 goto done;
420 } 421 }
421 /* unplug every 64 requests just for good measure */ 422 /* unplug every 64 requests just for good measure */
@@ -5263,6 +5264,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5263static void btrfs_end_bio(struct bio *bio, int err) 5264static void btrfs_end_bio(struct bio *bio, int err)
5264{ 5265{
5265 struct btrfs_bio *bbio = bio->bi_private; 5266 struct btrfs_bio *bbio = bio->bi_private;
5267 struct btrfs_device *dev = bbio->stripes[0].dev;
5266 int is_orig_bio = 0; 5268 int is_orig_bio = 0;
5267 5269
5268 if (err) { 5270 if (err) {
@@ -5270,7 +5272,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
5270 if (err == -EIO || err == -EREMOTEIO) { 5272 if (err == -EIO || err == -EREMOTEIO) {
5271 unsigned int stripe_index = 5273 unsigned int stripe_index =
5272 btrfs_io_bio(bio)->stripe_index; 5274 btrfs_io_bio(bio)->stripe_index;
5273 struct btrfs_device *dev;
5274 5275
5275 BUG_ON(stripe_index >= bbio->num_stripes); 5276 BUG_ON(stripe_index >= bbio->num_stripes);
5276 dev = bbio->stripes[stripe_index].dev; 5277 dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5293,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
5292 if (bio == bbio->orig_bio) 5293 if (bio == bbio->orig_bio)
5293 is_orig_bio = 1; 5294 is_orig_bio = 1;
5294 5295
5296 btrfs_bio_counter_dec(bbio->fs_info);
5297
5295 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5298 if (atomic_dec_and_test(&bbio->stripes_pending)) {
5296 if (!is_orig_bio) { 5299 if (!is_orig_bio) {
5297 bio_put(bio); 5300 bio_put(bio);
@@ -5328,13 +5331,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
5328 } 5331 }
5329} 5332}
5330 5333
5331struct async_sched {
5332 struct bio *bio;
5333 int rw;
5334 struct btrfs_fs_info *info;
5335 struct btrfs_work work;
5336};
5337
5338/* 5334/*
5339 * see run_scheduled_bios for a description of why bios are collected for 5335 * see run_scheduled_bios for a description of why bios are collected for
5340 * async submit. 5336 * async submit.
@@ -5391,8 +5387,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
5391 spin_unlock(&device->io_lock); 5387 spin_unlock(&device->io_lock);
5392 5388
5393 if (should_queue) 5389 if (should_queue)
5394 btrfs_queue_worker(&root->fs_info->submit_workers, 5390 btrfs_queue_work(root->fs_info->submit_workers,
5395 &device->work); 5391 &device->work);
5396} 5392}
5397 5393
5398static int bio_size_ok(struct block_device *bdev, struct bio *bio, 5394static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5447,6 +5443,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5447 } 5443 }
5448#endif 5444#endif
5449 bio->bi_bdev = dev->bdev; 5445 bio->bi_bdev = dev->bdev;
5446
5447 btrfs_bio_counter_inc_noblocked(root->fs_info);
5448
5450 if (async) 5449 if (async)
5451 btrfs_schedule_bio(root, dev, rw, bio); 5450 btrfs_schedule_bio(root, dev, rw, bio);
5452 else 5451 else
@@ -5515,28 +5514,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5515 length = bio->bi_iter.bi_size; 5514 length = bio->bi_iter.bi_size;
5516 map_length = length; 5515 map_length = length;
5517 5516
5517 btrfs_bio_counter_inc_blocked(root->fs_info);
5518 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5518 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5519 mirror_num, &raid_map); 5519 mirror_num, &raid_map);
5520 if (ret) /* -ENOMEM */ 5520 if (ret) {
5521 btrfs_bio_counter_dec(root->fs_info);
5521 return ret; 5522 return ret;
5523 }
5522 5524
5523 total_devs = bbio->num_stripes; 5525 total_devs = bbio->num_stripes;
5524 bbio->orig_bio = first_bio; 5526 bbio->orig_bio = first_bio;
5525 bbio->private = first_bio->bi_private; 5527 bbio->private = first_bio->bi_private;
5526 bbio->end_io = first_bio->bi_end_io; 5528 bbio->end_io = first_bio->bi_end_io;
5529 bbio->fs_info = root->fs_info;
5527 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5530 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5528 5531
5529 if (raid_map) { 5532 if (raid_map) {
5530 /* In this case, map_length has been set to the length of 5533 /* In this case, map_length has been set to the length of
5531 a single stripe; not the whole write */ 5534 a single stripe; not the whole write */
5532 if (rw & WRITE) { 5535 if (rw & WRITE) {
5533 return raid56_parity_write(root, bio, bbio, 5536 ret = raid56_parity_write(root, bio, bbio,
5534 raid_map, map_length); 5537 raid_map, map_length);
5535 } else { 5538 } else {
5536 return raid56_parity_recover(root, bio, bbio, 5539 ret = raid56_parity_recover(root, bio, bbio,
5537 raid_map, map_length, 5540 raid_map, map_length,
5538 mirror_num); 5541 mirror_num);
5539 } 5542 }
5543 /*
 5544 * FIXME, replace doesn't support raid56 yet, please fix
5545 * it in the future.
5546 */
5547 btrfs_bio_counter_dec(root->fs_info);
5548 return ret;
5540 } 5549 }
5541 5550
5542 if (map_length < length) { 5551 if (map_length < length) {
@@ -5578,6 +5587,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5578 async_submit); 5587 async_submit);
5579 dev_nr++; 5588 dev_nr++;
5580 } 5589 }
5590 btrfs_bio_counter_dec(root->fs_info);
5581 return 0; 5591 return 0;
5582} 5592}
5583 5593
@@ -5666,7 +5676,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5666 else 5676 else
5667 generate_random_uuid(dev->uuid); 5677 generate_random_uuid(dev->uuid);
5668 5678
5669 dev->work.func = pending_bios_fn; 5679 btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
5670 5680
5671 return dev; 5681 return dev;
5672} 5682}
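
The btrfs_bio_counter_* calls threaded through btrfs_map_bio()/btrfs_end_bio() above implement a drain barrier: regular submitters bump a counter (blocking if a barrier is raised), completions drop it, and the device-replace code can raise the barrier and wait for the in-flight count to reach zero. The same shape in plain pthreads — the kernel version uses a percpu counter and waitqueues, so treat the names and types here as illustrative:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
        static long in_flight;
        static int barrier_up;

        /* submit path: may sleep while the barrier is raised */
        static void counter_inc_blocked(void)
        {
                pthread_mutex_lock(&lock);
                while (barrier_up)
                        pthread_cond_wait(&cond, &lock);
                in_flight++;
                pthread_mutex_unlock(&lock);
        }

        /* per-stripe clones: the submission is already accounted, never block */
        static void counter_inc_noblocked(void)
        {
                pthread_mutex_lock(&lock);
                in_flight++;
                pthread_mutex_unlock(&lock);
        }

        /* completion path: wake anyone draining the counter */
        static void counter_dec(void)
        {
                pthread_mutex_lock(&lock);
                if (--in_flight == 0)
                        pthread_cond_broadcast(&cond);
                pthread_mutex_unlock(&lock);
        }

        /* replace/quiesce side: stop new entries, wait for the rest */
        static void barrier_and_drain(void)
        {
                pthread_mutex_lock(&lock);
                barrier_up = 1;
                while (in_flight > 0)
                        pthread_cond_wait(&cond, &lock);
                pthread_mutex_unlock(&lock);
        }

        int main(void)
        {
                counter_inc_blocked();          /* whole-bio accounting */
                counter_inc_noblocked();        /* per-stripe clone */
                counter_dec();
                counter_dec();
                barrier_and_drain();    /* returns at once: nothing in flight */
                printf("drained, in_flight=%ld\n", in_flight);
                return 0;
        }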
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b373..80754f9dd3df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
192 192
193struct btrfs_bio { 193struct btrfs_bio {
194 atomic_t stripes_pending; 194 atomic_t stripes_pending;
195 struct btrfs_fs_info *fs_info;
195 bio_end_io_t *end_io; 196 bio_end_io_t *end_io;
196 struct bio *orig_bio; 197 struct bio *orig_bio;
197 void *private; 198 void *private;
diff --git a/fs/buffer.c b/fs/buffer.c
index 27265a8b43c1..8c53a2b15ecb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3088,7 +3088,7 @@ EXPORT_SYMBOL(submit_bh);
3088 * until the buffer gets unlocked). 3088 * until the buffer gets unlocked).
3089 * 3089 *
3090 * ll_rw_block sets b_end_io to simple completion handler that marks 3090 * ll_rw_block sets b_end_io to simple completion handler that marks
3091 * the buffer up-to-date (if approriate), unlocks the buffer and wakes 3091 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3092 * any waiters. 3092 * any waiters.
3093 * 3093 *
3094 * All of the buffers must be for the same device, and must also be a 3094 * All of the buffers must be for the same device, and must also be a
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ca65f39dc8dc..6494d9f673aa 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -391,12 +391,12 @@ try_again:
391 path.dentry = dir; 391 path.dentry = dir;
392 path_to_graveyard.mnt = cache->mnt; 392 path_to_graveyard.mnt = cache->mnt;
393 path_to_graveyard.dentry = cache->graveyard; 393 path_to_graveyard.dentry = cache->graveyard;
394 ret = security_path_rename(&path, rep, &path_to_graveyard, grave); 394 ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0);
395 if (ret < 0) { 395 if (ret < 0) {
396 cachefiles_io_error(cache, "Rename security error %d", ret); 396 cachefiles_io_error(cache, "Rename security error %d", ret);
397 } else { 397 } else {
398 ret = vfs_rename(dir->d_inode, rep, 398 ret = vfs_rename(dir->d_inode, rep,
399 cache->graveyard->d_inode, grave, NULL); 399 cache->graveyard->d_inode, grave, NULL, 0);
400 if (ret != 0 && ret != -ENOMEM) 400 if (ret != 0 && ret != -ENOMEM)
401 cachefiles_io_error(cache, 401 cachefiles_io_error(cache,
402 "Rename failed with error %d", ret); 402 "Rename failed with error %d", ret);
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index ebaff368120d..4b1fb5ca65b8 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -265,24 +265,22 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
265 goto nomem_monitor; 265 goto nomem_monitor;
266 } 266 }
267 267
268 ret = add_to_page_cache(newpage, bmapping, 268 ret = add_to_page_cache_lru(newpage, bmapping,
269 netpage->index, cachefiles_gfp); 269 netpage->index, cachefiles_gfp);
270 if (ret == 0) 270 if (ret == 0)
271 goto installed_new_backing_page; 271 goto installed_new_backing_page;
272 if (ret != -EEXIST) 272 if (ret != -EEXIST)
273 goto nomem_page; 273 goto nomem_page;
274 } 274 }
275 275
276 /* we've installed a new backing page, so now we need to add it 276 /* we've installed a new backing page, so now we need to start
277 * to the LRU list and start it reading */ 277 * it reading */
278installed_new_backing_page: 278installed_new_backing_page:
279 _debug("- new %p", newpage); 279 _debug("- new %p", newpage);
280 280
281 backpage = newpage; 281 backpage = newpage;
282 newpage = NULL; 282 newpage = NULL;
283 283
284 lru_cache_add_file(backpage);
285
286read_backing_page: 284read_backing_page:
287 ret = bmapping->a_ops->readpage(NULL, backpage); 285 ret = bmapping->a_ops->readpage(NULL, backpage);
288 if (ret < 0) 286 if (ret < 0)
@@ -510,24 +508,23 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
510 goto nomem; 508 goto nomem;
511 } 509 }
512 510
513 ret = add_to_page_cache(newpage, bmapping, 511 ret = add_to_page_cache_lru(newpage, bmapping,
514 netpage->index, cachefiles_gfp); 512 netpage->index,
513 cachefiles_gfp);
515 if (ret == 0) 514 if (ret == 0)
516 goto installed_new_backing_page; 515 goto installed_new_backing_page;
517 if (ret != -EEXIST) 516 if (ret != -EEXIST)
518 goto nomem; 517 goto nomem;
519 } 518 }
520 519
521 /* we've installed a new backing page, so now we need to add it 520 /* we've installed a new backing page, so now we need
522 * to the LRU list and start it reading */ 521 * to start it reading */
523 installed_new_backing_page: 522 installed_new_backing_page:
524 _debug("- new %p", newpage); 523 _debug("- new %p", newpage);
525 524
526 backpage = newpage; 525 backpage = newpage;
527 newpage = NULL; 526 newpage = NULL;
528 527
529 lru_cache_add_file(backpage);
530
531 reread_backing_page: 528 reread_backing_page:
532 ret = bmapping->a_ops->readpage(NULL, backpage); 529 ret = bmapping->a_ops->readpage(NULL, backpage);
533 if (ret < 0) 530 if (ret < 0)
@@ -538,8 +535,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
538 monitor_backing_page: 535 monitor_backing_page:
539 _debug("- monitor add"); 536 _debug("- monitor add");
540 537
541 ret = add_to_page_cache(netpage, op->mapping, netpage->index, 538 ret = add_to_page_cache_lru(netpage, op->mapping,
542 cachefiles_gfp); 539 netpage->index, cachefiles_gfp);
543 if (ret < 0) { 540 if (ret < 0) {
544 if (ret == -EEXIST) { 541 if (ret == -EEXIST) {
545 page_cache_release(netpage); 542 page_cache_release(netpage);
@@ -549,8 +546,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
549 goto nomem; 546 goto nomem;
550 } 547 }
551 548
552 lru_cache_add_file(netpage);
553
554 /* install a monitor */ 549 /* install a monitor */
555 page_cache_get(netpage); 550 page_cache_get(netpage);
556 monitor->netfs_page = netpage; 551 monitor->netfs_page = netpage;
@@ -613,8 +608,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
613 backing_page_already_uptodate: 608 backing_page_already_uptodate:
614 _debug("- uptodate"); 609 _debug("- uptodate");
615 610
616 ret = add_to_page_cache(netpage, op->mapping, netpage->index, 611 ret = add_to_page_cache_lru(netpage, op->mapping,
617 cachefiles_gfp); 612 netpage->index, cachefiles_gfp);
618 if (ret < 0) { 613 if (ret < 0) {
619 if (ret == -EEXIST) { 614 if (ret == -EEXIST) {
620 page_cache_release(netpage); 615 page_cache_release(netpage);
@@ -631,8 +626,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
631 626
632 fscache_mark_page_cached(op, netpage); 627 fscache_mark_page_cached(op, netpage);
633 628
634 lru_cache_add_file(netpage);
635
636 /* the netpage is unlocked and marked up to date here */ 629 /* the netpage is unlocked and marked up to date here */
637 fscache_end_io(op, netpage, 0); 630 fscache_end_io(op, netpage, 0);
638 page_cache_release(netpage); 631 page_cache_release(netpage);
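
All four cachefiles hunks above make the same substitution: the two-step add_to_page_cache() + lru_cache_add_file() sequence becomes a single add_to_page_cache_lru() call, so the LRU publication can no longer be forgotten or reordered against the index insert. Reduced to a toy userspace helper (the cache and LRU here are stand-ins, not the kernel's types):

        #include <stdio.h>

        struct page { int id; };

        static int index_insert(struct page *p)
        {
                printf("index <- page %d\n", p->id);
                return 0;
        }

        static void lru_publish(struct page *p)
        {
                printf("lru   <- page %d\n", p->id);
        }

        /* combined helper, analogous to add_to_page_cache_lru() */
        static int cache_insert_lru(struct page *p)
        {
                int ret = index_insert(p);

                if (ret == 0)
                        lru_publish(p); /* publish only after a successful insert */
                return ret;
        }

        int main(void)
        {
                struct page p = { .id = 1 };

                return cache_insert_lru(&p);
        }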
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 849f6132b327..2c70cbe35d39 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -286,7 +286,7 @@ cifs_destroy_inode(struct inode *inode)
286static void 286static void
287cifs_evict_inode(struct inode *inode) 287cifs_evict_inode(struct inode *inode)
288{ 288{
289 truncate_inode_pages(&inode->i_data, 0); 289 truncate_inode_pages_final(&inode->i_data);
290 clear_inode(inode); 290 clear_inode(inode);
291 cifs_fscache_release_inode_cookie(inode); 291 cifs_fscache_release_inode_cookie(inode);
292} 292}
@@ -541,6 +541,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
541 541
542static int cifs_remount(struct super_block *sb, int *flags, char *data) 542static int cifs_remount(struct super_block *sb, int *flags, char *data)
543{ 543{
544 sync_filesystem(sb);
544 *flags |= MS_NODIRATIME; 545 *flags |= MS_NODIRATIME;
545 return 0; 546 return 0;
546} 547}
@@ -1005,7 +1006,7 @@ cifs_init_once(void *inode)
1005 init_rwsem(&cifsi->lock_sem); 1006 init_rwsem(&cifsi->lock_sem);
1006} 1007}
1007 1008
1008static int 1009static int __init
1009cifs_init_inodecache(void) 1010cifs_init_inodecache(void)
1010{ 1011{
1011 cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", 1012 cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
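
The one-line sync_filesystem(sb) addition in cifs_remount() recurs across this series (coda, cramfs, debugfs, devpts and others below): flushing dirty data is being moved out of the generic remount path and into each filesystem's remount_fs method, which therefore must call it before touching its options. The resulting handler shape, sketched with stand-in types and an illustrative flag value:

        #include <stdio.h>

        #define EXAMPLE_MS_RDONLY 0x1   /* illustrative stand-in for MS_RDONLY */

        struct super_block { const char *id; };

        /* stand-in for the VFS helper the handlers now call themselves */
        static int sync_filesystem(struct super_block *sb)
        {
                printf("syncing %s before applying new options\n", sb->id);
                return 0;
        }

        static int example_remount(struct super_block *sb, int *flags, char *data)
        {
                sync_filesystem(sb);    /* first thing, before options change */
                *flags |= EXAMPLE_MS_RDONLY;
                (void)data;             /* a real handler parses options here */
                return 0;
        }

        int main(void)
        {
                struct super_block sb = { .id = "example" };
                int flags = 0;

                return example_remount(&sb, &flags, NULL);
        }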
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index b7143cf783ac..381c993b1427 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -10,7 +10,7 @@ extern int coda_hard;
10extern int coda_fake_statfs; 10extern int coda_fake_statfs;
11 11
12void coda_destroy_inodecache(void); 12void coda_destroy_inodecache(void);
13int coda_init_inodecache(void); 13int __init coda_init_inodecache(void);
14int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync); 14int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync);
15void coda_sysctl_init(void); 15void coda_sysctl_init(void);
16void coda_sysctl_clean(void); 16void coda_sysctl_clean(void);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 506de34a4ef3..d9c7751f10ac 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -73,7 +73,7 @@ static void init_once(void *foo)
73 inode_init_once(&ei->vfs_inode); 73 inode_init_once(&ei->vfs_inode);
74} 74}
75 75
76int coda_init_inodecache(void) 76int __init coda_init_inodecache(void)
77{ 77{
78 coda_inode_cachep = kmem_cache_create("coda_inode_cache", 78 coda_inode_cachep = kmem_cache_create("coda_inode_cache",
79 sizeof(struct coda_inode_info), 79 sizeof(struct coda_inode_info),
@@ -96,6 +96,7 @@ void coda_destroy_inodecache(void)
96 96
97static int coda_remount(struct super_block *sb, int *flags, char *data) 97static int coda_remount(struct super_block *sb, int *flags, char *data)
98{ 98{
99 sync_filesystem(sb);
99 *flags |= MS_NOATIME; 100 *flags |= MS_NOATIME;
100 return 0; 101 return 0;
101} 102}
@@ -250,7 +251,7 @@ static void coda_put_super(struct super_block *sb)
250 251
251static void coda_evict_inode(struct inode *inode) 252static void coda_evict_inode(struct inode *inode)
252{ 253{
253 truncate_inode_pages(&inode->i_data, 0); 254 truncate_inode_pages_final(&inode->i_data);
254 clear_inode(inode); 255 clear_inode(inode);
255 coda_cache_clear_inode(inode); 256 coda_cache_clear_inode(inode);
256} 257}
diff --git a/fs/compat.c b/fs/compat.c
index 6af20de2c1a3..ca926ad0430c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -72,8 +72,8 @@ int compat_printk(const char *fmt, ...)
72 * Not all architectures have sys_utime, so implement this in terms 72 * Not all architectures have sys_utime, so implement this in terms
73 * of sys_utimes. 73 * of sys_utimes.
74 */ 74 */
75asmlinkage long compat_sys_utime(const char __user *filename, 75COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
76 struct compat_utimbuf __user *t) 76 struct compat_utimbuf __user *, t)
77{ 77{
78 struct timespec tv[2]; 78 struct timespec tv[2];
79 79
@@ -87,13 +87,13 @@ asmlinkage long compat_sys_utime(const char __user *filename,
87 return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); 87 return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
88} 88}
89 89
90asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags) 90COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
91{ 91{
92 struct timespec tv[2]; 92 struct timespec tv[2];
93 93
94 if (t) { 94 if (t) {
95 if (get_compat_timespec(&tv[0], &t[0]) || 95 if (compat_get_timespec(&tv[0], &t[0]) ||
96 get_compat_timespec(&tv[1], &t[1])) 96 compat_get_timespec(&tv[1], &t[1]))
97 return -EFAULT; 97 return -EFAULT;
98 98
99 if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) 99 if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
@@ -102,7 +102,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filena
102 return do_utimes(dfd, filename, t ? tv : NULL, flags); 102 return do_utimes(dfd, filename, t ? tv : NULL, flags);
103} 103}
104 104
105asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t) 105COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
106{ 106{
107 struct timespec tv[2]; 107 struct timespec tv[2];
108 108
@@ -121,7 +121,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filena
121 return do_utimes(dfd, filename, t ? tv : NULL, 0); 121 return do_utimes(dfd, filename, t ? tv : NULL, 0);
122} 122}
123 123
124asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t) 124COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
125{ 125{
126 return compat_sys_futimesat(AT_FDCWD, filename, t); 126 return compat_sys_futimesat(AT_FDCWD, filename, t);
127} 127}
@@ -159,8 +159,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
159 return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; 159 return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
160} 160}
161 161
162asmlinkage long compat_sys_newstat(const char __user * filename, 162COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
163 struct compat_stat __user *statbuf) 163 struct compat_stat __user *, statbuf)
164{ 164{
165 struct kstat stat; 165 struct kstat stat;
166 int error; 166 int error;
@@ -171,8 +171,8 @@ asmlinkage long compat_sys_newstat(const char __user * filename,
171 return cp_compat_stat(&stat, statbuf); 171 return cp_compat_stat(&stat, statbuf);
172} 172}
173 173
174asmlinkage long compat_sys_newlstat(const char __user * filename, 174COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
175 struct compat_stat __user *statbuf) 175 struct compat_stat __user *, statbuf)
176{ 176{
177 struct kstat stat; 177 struct kstat stat;
178 int error; 178 int error;
@@ -184,9 +184,9 @@ asmlinkage long compat_sys_newlstat(const char __user * filename,
184} 184}
185 185
186#ifndef __ARCH_WANT_STAT64 186#ifndef __ARCH_WANT_STAT64
187asmlinkage long compat_sys_newfstatat(unsigned int dfd, 187COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
188 const char __user *filename, 188 const char __user *, filename,
189 struct compat_stat __user *statbuf, int flag) 189 struct compat_stat __user *, statbuf, int, flag)
190{ 190{
191 struct kstat stat; 191 struct kstat stat;
192 int error; 192 int error;
@@ -198,8 +198,8 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd,
198} 198}
199#endif 199#endif
200 200
201asmlinkage long compat_sys_newfstat(unsigned int fd, 201COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
202 struct compat_stat __user * statbuf) 202 struct compat_stat __user *, statbuf)
203{ 203{
204 struct kstat stat; 204 struct kstat stat;
205 int error = vfs_fstat(fd, &stat); 205 int error = vfs_fstat(fd, &stat);
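
Most of the churn in this file is mechanical: open-coded asmlinkage compat_sys_*() definitions become COMPAT_SYSCALL_DEFINEn(...) so that every 32-bit argument is explicitly narrowed from the 64-bit register value instead of trusting whatever sits in the upper bits. A deliberately toy model of what such a macro buys you — the real kernel macro also handles per-arch quirks this sketch ignores:

        #include <stdio.h>

        /*
         * Toy expansion: a register-width entry point that narrows each
         * argument to its declared type before calling the typed body.
         */
        #define TOY_COMPAT_SYSCALL_DEFINE2(name, t1, a1, t2, a2)        \
                static long __do_compat_sys_##name(t1 a1, t2 a2);       \
                long compat_sys_##name(long r1, long r2)                \
                {                                                       \
                        return __do_compat_sys_##name((t1)r1, (t2)r2);  \
                }                                                       \
                static long __do_compat_sys_##name(t1 a1, t2 a2)

        TOY_COMPAT_SYSCALL_DEFINE2(demo, unsigned int, fd, unsigned int, count)
        {
                printf("fd=%u count=%u\n", fd, count);
                return 0;
        }

        int main(void)
        {
                /* garbage in the high bits is discarded by the wrapper
                 * (assumes an LP64 host for the 64-bit literal) */
                return compat_sys_demo((long)0xdeadbeef00000003ULL, 42);
        }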
@@ -247,7 +247,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
247 * The following statfs calls are copies of code from fs/statfs.c and 247 * The following statfs calls are copies of code from fs/statfs.c and
248 * should be checked against those from time to time 248 * should be checked against those from time to time
249 */ 249 */
250asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 250COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
251{ 251{
252 struct kstatfs tmp; 252 struct kstatfs tmp;
253 int error = user_statfs(pathname, &tmp); 253 int error = user_statfs(pathname, &tmp);
@@ -256,7 +256,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
256 return error; 256 return error;
257} 257}
258 258
259asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) 259COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
260{ 260{
261 struct kstatfs tmp; 261 struct kstatfs tmp;
262 int error = fd_statfs(fd, &tmp); 262 int error = fd_statfs(fd, &tmp);
@@ -298,7 +298,7 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
298 return 0; 298 return 0;
299} 299}
300 300
301asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) 301COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
302{ 302{
303 struct kstatfs tmp; 303 struct kstatfs tmp;
304 int error; 304 int error;
@@ -312,7 +312,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
312 return error; 312 return error;
313} 313}
314 314
315asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) 315COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
316{ 316{
317 struct kstatfs tmp; 317 struct kstatfs tmp;
318 int error; 318 int error;
@@ -331,7 +331,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
 331 * Given how simple this syscall is that approach is more maintainable 331 * Given how simple this syscall is that approach is more maintainable
332 * than the various conversion hacks. 332 * than the various conversion hacks.
333 */ 333 */
334asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u) 334COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
335{ 335{
336 struct compat_ustat tmp; 336 struct compat_ustat tmp;
337 struct kstatfs sbuf; 337 struct kstatfs sbuf;
@@ -399,12 +399,28 @@ static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *u
399} 399}
400#endif 400#endif
401 401
402asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, 402static unsigned int
403 unsigned long arg) 403convert_fcntl_cmd(unsigned int cmd)
404{
405 switch (cmd) {
406 case F_GETLK64:
407 return F_GETLK;
408 case F_SETLK64:
409 return F_SETLK;
410 case F_SETLKW64:
411 return F_SETLKW;
412 }
413
414 return cmd;
415}
416
417COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
418 compat_ulong_t, arg)
404{ 419{
405 mm_segment_t old_fs; 420 mm_segment_t old_fs;
406 struct flock f; 421 struct flock f;
407 long ret; 422 long ret;
423 unsigned int conv_cmd;
408 424
409 switch (cmd) { 425 switch (cmd) {
410 case F_GETLK: 426 case F_GETLK:
@@ -441,16 +457,18 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
441 case F_GETLK64: 457 case F_GETLK64:
442 case F_SETLK64: 458 case F_SETLK64:
443 case F_SETLKW64: 459 case F_SETLKW64:
460 case F_GETLKP:
461 case F_SETLKP:
462 case F_SETLKPW:
444 ret = get_compat_flock64(&f, compat_ptr(arg)); 463 ret = get_compat_flock64(&f, compat_ptr(arg));
445 if (ret != 0) 464 if (ret != 0)
446 break; 465 break;
447 old_fs = get_fs(); 466 old_fs = get_fs();
448 set_fs(KERNEL_DS); 467 set_fs(KERNEL_DS);
449 ret = sys_fcntl(fd, (cmd == F_GETLK64) ? F_GETLK : 468 conv_cmd = convert_fcntl_cmd(cmd);
450 ((cmd == F_SETLK64) ? F_SETLK : F_SETLKW), 469 ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
451 (unsigned long)&f);
452 set_fs(old_fs); 470 set_fs(old_fs);
453 if (cmd == F_GETLK64 && ret == 0) { 471 if ((conv_cmd == F_GETLK || conv_cmd == F_GETLKP) && ret == 0) {
454 /* need to return lock information - see above for commentary */ 472 /* need to return lock information - see above for commentary */
455 if (f.l_start > COMPAT_LOFF_T_MAX) 473 if (f.l_start > COMPAT_LOFF_T_MAX)
456 ret = -EOVERFLOW; 474 ret = -EOVERFLOW;
@@ -468,16 +486,22 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
468 return ret; 486 return ret;
469} 487}
470 488
471asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, 489COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
472 unsigned long arg) 490 compat_ulong_t, arg)
473{ 491{
474 if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64)) 492 switch (cmd) {
493 case F_GETLK64:
494 case F_SETLK64:
495 case F_SETLKW64:
496 case F_GETLKP:
497 case F_SETLKP:
498 case F_SETLKPW:
475 return -EINVAL; 499 return -EINVAL;
500 }
476 return compat_sys_fcntl64(fd, cmd, arg); 501 return compat_sys_fcntl64(fd, cmd, arg);
477} 502}
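
The fcntl64 hunks above swap a nested conditional for a small convert_fcntl_cmd() helper and teach both entry points about the file-private lock commands (F_GETLKP/F_SETLKP/F_SETLKPW), which pass through unconverted. The helper, lifted into a standalone program with toy constant values (the real values come from the fcntl headers):

        #include <stdio.h>

        /* toy constants; do not mix with <fcntl.h> */
        enum { F_GETLK, F_SETLK, F_SETLKW, F_GETLK64, F_SETLK64, F_SETLKW64 };

        static unsigned int convert_fcntl_cmd(unsigned int cmd)
        {
                switch (cmd) {
                case F_GETLK64:
                        return F_GETLK;
                case F_SETLK64:
                        return F_SETLK;
                case F_SETLKW64:
                        return F_SETLKW;
                }
                /* anything else (including the *LKP commands) passes through */
                return cmd;
        }

        int main(void)
        {
                printf("F_SETLK64 -> %u (F_SETLK is %u)\n",
                       convert_fcntl_cmd(F_SETLK64), (unsigned int)F_SETLK);
                return 0;
        }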
478 503
479asmlinkage long 504COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p)
480compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
481{ 505{
482 long ret; 506 long ret;
483 aio_context_t ctx64; 507 aio_context_t ctx64;
@@ -496,32 +520,24 @@ compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
496 return ret; 520 return ret;
497} 521}
498 522
499asmlinkage long 523COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
500compat_sys_io_getevents(aio_context_t ctx_id, 524 compat_long_t, min_nr,
501 unsigned long min_nr, 525 compat_long_t, nr,
502 unsigned long nr, 526 struct io_event __user *, events,
503 struct io_event __user *events, 527 struct compat_timespec __user *, timeout)
504 struct compat_timespec __user *timeout)
505{ 528{
506 long ret;
507 struct timespec t; 529 struct timespec t;
508 struct timespec __user *ut = NULL; 530 struct timespec __user *ut = NULL;
509 531
510 ret = -EFAULT;
511 if (unlikely(!access_ok(VERIFY_WRITE, events,
512 nr * sizeof(struct io_event))))
513 goto out;
514 if (timeout) { 532 if (timeout) {
515 if (get_compat_timespec(&t, timeout)) 533 if (compat_get_timespec(&t, timeout))
516 goto out; 534 return -EFAULT;
517 535
518 ut = compat_alloc_user_space(sizeof(*ut)); 536 ut = compat_alloc_user_space(sizeof(*ut));
519 if (copy_to_user(ut, &t, sizeof(t)) ) 537 if (copy_to_user(ut, &t, sizeof(t)) )
520 goto out; 538 return -EFAULT;
521 } 539 }
522 ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut); 540 return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
523out:
524 return ret;
525} 541}
526 542
527/* A write operation does a read from user space and vice versa */ 543/* A write operation does a read from user space and vice versa */
@@ -617,8 +633,8 @@ copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
617 633
618#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) 634#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *))
619 635
620asmlinkage long 636COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
621compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb) 637 int, nr, u32 __user *, iocb)
622{ 638{
623 struct iocb __user * __user *iocb64; 639 struct iocb __user * __user *iocb64;
624 long ret; 640 long ret;
@@ -770,10 +786,10 @@ static int do_nfs4_super_data_conv(void *raw_data)
770#define NCPFS_NAME "ncpfs" 786#define NCPFS_NAME "ncpfs"
771#define NFS4_NAME "nfs4" 787#define NFS4_NAME "nfs4"
772 788
773asmlinkage long compat_sys_mount(const char __user * dev_name, 789COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
774 const char __user * dir_name, 790 const char __user *, dir_name,
775 const char __user * type, unsigned long flags, 791 const char __user *, type, compat_ulong_t, flags,
776 const void __user * data) 792 const void __user *, data)
777{ 793{
778 char *kernel_type; 794 char *kernel_type;
779 unsigned long data_page; 795 unsigned long data_page;
@@ -869,8 +885,8 @@ efault:
869 return -EFAULT; 885 return -EFAULT;
870} 886}
871 887
872asmlinkage long compat_sys_old_readdir(unsigned int fd, 888COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
873 struct compat_old_linux_dirent __user *dirent, unsigned int count) 889 struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
874{ 890{
875 int error; 891 int error;
876 struct fd f = fdget(fd); 892 struct fd f = fdget(fd);
@@ -948,8 +964,8 @@ efault:
948 return -EFAULT; 964 return -EFAULT;
949} 965}
950 966
951asmlinkage long compat_sys_getdents(unsigned int fd, 967COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
952 struct compat_linux_dirent __user *dirent, unsigned int count) 968 struct compat_linux_dirent __user *, dirent, unsigned int, count)
953{ 969{
954 struct fd f; 970 struct fd f;
955 struct compat_linux_dirent __user * lastdirent; 971 struct compat_linux_dirent __user * lastdirent;
@@ -981,7 +997,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
981 return error; 997 return error;
982} 998}
983 999
984#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 1000#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64
985 1001
986struct compat_getdents_callback64 { 1002struct compat_getdents_callback64 {
987 struct dir_context ctx; 1003 struct dir_context ctx;
@@ -1033,8 +1049,8 @@ efault:
1033 return -EFAULT; 1049 return -EFAULT;
1034} 1050}
1035 1051
1036asmlinkage long compat_sys_getdents64(unsigned int fd, 1052COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
1037 struct linux_dirent64 __user * dirent, unsigned int count) 1053 struct linux_dirent64 __user *, dirent, unsigned int, count)
1038{ 1054{
1039 struct fd f; 1055 struct fd f;
1040 struct linux_dirent64 __user * lastdirent; 1056 struct linux_dirent64 __user * lastdirent;
@@ -1066,7 +1082,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
1066 fdput(f); 1082 fdput(f);
1067 return error; 1083 return error;
1068} 1084}
1069#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ 1085#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */
1070 1086
1071/* 1087/*
1072 * Exactly like fs/open.c:sys_open(), except that it doesn't set the 1088 * Exactly like fs/open.c:sys_open(), except that it doesn't set the
@@ -1287,9 +1303,9 @@ out_nofds:
1287 return ret; 1303 return ret;
1288} 1304}
1289 1305
1290asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, 1306COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
1291 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1307 compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
1292 struct compat_timeval __user *tvp) 1308 struct compat_timeval __user *, tvp)
1293{ 1309{
1294 struct timespec end_time, *to = NULL; 1310 struct timespec end_time, *to = NULL;
1295 struct compat_timeval tv; 1311 struct compat_timeval tv;
@@ -1320,7 +1336,7 @@ struct compat_sel_arg_struct {
1320 compat_uptr_t tvp; 1336 compat_uptr_t tvp;
1321}; 1337};
1322 1338
1323asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg) 1339COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
1324{ 1340{
1325 struct compat_sel_arg_struct a; 1341 struct compat_sel_arg_struct a;
1326 1342
@@ -1381,9 +1397,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
1381 return ret; 1397 return ret;
1382} 1398}
1383 1399
1384asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, 1400COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
1385 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1401 compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
1386 struct compat_timespec __user *tsp, void __user *sig) 1402 struct compat_timespec __user *, tsp, void __user *, sig)
1387{ 1403{
1388 compat_size_t sigsetsize = 0; 1404 compat_size_t sigsetsize = 0;
1389 compat_uptr_t up = 0; 1405 compat_uptr_t up = 0;
@@ -1400,9 +1416,9 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
1400 sigsetsize); 1416 sigsetsize);
1401} 1417}
1402 1418
1403asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, 1419COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
1404 unsigned int nfds, struct compat_timespec __user *tsp, 1420 unsigned int, nfds, struct compat_timespec __user *, tsp,
1405 const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) 1421 const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
1406{ 1422{
1407 compat_sigset_t ss32; 1423 compat_sigset_t ss32;
1408 sigset_t ksigmask, sigsaved; 1424 sigset_t ksigmask, sigsaved;
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index a81147e2e4ef..4d24d17bcfc1 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -88,6 +88,11 @@ static void cputime_to_compat_timeval(const cputime_t cputime,
88#define ELF_HWCAP COMPAT_ELF_HWCAP 88#define ELF_HWCAP COMPAT_ELF_HWCAP
89#endif 89#endif
90 90
91#ifdef COMPAT_ELF_HWCAP2
92#undef ELF_HWCAP2
93#define ELF_HWCAP2 COMPAT_ELF_HWCAP2
94#endif
95
91#ifdef COMPAT_ARCH_DLINFO 96#ifdef COMPAT_ARCH_DLINFO
92#undef ARCH_DLINFO 97#undef ARCH_DLINFO
93#define ARCH_DLINFO COMPAT_ARCH_DLINFO 98#define ARCH_DLINFO COMPAT_ARCH_DLINFO
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 3881610b6438..e82289047272 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1538,9 +1538,10 @@ static int compat_ioctl_check_table(unsigned int xcmd)
1538 return ioctl_pointer[i] == xcmd; 1538 return ioctl_pointer[i] == xcmd;
1539} 1539}
1540 1540
1541asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, 1541COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
1542 unsigned long arg) 1542 compat_ulong_t, arg32)
1543{ 1543{
1544 unsigned long arg = arg32;
1544 struct fd f = fdget(fd); 1545 struct fd f = fdget(fd);
1545 int error = -EBADF; 1546 int error = -EBADF;
1546 if (!f.file) 1547 if (!f.file)
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 06610cf94d57..ddcfe590b8a8 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -195,8 +195,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
195 struct page *page = NULL; 195 struct page *page = NULL;
196 196
197 if (blocknr + i < devsize) { 197 if (blocknr + i < devsize) {
198 page = read_mapping_page_async(mapping, blocknr + i, 198 page = read_mapping_page(mapping, blocknr + i, NULL);
199 NULL);
200 /* synchronous error? */ 199 /* synchronous error? */
201 if (IS_ERR(page)) 200 if (IS_ERR(page))
202 page = NULL; 201 page = NULL;
@@ -244,6 +243,7 @@ static void cramfs_kill_sb(struct super_block *sb)
244 243
245static int cramfs_remount(struct super_block *sb, int *flags, char *data) 244static int cramfs_remount(struct super_block *sb, int *flags, char *data)
246{ 245{
246 sync_filesystem(sb);
247 *flags |= MS_RDONLY; 247 *flags |= MS_RDONLY;
248 return 0; 248 return 0;
249} 249}
diff --git a/fs/dcache.c b/fs/dcache.c
index ca02c13a84aa..66cba5a8a346 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2483,12 +2483,14 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
2483 dentry->d_name.name = dentry->d_iname; 2483 dentry->d_name.name = dentry->d_iname;
2484 } else { 2484 } else {
2485 /* 2485 /*
2486 * Both are internal. Just copy target to dentry 2486 * Both are internal.
2487 */ 2487 */
2488 memcpy(dentry->d_iname, target->d_name.name, 2488 unsigned int i;
2489 target->d_name.len + 1); 2489 BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
2490 dentry->d_name.len = target->d_name.len; 2490 for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
2491 return; 2491 swap(((long *) &dentry->d_iname)[i],
2492 ((long *) &target->d_iname)[i]);
2493 }
2492 } 2494 }
2493 } 2495 }
2494 swap(dentry->d_name.len, target->d_name.len); 2496 swap(dentry->d_name.len, target->d_name.len);
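
The switch_names() hunk above fixes a real problem for the exchange case: the old memcpy() copied target's inline name over dentry's and threw one name away, while the new code swaps the two fixed-size inline buffers a long at a time (the BUILD_BUG_ON guarantees DNAME_INLINE_LEN is a whole number of longs). The same swap in standalone form, with an illustrative buffer size:

        #include <stddef.h>
        #include <stdio.h>

        #define INLINE_LEN 32   /* stand-in for DNAME_INLINE_LEN */

        /* union gives the buffer long alignment, like d_iname in practice */
        union name_buf {
                char s[INLINE_LEN];
                long words[INLINE_LEN / sizeof(long)];
        };

        /* mirrors the BUILD_BUG_ON: size must be a whole number of longs */
        _Static_assert(INLINE_LEN % sizeof(long) == 0, "whole longs only");

        static void swap_inline_names(union name_buf *a, union name_buf *b)
        {
                for (size_t i = 0; i < INLINE_LEN / sizeof(long); i++) {
                        long tmp = a->words[i];

                        a->words[i] = b->words[i];
                        b->words[i] = tmp;
                }
        }

        int main(void)
        {
                union name_buf one = { .s = "alpha" }, two = { .s = "beta" };

                swap_inline_names(&one, &two);
                printf("%s %s\n", one.s, two.s);        /* beta alpha */
                return 0;
        }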
@@ -2545,13 +2547,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry,
2545 * __d_move - move a dentry 2547 * __d_move - move a dentry
2546 * @dentry: entry to move 2548 * @dentry: entry to move
2547 * @target: new dentry 2549 * @target: new dentry
2550 * @exchange: exchange the two dentries
2548 * 2551 *
2549 * Update the dcache to reflect the move of a file name. Negative 2552 * Update the dcache to reflect the move of a file name. Negative
2550 * dcache entries should not be moved in this way. Caller must hold 2553 * dcache entries should not be moved in this way. Caller must hold
2551 * rename_lock, the i_mutex of the source and target directories, 2554 * rename_lock, the i_mutex of the source and target directories,
2552 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). 2555 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
2553 */ 2556 */
2554static void __d_move(struct dentry * dentry, struct dentry * target) 2557static void __d_move(struct dentry *dentry, struct dentry *target,
2558 bool exchange)
2555{ 2559{
2556 if (!dentry->d_inode) 2560 if (!dentry->d_inode)
2557 printk(KERN_WARNING "VFS: moving negative dcache entry\n"); 2561 printk(KERN_WARNING "VFS: moving negative dcache entry\n");
@@ -2573,8 +2577,15 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2573 __d_drop(dentry); 2577 __d_drop(dentry);
2574 __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); 2578 __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
2575 2579
2576 /* Unhash the target: dput() will then get rid of it */ 2580 /*
2581 * Unhash the target (d_delete() is not usable here). If exchanging
2582 * the two dentries, then rehash onto the other's hash queue.
2583 */
2577 __d_drop(target); 2584 __d_drop(target);
2585 if (exchange) {
2586 __d_rehash(target,
2587 d_hash(dentry->d_parent, dentry->d_name.hash));
2588 }
2578 2589
2579 list_del(&dentry->d_u.d_child); 2590 list_del(&dentry->d_u.d_child);
2580 list_del(&target->d_u.d_child); 2591 list_del(&target->d_u.d_child);
@@ -2601,6 +2612,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2601 write_seqcount_end(&dentry->d_seq); 2612 write_seqcount_end(&dentry->d_seq);
2602 2613
2603 dentry_unlock_parents_for_move(dentry, target); 2614 dentry_unlock_parents_for_move(dentry, target);
2615 if (exchange)
2616 fsnotify_d_move(target);
2604 spin_unlock(&target->d_lock); 2617 spin_unlock(&target->d_lock);
2605 fsnotify_d_move(dentry); 2618 fsnotify_d_move(dentry);
2606 spin_unlock(&dentry->d_lock); 2619 spin_unlock(&dentry->d_lock);
@@ -2618,11 +2631,30 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2618void d_move(struct dentry *dentry, struct dentry *target) 2631void d_move(struct dentry *dentry, struct dentry *target)
2619{ 2632{
2620 write_seqlock(&rename_lock); 2633 write_seqlock(&rename_lock);
2621 __d_move(dentry, target); 2634 __d_move(dentry, target, false);
2622 write_sequnlock(&rename_lock); 2635 write_sequnlock(&rename_lock);
2623} 2636}
2624EXPORT_SYMBOL(d_move); 2637EXPORT_SYMBOL(d_move);
2625 2638
2639/*
2640 * d_exchange - exchange two dentries
2641 * @dentry1: first dentry
2642 * @dentry2: second dentry
2643 */
2644void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
2645{
2646 write_seqlock(&rename_lock);
2647
2648 WARN_ON(!dentry1->d_inode);
2649 WARN_ON(!dentry2->d_inode);
2650 WARN_ON(IS_ROOT(dentry1));
2651 WARN_ON(IS_ROOT(dentry2));
2652
2653 __d_move(dentry1, dentry2, true);
2654
2655 write_sequnlock(&rename_lock);
2656}
2657
2626/** 2658/**
2627 * d_ancestor - search for an ancestor 2659 * d_ancestor - search for an ancestor
2628 * @p1: ancestor dentry 2660 * @p1: ancestor dentry
@@ -2670,7 +2702,7 @@ static struct dentry *__d_unalias(struct inode *inode,
2670 m2 = &alias->d_parent->d_inode->i_mutex; 2702 m2 = &alias->d_parent->d_inode->i_mutex;
2671out_unalias: 2703out_unalias:
2672 if (likely(!d_mountpoint(alias))) { 2704 if (likely(!d_mountpoint(alias))) {
2673 __d_move(alias, dentry); 2705 __d_move(alias, dentry, false);
2674 ret = alias; 2706 ret = alias;
2675 } 2707 }
2676out_err: 2708out_err:
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 9c0444cccbe1..8c41b52da358 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data)
218 int err; 218 int err;
219 struct debugfs_fs_info *fsi = sb->s_fs_info; 219 struct debugfs_fs_info *fsi = sb->s_fs_info;
220 220
221 sync_filesystem(sb);
221 err = debugfs_parse_options(data, &fsi->mount_opts); 222 err = debugfs_parse_options(data, &fsi->mount_opts);
222 if (err) 223 if (err)
223 goto fail; 224 goto fail;
@@ -358,7 +359,7 @@ exit:
358 * @name: a pointer to a string containing the name of the file to create. 359 * @name: a pointer to a string containing the name of the file to create.
359 * @mode: the permission that the file should have. 360 * @mode: the permission that the file should have.
360 * @parent: a pointer to the parent dentry for this file. This should be a 361 * @parent: a pointer to the parent dentry for this file. This should be a
361 * directory dentry if set. If this paramater is NULL, then the 362 * directory dentry if set. If this parameter is NULL, then the
362 * file will be created in the root of the debugfs filesystem. 363 * file will be created in the root of the debugfs filesystem.
363 * @data: a pointer to something that the caller will want to get to later 364 * @data: a pointer to something that the caller will want to get to later
364 * on. The inode.i_private pointer will point to this value on 365 * on. The inode.i_private pointer will point to this value on
@@ -400,7 +401,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
400 * @name: a pointer to a string containing the name of the directory to 401 * @name: a pointer to a string containing the name of the directory to
401 * create. 402 * create.
402 * @parent: a pointer to the parent dentry for this file. This should be a 403 * @parent: a pointer to the parent dentry for this file. This should be a
403 * directory dentry if set. If this paramater is NULL, then the 404 * directory dentry if set. If this parameter is NULL, then the
404 * directory will be created in the root of the debugfs filesystem. 405 * directory will be created in the root of the debugfs filesystem.
405 * 406 *
406 * This function creates a directory in debugfs with the given name. 407 * This function creates a directory in debugfs with the given name.
@@ -425,7 +426,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
425 * @name: a pointer to a string containing the name of the symbolic link to 426 * @name: a pointer to a string containing the name of the symbolic link to
426 * create. 427 * create.
427 * @parent: a pointer to the parent dentry for this symbolic link. This 428 * @parent: a pointer to the parent dentry for this symbolic link. This
428 * should be a directory dentry if set. If this paramater is NULL, 429 * should be a directory dentry if set. If this parameter is NULL,
429 * then the symbolic link will be created in the root of the debugfs 430 * then the symbolic link will be created in the root of the debugfs
430 * filesystem. 431 * filesystem.
431 * @target: a pointer to a string containing the path to the target of the 432 * @target: a pointer to a string containing the path to the target of the
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index a726b9f29cb7..c71038079b47 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -313,6 +313,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
313 struct pts_fs_info *fsi = DEVPTS_SB(sb); 313 struct pts_fs_info *fsi = DEVPTS_SB(sb);
314 struct pts_mount_opts *opts = &fsi->mount_opts; 314 struct pts_mount_opts *opts = &fsi->mount_opts;
315 315
316 sync_filesystem(sb);
316 err = parse_mount_options(data, PARSE_REMOUNT, opts); 317 err = parse_mount_options(data, PARSE_REMOUNT, opts);
317 318
318 /* 319 /*
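
[Annotation] Both remount hunks above gain the same leading sync_filesystem(sb) call; this series moves the pre-remount flush out of the VFS core and into each filesystem's remount handler. The resulting pattern, with examplefs_parse_options standing in for whatever option parsing the filesystem does:

	static int examplefs_remount(struct super_block *sb, int *flags, char *data)
	{
		/* Flush dirty data before any option takes effect,
		 * e.g. a transition to read-only. */
		sync_filesystem(sb);
		return examplefs_parse_options(sb, data);	/* hypothetical */
	}

The same one-line addition recurs in the efs, ext2 and ext3 hunks further down.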
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 160a5489a939..31ba0935e32e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -664,7 +664,6 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
664 goto out; 664 goto out;
665 sector = start_sector << (sdio->blkbits - 9); 665 sector = start_sector << (sdio->blkbits - 9);
666 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); 666 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
667 nr_pages = min(nr_pages, BIO_MAX_PAGES);
668 BUG_ON(nr_pages <= 0); 667 BUG_ON(nr_pages <= 0);
669 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); 668 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
670 sdio->boundary = 0; 669 sdio->boundary = 0;
@@ -1194,13 +1193,19 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1194 } 1193 }
1195 1194
1196 /* 1195 /*
1197 * For file extending writes updating i_size before data 1196 * For file extending writes updating i_size before data writeouts
1198 * writeouts complete can expose uninitialized blocks. So 1197 * complete can expose uninitialized blocks in dumb filesystems.
1199 * even for AIO, we need to wait for i/o to complete before 1198 * In that case we need to wait for I/O completion even if asked
1200 * returning in this case. 1199 * for an asynchronous write.
1201 */ 1200 */
1202 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && 1201 if (is_sync_kiocb(iocb))
1203 (end > i_size_read(inode))); 1202 dio->is_async = false;
1203 else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
1204 (rw & WRITE) && end > i_size_read(inode))
1205 dio->is_async = false;
1206 else
1207 dio->is_async = true;
1208
1204 dio->inode = inode; 1209 dio->inode = inode;
1205 dio->rw = rw; 1210 dio->rw = rw;
1206 1211
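
[Annotation] The replaced one-liner packed three cases into a single boolean; the new form also honors a DIO_ASYNC_EXTEND flag for filesystems that can safely complete size-extending AIO writes asynchronously. The decision, restated as a standalone predicate (a sketch of the logic only, not a kernel function):

	static bool dio_pick_async(bool sync_kiocb, bool is_write,
				   bool extends_isize, bool async_extend_ok)
	{
		if (sync_kiocb)
			return false;	/* synchronous kiocb: always wait */
		if (is_write && extends_isize && !async_extend_ok)
			return false;	/* i_size grows: wait so uninitialized
					 * blocks are never exposed */
		return true;
	}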
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 0e90f0c91b93..dcea1e37a1b7 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -14,6 +14,7 @@
14#include "dlm_internal.h" 14#include "dlm_internal.h"
15#include "lock.h" 15#include "lock.h"
16#include "user.h" 16#include "user.h"
17#include "ast.h"
17 18
18static uint64_t dlm_cb_seq; 19static uint64_t dlm_cb_seq;
19static DEFINE_SPINLOCK(dlm_cb_seq_spin); 20static DEFINE_SPINLOCK(dlm_cb_seq_spin);
@@ -308,6 +309,6 @@ void dlm_callback_resume(struct dlm_ls *ls)
308 mutex_unlock(&ls->ls_cb_mutex); 309 mutex_unlock(&ls->ls_cb_mutex);
309 310
310 if (count) 311 if (count)
311 log_debug(ls, "dlm_callback_resume %d", count); 312 log_rinfo(ls, "dlm_callback_resume %d", count);
312} 313}
313 314
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 278a75cda446..d975851a7e1e 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -68,7 +68,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
68 uint16_t namelen; 68 uint16_t namelen;
69 unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0; 69 unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0;
70 70
71 log_debug(ls, "dlm_recover_directory"); 71 log_rinfo(ls, "dlm_recover_directory");
72 72
73 if (dlm_no_directory(ls)) 73 if (dlm_no_directory(ls))
74 goto out_status; 74 goto out_status;
@@ -189,7 +189,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
189 error = 0; 189 error = 0;
190 dlm_set_recover_status(ls, DLM_RS_DIR); 190 dlm_set_recover_status(ls, DLM_RS_DIR);
191 191
192 log_debug(ls, "dlm_recover_directory %u in %u new", 192 log_rinfo(ls, "dlm_recover_directory %u in %u new",
193 count, count_add); 193 count, count_add);
194 out_free: 194 out_free:
195 kfree(last_name); 195 kfree(last_name);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index e7665c31f7b1..5eff6ea3e27f 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -65,6 +65,8 @@ struct dlm_mhandle;
65 printk(KERN_ERR "dlm: "fmt"\n" , ##args) 65 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
66#define log_error(ls, fmt, args...) \ 66#define log_error(ls, fmt, args...) \
67 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) 67 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
68#define log_rinfo(ls, fmt, args...) \
69 printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args);
68 70
69#define log_debug(ls, fmt, args...) \ 71#define log_debug(ls, fmt, args...) \
70do { \ 72do { \
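
[Annotation] Note that the log_rinfo() macro body added above ends in a semicolon, so each call expands to a statement plus an empty statement. The converted call sites below are all plain statements, where this is harmless, but an unbraced if/else around log_rinfo() would fail to compile. The conventional defensive shape, shown as a sketch rather than what the hunk adds:

	#define log_rinfo_safe(ls, fmt, args...) \
	do { \
		printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
	} while (0)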
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e223a911a834..83f3d5520307 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -687,6 +687,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
687 log_error(ls, "find_rsb new from_other %d dir %d our %d %s", 687 log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
688 from_nodeid, dir_nodeid, our_nodeid, r->res_name); 688 from_nodeid, dir_nodeid, our_nodeid, r->res_name);
689 dlm_free_rsb(r); 689 dlm_free_rsb(r);
690 r = NULL;
690 error = -ENOTBLK; 691 error = -ENOTBLK;
691 goto out_unlock; 692 goto out_unlock;
692 } 693 }
@@ -5462,7 +5463,7 @@ void dlm_recover_purge(struct dlm_ls *ls)
5462 up_write(&ls->ls_root_sem); 5463 up_write(&ls->ls_root_sem);
5463 5464
5464 if (lkb_count) 5465 if (lkb_count)
5465 log_debug(ls, "dlm_recover_purge %u locks for %u nodes", 5466 log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes",
5466 lkb_count, nodes_count); 5467 lkb_count, nodes_count);
5467} 5468}
5468 5469
@@ -5536,7 +5537,7 @@ void dlm_recover_grant(struct dlm_ls *ls)
5536 } 5537 }
5537 5538
5538 if (lkb_count) 5539 if (lkb_count)
5539 log_debug(ls, "dlm_recover_grant %u locks on %u resources", 5540 log_rinfo(ls, "dlm_recover_grant %u locks on %u resources",
5540 lkb_count, rsb_count); 5541 lkb_count, rsb_count);
5541} 5542}
5542 5543
@@ -5695,7 +5696,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5695 put_rsb(r); 5696 put_rsb(r);
5696 out: 5697 out:
5697 if (error && error != -EEXIST) 5698 if (error && error != -EEXIST)
5698 log_debug(ls, "dlm_recover_master_copy remote %d %x error %d", 5699 log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d",
5699 from_nodeid, remid, error); 5700 from_nodeid, remid, error);
5700 rl->rl_result = cpu_to_le32(error); 5701 rl->rl_result = cpu_to_le32(error);
5701 return error; 5702 return error;
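
[Annotation] The one-line `r = NULL;` in the first hunk matters because the error path jumps to out_unlock after dlm_free_rsb(), and the exit code presumably publishes r to the caller; without the clear, a freed rsb could be handed back. The idiom, restated with the rationale inline:

	dlm_free_rsb(r);
	r = NULL;		/* exit path now sees NULL, never freed memory */
	error = -ENOTBLK;
	goto out_unlock;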
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d5abafd56a6d..04d6398c1f1c 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -190,7 +190,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
190 else 190 else
191 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); 191 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
192 192
193 log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving"); 193 log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving");
194 194
195 /* dlm_controld will see the uevent, do the necessary group management 195 /* dlm_controld will see the uevent, do the necessary group management
196 and then write to sysfs to wake us */ 196 and then write to sysfs to wake us */
@@ -198,7 +198,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
198 error = wait_event_interruptible(ls->ls_uevent_wait, 198 error = wait_event_interruptible(ls->ls_uevent_wait,
199 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); 199 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
200 200
201 log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result); 201 log_rinfo(ls, "group event done %d %d", error, ls->ls_uevent_result);
202 202
203 if (error) 203 if (error)
204 goto out; 204 goto out;
@@ -640,7 +640,7 @@ static int new_lockspace(const char *name, const char *cluster,
640 640
641 dlm_create_debug_file(ls); 641 dlm_create_debug_file(ls);
642 642
643 log_debug(ls, "join complete"); 643 log_rinfo(ls, "join complete");
644 *lockspace = ls; 644 *lockspace = ls;
645 return 0; 645 return 0;
646 646
@@ -835,7 +835,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
835 dlm_clear_members(ls); 835 dlm_clear_members(ls);
836 dlm_clear_members_gone(ls); 836 dlm_clear_members_gone(ls);
837 kfree(ls->ls_node_array); 837 kfree(ls->ls_node_array);
838 log_debug(ls, "release_lockspace final free"); 838 log_rinfo(ls, "release_lockspace final free");
839 kobject_put(&ls->ls_kobj); 839 kobject_put(&ls->ls_kobj);
840 /* The ls structure will be freed when the kobject is done with */ 840 /* The ls structure will be freed when the kobject is done with */
841 841
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 476557b54921..9c47f1c14a8b 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -60,18 +60,15 @@ void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
60 60
61#define SLOT_DEBUG_LINE 128 61#define SLOT_DEBUG_LINE 128
62 62
63static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, 63static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
64 struct rcom_slot *ro0, struct dlm_slot *array, 64 struct rcom_slot *ro0, struct dlm_slot *array,
65 int array_size) 65 int array_size)
66{ 66{
67 char line[SLOT_DEBUG_LINE]; 67 char line[SLOT_DEBUG_LINE];
68 int len = SLOT_DEBUG_LINE - 1; 68 int len = SLOT_DEBUG_LINE - 1;
69 int pos = 0; 69 int pos = 0;
70 int ret, i; 70 int ret, i;
71 71
72 if (!dlm_config.ci_log_debug)
73 return;
74
75 memset(line, 0, sizeof(line)); 72 memset(line, 0, sizeof(line));
76 73
77 if (array) { 74 if (array) {
@@ -95,7 +92,7 @@ static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
95 } 92 }
96 } 93 }
97 94
98 log_debug(ls, "generation %u slots %d%s", gen, num_slots, line); 95 log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line);
99} 96}
100 97
101int dlm_slots_copy_in(struct dlm_ls *ls) 98int dlm_slots_copy_in(struct dlm_ls *ls)
@@ -129,7 +126,7 @@ int dlm_slots_copy_in(struct dlm_ls *ls)
129 ro->ro_slot = le16_to_cpu(ro->ro_slot); 126 ro->ro_slot = le16_to_cpu(ro->ro_slot);
130 } 127 }
131 128
132 log_debug_slots(ls, gen, num_slots, ro0, NULL, 0); 129 log_slots(ls, gen, num_slots, ro0, NULL, 0);
133 130
134 list_for_each_entry(memb, &ls->ls_nodes, list) { 131 list_for_each_entry(memb, &ls->ls_nodes, list) {
135 for (i = 0, ro = ro0; i < num_slots; i++, ro++) { 132 for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
@@ -274,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
274 271
275 gen++; 272 gen++;
276 273
277 log_debug_slots(ls, gen, num, NULL, array, array_size); 274 log_slots(ls, gen, num, NULL, array, array_size);
278 275
279 max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) - 276 max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
280 sizeof(struct rcom_config)) / sizeof(struct rcom_slot); 277 sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
@@ -447,7 +444,7 @@ static int ping_members(struct dlm_ls *ls)
447 break; 444 break;
448 } 445 }
449 if (error) 446 if (error)
450 log_debug(ls, "ping_members aborted %d last nodeid %d", 447 log_rinfo(ls, "ping_members aborted %d last nodeid %d",
451 error, ls->ls_recover_nodeid); 448 error, ls->ls_recover_nodeid);
452 return error; 449 return error;
453} 450}
@@ -539,7 +536,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
539 count as a negative change so the "neg" recovery steps will happen */ 536 count as a negative change so the "neg" recovery steps will happen */
540 537
541 list_for_each_entry(memb, &ls->ls_nodes_gone, list) { 538 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
542 log_debug(ls, "prev removed member %d", memb->nodeid); 539 log_rinfo(ls, "prev removed member %d", memb->nodeid);
543 neg++; 540 neg++;
544 } 541 }
545 542
@@ -551,10 +548,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
551 continue; 548 continue;
552 549
553 if (!node) { 550 if (!node) {
554 log_debug(ls, "remove member %d", memb->nodeid); 551 log_rinfo(ls, "remove member %d", memb->nodeid);
555 } else { 552 } else {
556 /* removed and re-added */ 553 /* removed and re-added */
557 log_debug(ls, "remove member %d comm_seq %u %u", 554 log_rinfo(ls, "remove member %d comm_seq %u %u",
558 memb->nodeid, memb->comm_seq, node->comm_seq); 555 memb->nodeid, memb->comm_seq, node->comm_seq);
559 } 556 }
560 557
@@ -571,7 +568,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
571 if (dlm_is_member(ls, node->nodeid)) 568 if (dlm_is_member(ls, node->nodeid))
572 continue; 569 continue;
573 dlm_add_member(ls, node); 570 dlm_add_member(ls, node);
574 log_debug(ls, "add member %d", node->nodeid); 571 log_rinfo(ls, "add member %d", node->nodeid);
575 } 572 }
576 573
577 list_for_each_entry(memb, &ls->ls_nodes, list) { 574 list_for_each_entry(memb, &ls->ls_nodes, list) {
@@ -591,7 +588,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
591 complete(&ls->ls_members_done); 588 complete(&ls->ls_members_done);
592 } 589 }
593 590
594 log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); 591 log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
595 return error; 592 return error;
596} 593}
597 594
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index a6bc63f6e31b..eaea789bf97d 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -526,7 +526,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
526 int nodir = dlm_no_directory(ls); 526 int nodir = dlm_no_directory(ls);
527 int error; 527 int error;
528 528
529 log_debug(ls, "dlm_recover_masters"); 529 log_rinfo(ls, "dlm_recover_masters");
530 530
531 down_read(&ls->ls_root_sem); 531 down_read(&ls->ls_root_sem);
532 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 532 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
@@ -552,7 +552,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
552 } 552 }
553 up_read(&ls->ls_root_sem); 553 up_read(&ls->ls_root_sem);
554 554
555 log_debug(ls, "dlm_recover_masters %u of %u", count, total); 555 log_rinfo(ls, "dlm_recover_masters %u of %u", count, total);
556 556
557 error = dlm_wait_function(ls, &recover_idr_empty); 557 error = dlm_wait_function(ls, &recover_idr_empty);
558 out: 558 out:
@@ -685,7 +685,7 @@ int dlm_recover_locks(struct dlm_ls *ls)
685 } 685 }
686 up_read(&ls->ls_root_sem); 686 up_read(&ls->ls_root_sem);
687 687
688 log_debug(ls, "dlm_recover_locks %d out", count); 688 log_rinfo(ls, "dlm_recover_locks %d out", count);
689 689
690 error = dlm_wait_function(ls, &recover_list_empty); 690 error = dlm_wait_function(ls, &recover_list_empty);
691 out: 691 out:
@@ -883,7 +883,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
883 up_read(&ls->ls_root_sem); 883 up_read(&ls->ls_root_sem);
884 884
885 if (count) 885 if (count)
886 log_debug(ls, "dlm_recover_rsbs %d done", count); 886 log_rinfo(ls, "dlm_recover_rsbs %d done", count);
887} 887}
888 888
889/* Create a single list of all root rsb's to be used during recovery */ 889/* Create a single list of all root rsb's to be used during recovery */
@@ -950,6 +950,6 @@ void dlm_clear_toss(struct dlm_ls *ls)
950 } 950 }
951 951
952 if (count) 952 if (count)
953 log_debug(ls, "dlm_clear_toss %u done", count); 953 log_rinfo(ls, "dlm_clear_toss %u done", count);
954} 954}
955 955
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 32f9f8926ec3..6859b4bf971e 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -55,7 +55,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
55 unsigned long start; 55 unsigned long start;
56 int error, neg = 0; 56 int error, neg = 0;
57 57
58 log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq); 58 log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
59 59
60 mutex_lock(&ls->ls_recoverd_active); 60 mutex_lock(&ls->ls_recoverd_active);
61 61
@@ -76,7 +76,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
76 76
77 error = dlm_recover_members(ls, rv, &neg); 77 error = dlm_recover_members(ls, rv, &neg);
78 if (error) { 78 if (error) {
79 log_debug(ls, "dlm_recover_members error %d", error); 79 log_rinfo(ls, "dlm_recover_members error %d", error);
80 goto fail; 80 goto fail;
81 } 81 }
82 82
@@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
90 90
91 error = dlm_recover_members_wait(ls); 91 error = dlm_recover_members_wait(ls);
92 if (error) { 92 if (error) {
93 log_debug(ls, "dlm_recover_members_wait error %d", error); 93 log_rinfo(ls, "dlm_recover_members_wait error %d", error);
94 goto fail; 94 goto fail;
95 } 95 }
96 96
@@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
103 103
104 error = dlm_recover_directory(ls); 104 error = dlm_recover_directory(ls);
105 if (error) { 105 if (error) {
106 log_debug(ls, "dlm_recover_directory error %d", error); 106 log_rinfo(ls, "dlm_recover_directory error %d", error);
107 goto fail; 107 goto fail;
108 } 108 }
109 109
@@ -111,11 +111,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
111 111
112 error = dlm_recover_directory_wait(ls); 112 error = dlm_recover_directory_wait(ls);
113 if (error) { 113 if (error) {
114 log_debug(ls, "dlm_recover_directory_wait error %d", error); 114 log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
115 goto fail; 115 goto fail;
116 } 116 }
117 117
118 log_debug(ls, "dlm_recover_directory %u out %u messages", 118 log_rinfo(ls, "dlm_recover_directory %u out %u messages",
119 ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg); 119 ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);
120 120
121 /* 121 /*
@@ -144,7 +144,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
144 144
145 error = dlm_recover_masters(ls); 145 error = dlm_recover_masters(ls);
146 if (error) { 146 if (error) {
147 log_debug(ls, "dlm_recover_masters error %d", error); 147 log_rinfo(ls, "dlm_recover_masters error %d", error);
148 goto fail; 148 goto fail;
149 } 149 }
150 150
@@ -154,7 +154,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
154 154
155 error = dlm_recover_locks(ls); 155 error = dlm_recover_locks(ls);
156 if (error) { 156 if (error) {
157 log_debug(ls, "dlm_recover_locks error %d", error); 157 log_rinfo(ls, "dlm_recover_locks error %d", error);
158 goto fail; 158 goto fail;
159 } 159 }
160 160
@@ -162,11 +162,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
162 162
163 error = dlm_recover_locks_wait(ls); 163 error = dlm_recover_locks_wait(ls);
164 if (error) { 164 if (error) {
165 log_debug(ls, "dlm_recover_locks_wait error %d", error); 165 log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
166 goto fail; 166 goto fail;
167 } 167 }
168 168
169 log_debug(ls, "dlm_recover_locks %u in", 169 log_rinfo(ls, "dlm_recover_locks %u in",
170 ls->ls_recover_locks_in); 170 ls->ls_recover_locks_in);
171 171
172 /* 172 /*
@@ -186,7 +186,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
186 186
187 error = dlm_recover_locks_wait(ls); 187 error = dlm_recover_locks_wait(ls);
188 if (error) { 188 if (error) {
189 log_debug(ls, "dlm_recover_locks_wait error %d", error); 189 log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
190 goto fail; 190 goto fail;
191 } 191 }
192 } 192 }
@@ -205,7 +205,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
205 205
206 error = dlm_recover_done_wait(ls); 206 error = dlm_recover_done_wait(ls);
207 if (error) { 207 if (error) {
208 log_debug(ls, "dlm_recover_done_wait error %d", error); 208 log_rinfo(ls, "dlm_recover_done_wait error %d", error);
209 goto fail; 209 goto fail;
210 } 210 }
211 211
@@ -217,25 +217,25 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
217 217
218 error = enable_locking(ls, rv->seq); 218 error = enable_locking(ls, rv->seq);
219 if (error) { 219 if (error) {
220 log_debug(ls, "enable_locking error %d", error); 220 log_rinfo(ls, "enable_locking error %d", error);
221 goto fail; 221 goto fail;
222 } 222 }
223 223
224 error = dlm_process_requestqueue(ls); 224 error = dlm_process_requestqueue(ls);
225 if (error) { 225 if (error) {
226 log_debug(ls, "dlm_process_requestqueue error %d", error); 226 log_rinfo(ls, "dlm_process_requestqueue error %d", error);
227 goto fail; 227 goto fail;
228 } 228 }
229 229
230 error = dlm_recover_waiters_post(ls); 230 error = dlm_recover_waiters_post(ls);
231 if (error) { 231 if (error) {
232 log_debug(ls, "dlm_recover_waiters_post error %d", error); 232 log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
233 goto fail; 233 goto fail;
234 } 234 }
235 235
236 dlm_recover_grant(ls); 236 dlm_recover_grant(ls);
237 237
238 log_debug(ls, "dlm_recover %llu generation %u done: %u ms", 238 log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
239 (unsigned long long)rv->seq, ls->ls_generation, 239 (unsigned long long)rv->seq, ls->ls_generation,
240 jiffies_to_msecs(jiffies - start)); 240 jiffies_to_msecs(jiffies - start));
241 mutex_unlock(&ls->ls_recoverd_active); 241 mutex_unlock(&ls->ls_recoverd_active);
@@ -245,7 +245,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
245 245
246 fail: 246 fail:
247 dlm_release_root_list(ls); 247 dlm_release_root_list(ls);
248 log_debug(ls, "dlm_recover %llu error %d", 248 log_rinfo(ls, "dlm_recover %llu error %d",
249 (unsigned long long)rv->seq, error); 249 (unsigned long long)rv->seq, error);
250 mutex_unlock(&ls->ls_recoverd_active); 250 mutex_unlock(&ls->ls_recoverd_active);
251 return error; 251 return error;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 9fd702f5bfb2..9280202e488c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -59,10 +59,22 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
59 if (ret) 59 if (ret)
60 return ret; 60 return ret;
61 if (write) { 61 if (write) {
62 if (sysctl_drop_caches & 1) 62 static int stfu;
63
64 if (sysctl_drop_caches & 1) {
63 iterate_supers(drop_pagecache_sb, NULL); 65 iterate_supers(drop_pagecache_sb, NULL);
64 if (sysctl_drop_caches & 2) 66 count_vm_event(DROP_PAGECACHE);
67 }
68 if (sysctl_drop_caches & 2) {
65 drop_slab(); 69 drop_slab();
70 count_vm_event(DROP_SLAB);
71 }
72 if (!stfu) {
73 pr_info("%s (%d): drop_caches: %d\n",
74 current->comm, task_pid_nr(current),
75 sysctl_drop_caches);
76 }
77 stfu |= sysctl_drop_caches & 4;
66 } 78 }
67 return 0; 79 return 0;
68} 80}
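
[Annotation] The rewritten handler adds vm-event accounting and a log line for every write to /proc/sys/vm/drop_caches, with bit 2 (value 4) acting as a sticky silencer: once a writer ORs in 4, the `stfu` latch suppresses the message for the lifetime of the kernel. The semantics, restated as a small sketch in which drop_pagecache and drop_slab_caches are hypothetical stand-ins for the kernel helpers:

	static int stfu;			/* sticky across writes */

	static void handle_drop_caches(int val)
	{
		if (val & 1)
			drop_pagecache();	/* + count_vm_event(DROP_PAGECACHE) */
		if (val & 2)
			drop_slab_caches();	/* + count_vm_event(DROP_SLAB) */
		if (!stfu)
			pr_info("drop_caches: %d\n", val);
		stfu |= val & 4;		/* writing 4 disables future logging */
	}

So `echo 3` drops both caches and logs the event, while `echo 4` alone drops nothing but silences all subsequent messages.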
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index b167ca48b8ee..d4a9431ec73c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -641,7 +641,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
641 } 641 }
642 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, 642 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
643 lower_new_dir_dentry->d_inode, lower_new_dentry, 643 lower_new_dir_dentry->d_inode, lower_new_dentry,
644 NULL); 644 NULL, 0);
645 if (rc) 645 if (rc)
646 goto out_lock; 646 goto out_lock;
647 if (target_inode) 647 if (target_inode)
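
[Annotation] The extra 0 is the flags argument that vfs_rename() grows elsewhere in this series to carry renameat2() semantics (RENAME_NOREPLACE, RENAME_EXCHANGE); a stacked filesystem like eCryptfs performs a plain rename on the lower dentries, hence zero. The call, annotated (same code as the hunk, comments added for clarity):

	rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
			lower_new_dir_dentry->d_inode, lower_new_dentry,
			NULL,	/* delegated_inode: no NFS delegation handling */
			0);	/* flags: plain rename, no exchange/noreplace */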
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index e879cf8ff0b1..afa1b81c3418 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -132,7 +132,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
132 */ 132 */
133static void ecryptfs_evict_inode(struct inode *inode) 133static void ecryptfs_evict_inode(struct inode *inode)
134{ 134{
135 truncate_inode_pages(&inode->i_data, 0); 135 truncate_inode_pages_final(&inode->i_data);
136 clear_inode(inode); 136 clear_inode(inode);
137 iput(ecryptfs_inode_to_lower(inode)); 137 iput(ecryptfs_inode_to_lower(inode));
138} 138}
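
[Annotation] truncate_inode_pages_final() is the eviction-only counterpart of truncate_inode_pages(mapping, 0): it may only be called once the inode is being evicted, which lets the core VM make end-of-life assumptions about the mapping. The conversion pattern, repeated verbatim in the exofs, ext2 and ext3 hunks below (examplefs is a placeholder name):

	static void examplefs_evict_inode(struct inode *inode)
	{
		truncate_inode_pages_final(&inode->i_data);	/* was: ...(&inode->i_data, 0) */
		clear_inode(inode);
		/* filesystem-specific teardown follows */
	}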
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 8dd524f32284..cdb2971192a5 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -21,7 +21,7 @@ static ssize_t efivarfs_file_write(struct file *file,
21 u32 attributes; 21 u32 attributes;
22 struct inode *inode = file->f_mapping->host; 22 struct inode *inode = file->f_mapping->host;
23 unsigned long datasize = count - sizeof(attributes); 23 unsigned long datasize = count - sizeof(attributes);
24 ssize_t bytes = 0; 24 ssize_t bytes;
25 bool set = false; 25 bool set = false;
26 26
27 if (count < sizeof(attributes)) 27 if (count < sizeof(attributes))
@@ -33,14 +33,9 @@ static ssize_t efivarfs_file_write(struct file *file,
33 if (attributes & ~(EFI_VARIABLE_MASK)) 33 if (attributes & ~(EFI_VARIABLE_MASK))
34 return -EINVAL; 34 return -EINVAL;
35 35
36 data = kmalloc(datasize, GFP_KERNEL); 36 data = memdup_user(userbuf + sizeof(attributes), datasize);
37 if (!data) 37 if (IS_ERR(data))
38 return -ENOMEM; 38 return PTR_ERR(data);
39
40 if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) {
41 bytes = -EFAULT;
42 goto out;
43 }
44 39
45 bytes = efivar_entry_set_get_size(var, attributes, &datasize, 40 bytes = efivar_entry_set_get_size(var, attributes, &datasize,
46 data, &set); 41 data, &set);
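
[Annotation] memdup_user() folds the kmalloc()/copy_from_user()/unwind triple into one call that returns either the populated buffer or an ERR_PTR, which also removes the need to pre-initialize `bytes`. The idiom in isolation (ubuf and size are illustrative):

	void *buf = memdup_user(ubuf, size);
	if (IS_ERR(buf))
		return PTR_ERR(buf);	/* -ENOMEM or -EFAULT */
	/* ... use buf ... */
	kfree(buf);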
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 50215bbd6463..3befcc9f5d63 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -91,7 +91,7 @@ static void init_once(void *foo)
91 inode_init_once(&ei->vfs_inode); 91 inode_init_once(&ei->vfs_inode);
92} 92}
93 93
94static int init_inodecache(void) 94static int __init init_inodecache(void)
95{ 95{
96 efs_inode_cachep = kmem_cache_create("efs_inode_cache", 96 efs_inode_cachep = kmem_cache_create("efs_inode_cache",
97 sizeof(struct efs_inode_info), 97 sizeof(struct efs_inode_info),
@@ -114,6 +114,7 @@ static void destroy_inodecache(void)
114 114
115static int efs_remount(struct super_block *sb, int *flags, char *data) 115static int efs_remount(struct super_block *sb, int *flags, char *data)
116{ 116{
117 sync_filesystem(sb);
117 *flags |= MS_RDONLY; 118 *flags |= MS_RDONLY;
118 return 0; 119 return 0;
119} 120}
diff --git a/fs/exec.c b/fs/exec.c
index 3d78fccdd723..25dfeba6d55f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -97,6 +97,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
97 module_put(fmt->module); 97 module_put(fmt->module);
98} 98}
99 99
100#ifdef CONFIG_USELIB
100/* 101/*
101 * Note that a shared library must be both readable and executable due to 102 * Note that a shared library must be both readable and executable due to
102 * security reasons. 103 * security reasons.
@@ -156,6 +157,7 @@ exit:
156out: 157out:
157 return error; 158 return error;
158} 159}
160#endif /* #ifdef CONFIG_USELIB */
159 161
160#ifdef CONFIG_MMU 162#ifdef CONFIG_MMU
161/* 163/*
@@ -1619,9 +1621,9 @@ SYSCALL_DEFINE3(execve,
1619 return do_execve(getname(filename), argv, envp); 1621 return do_execve(getname(filename), argv, envp);
1620} 1622}
1621#ifdef CONFIG_COMPAT 1623#ifdef CONFIG_COMPAT
1622asmlinkage long compat_sys_execve(const char __user * filename, 1624COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
1623 const compat_uptr_t __user * argv, 1625 const compat_uptr_t __user *, argv,
1624 const compat_uptr_t __user * envp) 1626 const compat_uptr_t __user *, envp)
1625{ 1627{
1626 return compat_do_execve(getname(filename), argv, envp); 1628 return compat_do_execve(getname(filename), argv, envp);
1627} 1629}
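
[Annotation] COMPAT_SYSCALL_DEFINE3() replaces the open-coded asmlinkage prototype so the compat entry point gets the same argument marshalling and wrapper glue as native SYSCALL_DEFINEn users; the body is untouched. The shape of such a conversion, with a hypothetical syscall name and body:

	COMPAT_SYSCALL_DEFINE3(examplecall, const char __user *, path,
			       const compat_uptr_t __user *, argv,
			       const compat_uptr_t __user *, envp)
	{
		return do_examplecall(path, argv, envp);	/* hypothetical body */
	}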
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index ee4317faccb1..d1c244d67667 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1486,7 +1486,7 @@ void exofs_evict_inode(struct inode *inode)
1486 struct ore_io_state *ios; 1486 struct ore_io_state *ios;
1487 int ret; 1487 int ret;
1488 1488
1489 truncate_inode_pages(&inode->i_data, 0); 1489 truncate_inode_pages_final(&inode->i_data);
1490 1490
1491 /* TODO: should do better here */ 1491 /* TODO: should do better here */
1492 if (inode->i_nlink || is_bad_inode(inode)) 1492 if (inode->i_nlink || is_bad_inode(inode))
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 94ed36849b71..b1d2a4675d42 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -78,7 +78,7 @@ void ext2_evict_inode(struct inode * inode)
78 dquot_drop(inode); 78 dquot_drop(inode);
79 } 79 }
80 80
81 truncate_inode_pages(&inode->i_data, 0); 81 truncate_inode_pages_final(&inode->i_data);
82 82
83 if (want_delete) { 83 if (want_delete) {
84 sb_start_intwrite(inode->i_sb); 84 sb_start_intwrite(inode->i_sb);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 20d6697bd638..d260115c0350 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1254 unsigned long old_sb_flags; 1254 unsigned long old_sb_flags;
1255 int err; 1255 int err;
1256 1256
1257 sync_filesystem(sb);
1257 spin_lock(&sbi->s_lock); 1258 spin_lock(&sbi->s_lock);
1258 1259
1259 /* Store the old options */ 1260 /* Store the old options */
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 384b6ebb655f..efce2bbfb5e5 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -228,7 +228,7 @@ void ext3_evict_inode (struct inode *inode)
228 log_wait_commit(journal, commit_tid); 228 log_wait_commit(journal, commit_tid);
229 filemap_write_and_wait(&inode->i_data); 229 filemap_write_and_wait(&inode->i_data);
230 } 230 }
231 truncate_inode_pages(&inode->i_data, 0); 231 truncate_inode_pages_final(&inode->i_data);
232 232
233 ext3_discard_reservation(inode); 233 ext3_discard_reservation(inode);
234 rsv = ei->i_block_alloc_info; 234 rsv = ei->i_block_alloc_info;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 37fd31ed16e7..95c6c5a6d0c5 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2649 int i; 2649 int i;
2650#endif 2650#endif
2651 2651
2652 sync_filesystem(sb);
2653
2652 /* Store the original options */ 2654 /* Store the original options */
2653 old_sb_flags = sb->s_flags; 2655 old_sb_flags = sb->s_flags;
2654 old_opts.s_mount_opt = sbi->s_mount_opt; 2656 old_opts.s_mount_opt = sbi->s_mount_opt;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d3a534fdc5ff..f1c65dc7cc0a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -31,6 +31,7 @@
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#include <linux/ratelimit.h> 32#include <linux/ratelimit.h>
33#include <crypto/hash.h> 33#include <crypto/hash.h>
34#include <linux/falloc.h>
34#ifdef __KERNEL__ 35#ifdef __KERNEL__
35#include <linux/compat.h> 36#include <linux/compat.h>
36#endif 37#endif
@@ -567,6 +568,8 @@ enum {
567#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 568#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
568 /* Do not put hole in extent cache */ 569 /* Do not put hole in extent cache */
569#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 570#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
571 /* Convert written extents to unwritten */
572#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
570 573
571/* 574/*
572 * The bit position of these flags must not overlap with any of the 575 * The bit position of these flags must not overlap with any of the
@@ -998,6 +1001,8 @@ struct ext4_inode_info {
998#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group 1001#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
999 size of blocksize * 8 1002 size of blocksize * 8
1000 blocks */ 1003 blocks */
1004#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
1005 file systems */
1001 1006
1002#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 1007#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
1003 ~EXT4_MOUNT_##opt 1008 ~EXT4_MOUNT_##opt
@@ -1326,6 +1331,7 @@ struct ext4_sb_info {
1326 struct list_head s_es_lru; 1331 struct list_head s_es_lru;
1327 unsigned long s_es_last_sorted; 1332 unsigned long s_es_last_sorted;
1328 struct percpu_counter s_extent_cache_cnt; 1333 struct percpu_counter s_extent_cache_cnt;
1334 struct mb_cache *s_mb_cache;
1329 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1335 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
1330 1336
1331 /* Ratelimit ext4 messages. */ 1337 /* Ratelimit ext4 messages. */
@@ -2133,8 +2139,6 @@ extern int ext4_writepage_trans_blocks(struct inode *);
2133extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 2139extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
2134extern int ext4_block_truncate_page(handle_t *handle, 2140extern int ext4_block_truncate_page(handle_t *handle,
2135 struct address_space *mapping, loff_t from); 2141 struct address_space *mapping, loff_t from);
2136extern int ext4_block_zero_page_range(handle_t *handle,
2137 struct address_space *mapping, loff_t from, loff_t length);
2138extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 2142extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
2139 loff_t lstart, loff_t lend); 2143 loff_t lstart, loff_t lend);
2140extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2144extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -2757,6 +2761,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
2757extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2761extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2758 __u64 start, __u64 len); 2762 __u64 start, __u64 len);
2759extern int ext4_ext_precache(struct inode *inode); 2763extern int ext4_ext_precache(struct inode *inode);
2764extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
2760 2765
2761/* move_extent.c */ 2766/* move_extent.c */
2762extern void ext4_double_down_write_data_sem(struct inode *first, 2767extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2766,6 +2771,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
2766extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, 2771extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2767 __u64 start_orig, __u64 start_donor, 2772 __u64 start_orig, __u64 start_donor,
2768 __u64 len, __u64 *moved_len); 2773 __u64 len, __u64 *moved_len);
2774extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
2775 struct ext4_extent **extent);
2769 2776
2770/* page-io.c */ 2777/* page-io.c */
2771extern int __init ext4_init_pageio(void); 2778extern int __init ext4_init_pageio(void);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3fe29de832c8..c3fb607413ed 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -259,6 +259,16 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
259 if (WARN_ON_ONCE(err)) { 259 if (WARN_ON_ONCE(err)) {
260 ext4_journal_abort_handle(where, line, __func__, bh, 260 ext4_journal_abort_handle(where, line, __func__, bh,
261 handle, err); 261 handle, err);
262 if (inode == NULL) {
263 pr_err("EXT4: jbd2_journal_dirty_metadata "
264 "failed: handle type %u started at "
265 "line %u, credits %u/%u, errcode %d",
266 handle->h_type,
267 handle->h_line_no,
268 handle->h_requested_credits,
269 handle->h_buffer_credits, err);
270 return err;
271 }
262 ext4_error_inode(inode, where, line, 272 ext4_error_inode(inode, where, line,
263 bh->b_blocknr, 273 bh->b_blocknr,
264 "journal_dirty_metadata failed: " 274 "journal_dirty_metadata failed: "
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74bc2d549c58..82df3ce9874a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -37,7 +37,6 @@
37#include <linux/quotaops.h> 37#include <linux/quotaops.h>
38#include <linux/string.h> 38#include <linux/string.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/falloc.h>
41#include <asm/uaccess.h> 40#include <asm/uaccess.h>
42#include <linux/fiemap.h> 41#include <linux/fiemap.h>
43#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
@@ -1691,7 +1690,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1691 * the extent that was written properly split out and conversion to 1690 * the extent that was written properly split out and conversion to
1692 * initialized is trivial. 1691 * initialized is trivial.
1693 */ 1692 */
1694 if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) 1693 if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2))
1695 return 0; 1694 return 0;
1696 1695
1697 ext1_ee_len = ext4_ext_get_actual_len(ex1); 1696 ext1_ee_len = ext4_ext_get_actual_len(ex1);
@@ -1708,6 +1707,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1708 */ 1707 */
1709 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1708 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1710 return 0; 1709 return 0;
1710 if (ext4_ext_is_uninitialized(ex1) &&
1711 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1712 atomic_read(&EXT4_I(inode)->i_unwritten) ||
1713 (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN)))
1714 return 0;
1711#ifdef AGGRESSIVE_TEST 1715#ifdef AGGRESSIVE_TEST
1712 if (ext1_ee_len >= 4) 1716 if (ext1_ee_len >= 4)
1713 return 0; 1717 return 0;
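
[Annotation] The uninitialized state of an extent is encoded in ee_len itself (values above EXT_INIT_MAX_LEN), so storing the raw sum of the two actual lengths silently clears it; the uninit save/re-mark pair around each length update preserves the flag. The pattern, isolated:

	uninit = ext4_ext_is_uninitialized(ex);		/* remember the bit   */
	ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
				 + ext4_ext_get_actual_len(ex + 1));
	if (uninit)					/* the raw sum lost it */
		ext4_ext_mark_uninitialized(ex);	/* so put it back     */

The ext4_ext_insert_extent hunks below repeat this pattern around every ee_len rewrite.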
@@ -1731,7 +1735,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
1731{ 1735{
1732 struct ext4_extent_header *eh; 1736 struct ext4_extent_header *eh;
1733 unsigned int depth, len; 1737 unsigned int depth, len;
1734 int merge_done = 0; 1738 int merge_done = 0, uninit;
1735 1739
1736 depth = ext_depth(inode); 1740 depth = ext_depth(inode);
1737 BUG_ON(path[depth].p_hdr == NULL); 1741 BUG_ON(path[depth].p_hdr == NULL);
@@ -1741,8 +1745,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
1741 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1745 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1742 break; 1746 break;
1743 /* merge with next extent! */ 1747 /* merge with next extent! */
1748 uninit = ext4_ext_is_uninitialized(ex);
1744 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1749 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1745 + ext4_ext_get_actual_len(ex + 1)); 1750 + ext4_ext_get_actual_len(ex + 1));
1751 if (uninit)
1752 ext4_ext_mark_uninitialized(ex);
1746 1753
1747 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1754 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1748 len = (EXT_LAST_EXTENT(eh) - ex - 1) 1755 len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1896,7 +1903,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1896 struct ext4_ext_path *npath = NULL; 1903 struct ext4_ext_path *npath = NULL;
1897 int depth, len, err; 1904 int depth, len, err;
1898 ext4_lblk_t next; 1905 ext4_lblk_t next;
1899 int mb_flags = 0; 1906 int mb_flags = 0, uninit;
1900 1907
1901 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1908 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1902 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1909 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1946,9 +1953,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1946 path + depth); 1953 path + depth);
1947 if (err) 1954 if (err)
1948 return err; 1955 return err;
1949 1956 uninit = ext4_ext_is_uninitialized(ex);
1950 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1957 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1951 + ext4_ext_get_actual_len(newext)); 1958 + ext4_ext_get_actual_len(newext));
1959 if (uninit)
1960 ext4_ext_mark_uninitialized(ex);
1952 eh = path[depth].p_hdr; 1961 eh = path[depth].p_hdr;
1953 nearex = ex; 1962 nearex = ex;
1954 goto merge; 1963 goto merge;
@@ -1971,10 +1980,13 @@ prepend:
1971 if (err) 1980 if (err)
1972 return err; 1981 return err;
1973 1982
1983 uninit = ext4_ext_is_uninitialized(ex);
1974 ex->ee_block = newext->ee_block; 1984 ex->ee_block = newext->ee_block;
1975 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); 1985 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
1976 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1986 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1977 + ext4_ext_get_actual_len(newext)); 1987 + ext4_ext_get_actual_len(newext));
1988 if (uninit)
1989 ext4_ext_mark_uninitialized(ex);
1978 eh = path[depth].p_hdr; 1990 eh = path[depth].p_hdr;
1979 nearex = ex; 1991 nearex = ex;
1980 goto merge; 1992 goto merge;
@@ -2585,6 +2597,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2585 ex_ee_block = le32_to_cpu(ex->ee_block); 2597 ex_ee_block = le32_to_cpu(ex->ee_block);
2586 ex_ee_len = ext4_ext_get_actual_len(ex); 2598 ex_ee_len = ext4_ext_get_actual_len(ex);
2587 2599
2600 /*
2601 * If we're starting with an extent other than the last one in the
2602 * node, we need to see if it shares a cluster with the extent to
2603 * the right (towards the end of the file). If its leftmost cluster
2604 * is this extent's rightmost cluster and it is not cluster aligned,
2605 * we'll mark it as a partial that is not to be deallocated.
2606 */
2607
2608 if (ex != EXT_LAST_EXTENT(eh)) {
2609 ext4_fsblk_t current_pblk, right_pblk;
2610 long long current_cluster, right_cluster;
2611
2612 current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2613 current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
2614 right_pblk = ext4_ext_pblock(ex + 1);
2615 right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
2616 if (current_cluster == right_cluster &&
2617 EXT4_PBLK_COFF(sbi, right_pblk))
2618 *partial_cluster = -right_cluster;
2619 }
2620
2588 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2621 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2589 2622
2590 while (ex >= EXT_FIRST_EXTENT(eh) && 2623 while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2710,10 +2743,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2710 err = ext4_ext_correct_indexes(handle, inode, path); 2743 err = ext4_ext_correct_indexes(handle, inode, path);
2711 2744
2712 /* 2745 /*
2713 * Free the partial cluster only if the current extent does not 2746 * If there's a partial cluster and at least one extent remains in
2714 * reference it. Otherwise we might free used cluster. 2747 * the leaf, free the partial cluster if it isn't shared with the
2748 * current extent. If there's a partial cluster and no extents
2749 * remain in the leaf, it can't be freed here. It can only be
2750 * freed when it's possible to determine if it's not shared with
2751 * any other extent - when the next leaf is processed or when space
2752 * removal is complete.
2715 */ 2753 */
2716 if (*partial_cluster > 0 && 2754 if (*partial_cluster > 0 && eh->eh_entries &&
2717 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2755 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2718 *partial_cluster)) { 2756 *partial_cluster)) {
2719 int flags = get_default_free_blocks_flags(inode); 2757 int flags = get_default_free_blocks_flags(inode);
@@ -3569,6 +3607,8 @@ out:
3569 * b> Splits in two extents: Write is happening at either end of the extent 3607 * b> Splits in two extents: Write is happening at either end of the extent
3570 * c> Splits in three extents: Someone is writing in the middle of the extent 3608 * c> Splits in three extents: Someone is writing in the middle of the extent
3571 * 3609 *
3610 * This works the same way in the case of initialized -> unwritten conversion.
3611 *
3572 * One or more index blocks may be needed if the extent tree grows after 3612 * One or more index blocks may be needed if the extent tree grows after
3573 * the uninitialized extent split. To prevent ENOSPC from occurring when the 3613 * the uninitialized extent split. To prevent ENOSPC from occurring when the
3574 * IO completes, we need to split the uninitialized extent before DIO submit 3614 * IO completes, we need to split the uninitialized extent before DIO submit
@@ -3579,7 +3619,7 @@ out:
3579 * 3619 *
3580 * Returns the size of uninitialized extent to be written on success. 3620 * Returns the size of uninitialized extent to be written on success.
3581 */ 3621 */
3582static int ext4_split_unwritten_extents(handle_t *handle, 3622static int ext4_split_convert_extents(handle_t *handle,
3583 struct inode *inode, 3623 struct inode *inode,
3584 struct ext4_map_blocks *map, 3624 struct ext4_map_blocks *map,
3585 struct ext4_ext_path *path, 3625 struct ext4_ext_path *path,
@@ -3591,9 +3631,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3591 unsigned int ee_len; 3631 unsigned int ee_len;
3592 int split_flag = 0, depth; 3632 int split_flag = 0, depth;
3593 3633
3594 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3634 ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3595 "block %llu, max_blocks %u\n", inode->i_ino, 3635 __func__, inode->i_ino,
3596 (unsigned long long)map->m_lblk, map->m_len); 3636 (unsigned long long)map->m_lblk, map->m_len);
3597 3637
3598 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3638 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3599 inode->i_sb->s_blocksize_bits; 3639 inode->i_sb->s_blocksize_bits;
@@ -3608,14 +3648,73 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3608 ee_block = le32_to_cpu(ex->ee_block); 3648 ee_block = le32_to_cpu(ex->ee_block);
3609 ee_len = ext4_ext_get_actual_len(ex); 3649 ee_len = ext4_ext_get_actual_len(ex);
3610 3650
3611 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3651 /* Convert to unwritten */
3612 split_flag |= EXT4_EXT_MARK_UNINIT2; 3652 if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3613 if (flags & EXT4_GET_BLOCKS_CONVERT) 3653 split_flag |= EXT4_EXT_DATA_VALID1;
3614 split_flag |= EXT4_EXT_DATA_VALID2; 3654 /* Convert to initialized */
3655 } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3656 split_flag |= ee_block + ee_len <= eof_block ?
3657 EXT4_EXT_MAY_ZEROOUT : 0;
3658 split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
3659 }
3615 flags |= EXT4_GET_BLOCKS_PRE_IO; 3660 flags |= EXT4_GET_BLOCKS_PRE_IO;
3616 return ext4_split_extent(handle, inode, path, map, split_flag, flags); 3661 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3617} 3662}
3618 3663
3664static int ext4_convert_initialized_extents(handle_t *handle,
3665 struct inode *inode,
3666 struct ext4_map_blocks *map,
3667 struct ext4_ext_path *path)
3668{
3669 struct ext4_extent *ex;
3670 ext4_lblk_t ee_block;
3671 unsigned int ee_len;
3672 int depth;
3673 int err = 0;
3674
3675 depth = ext_depth(inode);
3676 ex = path[depth].p_ext;
3677 ee_block = le32_to_cpu(ex->ee_block);
3678 ee_len = ext4_ext_get_actual_len(ex);
3679
3680 ext_debug("%s: inode %lu, logical "
3681 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3682 (unsigned long long)ee_block, ee_len);
3683
3684 if (ee_block != map->m_lblk || ee_len > map->m_len) {
3685 err = ext4_split_convert_extents(handle, inode, map, path,
3686 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3687 if (err < 0)
3688 goto out;
3689 ext4_ext_drop_refs(path);
3690 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3691 if (IS_ERR(path)) {
3692 err = PTR_ERR(path);
3693 goto out;
3694 }
3695 depth = ext_depth(inode);
3696 ex = path[depth].p_ext;
3697 }
3698
3699 err = ext4_ext_get_access(handle, inode, path + depth);
3700 if (err)
3701 goto out;
3702 /* first mark the extent as uninitialized */
3703 ext4_ext_mark_uninitialized(ex);
3704
3705 /* note: ext4_ext_correct_indexes() isn't needed here because
3706 * borders are not changed
3707 */
3708 ext4_ext_try_to_merge(handle, inode, path, ex);
3709
3710 /* Mark modified extent as dirty */
3711 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3712out:
3713 ext4_ext_show_leaf(inode, path);
3714 return err;
3715}
3716
3717
3619static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3718static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3620 struct inode *inode, 3719 struct inode *inode,
3621 struct ext4_map_blocks *map, 3720 struct ext4_map_blocks *map,
@@ -3649,8 +3748,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3649 inode->i_ino, (unsigned long long)ee_block, ee_len, 3748 inode->i_ino, (unsigned long long)ee_block, ee_len,
3650 (unsigned long long)map->m_lblk, map->m_len); 3749 (unsigned long long)map->m_lblk, map->m_len);
3651#endif 3750#endif
3652 err = ext4_split_unwritten_extents(handle, inode, map, path, 3751 err = ext4_split_convert_extents(handle, inode, map, path,
3653 EXT4_GET_BLOCKS_CONVERT); 3752 EXT4_GET_BLOCKS_CONVERT);
3654 if (err < 0) 3753 if (err < 0)
3655 goto out; 3754 goto out;
3656 ext4_ext_drop_refs(path); 3755 ext4_ext_drop_refs(path);
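
[Annotation] ext4_split_convert_extents() now drives splits in both directions, selecting split flags from the caller's GET_BLOCKS flag: written-to-unwritten conversion asserts that the first half's data stays valid (DATA_VALID1), while the endio unwritten-to-written path keeps the old MAY_ZEROOUT/MARK_UNINIT2/DATA_VALID2 combination. Restated compactly (same logic as the hunk above):

	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
		split_flag |= EXT4_EXT_DATA_VALID1;	/* written -> unwritten */
	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {	/* unwritten -> written */
		split_flag |= ee_block + ee_len <= eof_block ?
			      EXT4_EXT_MAY_ZEROOUT : 0;
		split_flag |= EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2;
	}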
@@ -3851,6 +3950,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3851} 3950}
3852 3951
3853static int 3952static int
3953ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
3954 struct ext4_map_blocks *map,
3955 struct ext4_ext_path *path, int flags,
3956 unsigned int allocated, ext4_fsblk_t newblock)
3957{
3958 int ret = 0;
3959 int err = 0;
3960
3961 /*
3962 * Make sure that the extent is no bigger than we support for
3963 * an uninitialized extent
3964 */
3965 if (map->m_len > EXT_UNINIT_MAX_LEN)
3966 map->m_len = EXT_UNINIT_MAX_LEN / 2;
3967
3968 ret = ext4_convert_initialized_extents(handle, inode, map,
3969 path);
3970 if (ret >= 0) {
3971 ext4_update_inode_fsync_trans(handle, inode, 1);
3972 err = check_eofblocks_fl(handle, inode, map->m_lblk,
3973 path, map->m_len);
3974 } else
3975 err = ret;
3976 map->m_flags |= EXT4_MAP_UNWRITTEN;
3977 if (allocated > map->m_len)
3978 allocated = map->m_len;
3979 map->m_len = allocated;
3980
3981 return err ? err : allocated;
3982}
3983
3984static int
3854ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3985ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3855 struct ext4_map_blocks *map, 3986 struct ext4_map_blocks *map,
3856 struct ext4_ext_path *path, int flags, 3987 struct ext4_ext_path *path, int flags,
@@ -3877,8 +4008,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3877 4008
3878 /* get_block() before submit the IO, split the extent */ 4009 /* get_block() before submit the IO, split the extent */
3879 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 4010 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3880 ret = ext4_split_unwritten_extents(handle, inode, map, 4011 ret = ext4_split_convert_extents(handle, inode, map,
3881 path, flags); 4012 path, flags | EXT4_GET_BLOCKS_CONVERT);
3882 if (ret <= 0) 4013 if (ret <= 0)
3883 goto out; 4014 goto out;
3884 /* 4015 /*
@@ -3993,10 +4124,6 @@ out1:
3993 map->m_pblk = newblock; 4124 map->m_pblk = newblock;
3994 map->m_len = allocated; 4125 map->m_len = allocated;
3995out2: 4126out2:
3996 if (path) {
3997 ext4_ext_drop_refs(path);
3998 kfree(path);
3999 }
4000 return err ? err : allocated; 4127 return err ? err : allocated;
4001} 4128}
4002 4129
@@ -4128,7 +4255,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4128 struct ext4_extent newex, *ex, *ex2; 4255 struct ext4_extent newex, *ex, *ex2;
4129 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4256 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4130 ext4_fsblk_t newblock = 0; 4257 ext4_fsblk_t newblock = 0;
4131 int free_on_err = 0, err = 0, depth; 4258 int free_on_err = 0, err = 0, depth, ret;
4132 unsigned int allocated = 0, offset = 0; 4259 unsigned int allocated = 0, offset = 0;
4133 unsigned int allocated_clusters = 0; 4260 unsigned int allocated_clusters = 0;
4134 struct ext4_allocation_request ar; 4261 struct ext4_allocation_request ar;
@@ -4170,6 +4297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4170 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4297 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4171 unsigned short ee_len; 4298 unsigned short ee_len;
4172 4299
4300
4173 /* 4301 /*
4174 * Uninitialized extents are treated as holes, except that 4302 * Uninitialized extents are treated as holes, except that
4175 * we split out initialized portions during a write. 4303 * we split out initialized portions during a write.
@@ -4186,13 +4314,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4186 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 4314 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4187 ee_block, ee_len, newblock); 4315 ee_block, ee_len, newblock);
4188 4316
4189 if (!ext4_ext_is_uninitialized(ex)) 4317 /*
4318 * If the extent is initialized check whether the
4319 * caller wants to convert it to unwritten.
4320 */
4321 if ((!ext4_ext_is_uninitialized(ex)) &&
4322 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4323 allocated = ext4_ext_convert_initialized_extent(
4324 handle, inode, map, path, flags,
4325 allocated, newblock);
4326 goto out2;
4327 } else if (!ext4_ext_is_uninitialized(ex))
4190 goto out; 4328 goto out;
4191 4329
4192 allocated = ext4_ext_handle_uninitialized_extents( 4330 ret = ext4_ext_handle_uninitialized_extents(
4193 handle, inode, map, path, flags, 4331 handle, inode, map, path, flags,
4194 allocated, newblock); 4332 allocated, newblock);
4195 goto out3; 4333 if (ret < 0)
4334 err = ret;
4335 else
4336 allocated = ret;
4337 goto out2;
4196 } 4338 }
4197 } 4339 }
4198 4340
@@ -4473,7 +4615,6 @@ out2:
4473 kfree(path); 4615 kfree(path);
4474 } 4616 }
4475 4617
4476out3:
4477 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4618 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4478 err ? err : allocated); 4619 err ? err : allocated);
4479 ext4_es_lru_add(inode); 4620 ext4_es_lru_add(inode);
@@ -4514,34 +4655,200 @@ retry:
4514 ext4_std_error(inode->i_sb, err); 4655 ext4_std_error(inode->i_sb, err);
4515} 4656}
4516 4657
4517static void ext4_falloc_update_inode(struct inode *inode, 4658static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4518 int mode, loff_t new_size, int update_ctime) 4659 ext4_lblk_t len, int flags, int mode)
4519{ 4660{
4520 struct timespec now; 4661 struct inode *inode = file_inode(file);
4662 handle_t *handle;
4663 int ret = 0;
4664 int ret2 = 0;
4665 int retries = 0;
4666 struct ext4_map_blocks map;
4667 unsigned int credits;
4521 4668
4522 if (update_ctime) { 4669 map.m_lblk = offset;
4523 now = current_fs_time(inode->i_sb); 4670 /*
4524 if (!timespec_equal(&inode->i_ctime, &now)) 4671 * Don't normalize the request if it can fit in one extent so
4525 inode->i_ctime = now; 4672 * that it doesn't get unnecessarily split into multiple
4673 * extents.
4674 */
4675 if (len <= EXT_UNINIT_MAX_LEN)
4676 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4677
4678 /*
4679 * credits to insert 1 extent into extent tree
4680 */
4681 credits = ext4_chunk_trans_blocks(inode, len);
4682
4683retry:
4684 while (ret >= 0 && ret < len) {
4685 map.m_lblk = map.m_lblk + ret;
4686 map.m_len = len = len - ret;
4687 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4688 credits);
4689 if (IS_ERR(handle)) {
4690 ret = PTR_ERR(handle);
4691 break;
4692 }
4693 ret = ext4_map_blocks(handle, inode, &map, flags);
4694 if (ret <= 0) {
4695 ext4_debug("inode #%lu: block %u: len %u: "
4696 "ext4_ext_map_blocks returned %d",
4697 inode->i_ino, map.m_lblk,
4698 map.m_len, ret);
4699 ext4_mark_inode_dirty(handle, inode);
4700 ret2 = ext4_journal_stop(handle);
4701 break;
4702 }
4703 ret2 = ext4_journal_stop(handle);
4704 if (ret2)
4705 break;
4706 }
4707 if (ret == -ENOSPC &&
4708 ext4_should_retry_alloc(inode->i_sb, &retries)) {
4709 ret = 0;
4710 goto retry;
4526 } 4711 }
4712
4713 return ret > 0 ? ret2 : ret;
4714}
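
The new ext4_alloc_file_blocks() helper maps the requested range a chunk at a
time: each pass starts a journal handle with credits for a single extent
insertion, lets ext4_map_blocks() map as much as it can, stops the handle, and
the whole loop is retried if the allocator reported ENOSPC but
ext4_should_retry_alloc() thinks another attempt could succeed. Below is a
minimal stand-alone sketch of that chunked, restartable loop; map_some_blocks()
is a made-up stand-in for ext4_map_blocks(), not an ext4 API.

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in: maps at most 8 blocks per call and fails
 * once with ENOSPC to exercise the retry path. */
static int map_some_blocks(unsigned start, unsigned len)
{
        static int enospc_once = 1;

        if (enospc_once && start > 0) {
                enospc_once = 0;
                return -ENOSPC;
        }
        return len > 8 ? 8 : (int)len;
}

static int alloc_file_blocks(unsigned offset, unsigned len)
{
        unsigned lblk = offset;
        int ret = 0, retries = 0;

retry:
        while (ret >= 0 && ret < (int)len) {
                lblk += ret;
                len -= ret;
                /* a real implementation starts a journal handle here... */
                ret = map_some_blocks(lblk, len);
                /* ...and stops it here, bailing out on errors */
        }
        if (ret == -ENOSPC && retries++ < 3) {  /* bounded retry on ENOSPC */
                ret = 0;
                goto retry;
        }
        return ret < 0 ? ret : 0;
}

int main(void)
{
        printf("alloc_file_blocks -> %d\n", alloc_file_blocks(0, 20));
        return 0;
}
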
4715
4716static long ext4_zero_range(struct file *file, loff_t offset,
4717 loff_t len, int mode)
4718{
4719 struct inode *inode = file_inode(file);
4720 handle_t *handle = NULL;
4721 unsigned int max_blocks;
4722 loff_t new_size = 0;
4723 int ret = 0;
4724 int flags;
4725 int partial;
4726 loff_t start, end;
4727 ext4_lblk_t lblk;
4728 struct address_space *mapping = inode->i_mapping;
4729 unsigned int blkbits = inode->i_blkbits;
4730
4731 trace_ext4_zero_range(inode, offset, len, mode);
4732
4733 /*
4734 * Write out all dirty pages to avoid race conditions,
4735 * then release them.
4736 */
4737 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4738 ret = filemap_write_and_wait_range(mapping, offset,
4739 offset + len - 1);
4740 if (ret)
4741 return ret;
4742 }
4743
4527 /* 4744 /*
4528 * Update only when preallocation was requested beyond 4745 * Round up offset. This is not fallocate, we need to zero out
4529 * the file size. 4746 * blocks, so convert the interior block-aligned part of the range to
4747 * unwritten and possibly manually zero out unaligned parts of the
4748 * range.
4530 */ 4749 */
4531 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 4750 start = round_up(offset, 1 << blkbits);
4751 end = round_down((offset + len), 1 << blkbits);
4752
4753 if (start < offset || end > offset + len)
4754 return -EINVAL;
4755 partial = (offset + len) & ((1 << blkbits) - 1);
4756
4757 lblk = start >> blkbits;
4758 max_blocks = (end >> blkbits);
4759 if (max_blocks < lblk)
4760 max_blocks = 0;
4761 else
4762 max_blocks -= lblk;
4763
4764 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
4765 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
4766 if (mode & FALLOC_FL_KEEP_SIZE)
4767 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4768
4769 mutex_lock(&inode->i_mutex);
4770
4771 /*
4772 * Indirect files do not support unwritten extents
4773 */
4774 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4775 ret = -EOPNOTSUPP;
4776 goto out_mutex;
4777 }
4778
4779 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4780 offset + len > i_size_read(inode)) {
4781 new_size = offset + len;
4782 ret = inode_newsize_ok(inode, new_size);
4783 if (ret)
4784 goto out_mutex;
4785 /*
4786 * If we have a partial block after EOF we have to allocate
4787 * the entire block.
4788 */
4789 if (partial)
4790 max_blocks += 1;
4791 }
4792
4793 if (max_blocks > 0) {
4794
4795 /* Now release the pages and zero the block-aligned part of the pages */
4796 truncate_pagecache_range(inode, start, end - 1);
4797
4798 /* Wait for all existing dio workers; newcomers will block on i_mutex */
4799 ext4_inode_block_unlocked_dio(inode);
4800 inode_dio_wait(inode);
4801
4802 /*
4803 * Remove entire range from the extent status tree.
4804 */
4805 ret = ext4_es_remove_extent(inode, lblk, max_blocks);
4806 if (ret)
4807 goto out_dio;
4808
4809 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
4810 mode);
4811 if (ret)
4812 goto out_dio;
4813 }
4814
4815 handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
4816 if (IS_ERR(handle)) {
4817 ret = PTR_ERR(handle);
4818 ext4_std_error(inode->i_sb, ret);
4819 goto out_dio;
4820 }
4821
4822 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4823
4824 if (new_size) {
4532 if (new_size > i_size_read(inode)) 4825 if (new_size > i_size_read(inode))
4533 i_size_write(inode, new_size); 4826 i_size_write(inode, new_size);
4534 if (new_size > EXT4_I(inode)->i_disksize) 4827 if (new_size > EXT4_I(inode)->i_disksize)
4535 ext4_update_i_disksize(inode, new_size); 4828 ext4_update_i_disksize(inode, new_size);
4536 } else { 4829 } else {
4537 /* 4830 /*
4538 * Mark that we allocate beyond EOF so the subsequent truncate 4831 * Mark that we allocate beyond EOF so the subsequent truncate
4539 * can proceed even if the new size is the same as i_size. 4832 * can proceed even if the new size is the same as i_size.
4540 */ 4833 */
4541 if (new_size > i_size_read(inode)) 4834 if ((offset + len) > i_size_read(inode))
4542 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4835 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4543 } 4836 }
4544 4837
4838 ext4_mark_inode_dirty(handle, inode);
4839
4840 /* Zero out partial block at the edges of the range */
4841 ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4842
4843 if (file->f_flags & O_SYNC)
4844 ext4_handle_sync(handle);
4845
4846 ext4_journal_stop(handle);
4847out_dio:
4848 ext4_inode_resume_unlocked_dio(inode);
4849out_mutex:
4850 mutex_unlock(&inode->i_mutex);
4851 return ret;
4545} 4852}
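
ext4_zero_range() backs the new FALLOC_FL_ZERO_RANGE mode: the interior,
block-aligned part of the range is converted to unwritten extents and only the
unaligned edges are zeroed through the page cache, so no data blocks are
actually written. From user space the path is reached through fallocate(2); a
minimal sketch, with "testfile" as a placeholder path:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Zero 1 MiB at offset 4096; i_size grows if the range ends past EOF */
        if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 4096, 1 << 20))
                perror("FALLOC_FL_ZERO_RANGE");
        /* With KEEP_SIZE the blocks are zeroed but i_size is left untouched */
        if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
                      4096, 1 << 20))
                perror("FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE");
        close(fd);
        return 0;
}
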
4546 4853
4547/* 4854/*
@@ -4555,22 +4862,25 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4555{ 4862{
4556 struct inode *inode = file_inode(file); 4863 struct inode *inode = file_inode(file);
4557 handle_t *handle; 4864 handle_t *handle;
4558 loff_t new_size; 4865 loff_t new_size = 0;
4559 unsigned int max_blocks; 4866 unsigned int max_blocks;
4560 int ret = 0; 4867 int ret = 0;
4561 int ret2 = 0;
4562 int retries = 0;
4563 int flags; 4868 int flags;
4564 struct ext4_map_blocks map; 4869 ext4_lblk_t lblk;
4565 unsigned int credits, blkbits = inode->i_blkbits; 4870 struct timespec tv;
4871 unsigned int blkbits = inode->i_blkbits;
4566 4872
4567 /* Return error if mode is not supported */ 4873 /* Return error if mode is not supported */
4568 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 4874 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4875 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
4569 return -EOPNOTSUPP; 4876 return -EOPNOTSUPP;
4570 4877
4571 if (mode & FALLOC_FL_PUNCH_HOLE) 4878 if (mode & FALLOC_FL_PUNCH_HOLE)
4572 return ext4_punch_hole(inode, offset, len); 4879 return ext4_punch_hole(inode, offset, len);
4573 4880
4881 if (mode & FALLOC_FL_COLLAPSE_RANGE)
4882 return ext4_collapse_range(inode, offset, len);
4883
4574 ret = ext4_convert_inline_data(inode); 4884 ret = ext4_convert_inline_data(inode);
4575 if (ret) 4885 if (ret)
4576 return ret; 4886 return ret;
@@ -4582,83 +4892,66 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4582 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4892 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4583 return -EOPNOTSUPP; 4893 return -EOPNOTSUPP;
4584 4894
4895 if (mode & FALLOC_FL_ZERO_RANGE)
4896 return ext4_zero_range(file, offset, len, mode);
4897
4585 trace_ext4_fallocate_enter(inode, offset, len, mode); 4898 trace_ext4_fallocate_enter(inode, offset, len, mode);
4586 map.m_lblk = offset >> blkbits; 4899 lblk = offset >> blkbits;
4587 /* 4900 /*
4588 * We can't just convert len to max_blocks: with 4901 * We can't just convert len to max_blocks: with
4589 * blocksize = 4096, offset = 3072 and len = 2048 the range spans two blocks 4902 * blocksize = 4096, offset = 3072 and len = 2048 the range spans two blocks
4590 */ 4903 */
4591 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 4904 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4592 - map.m_lblk; 4905 - lblk;
4593 /* 4906
4594 * credits to insert 1 extent into extent tree
4595 */
4596 credits = ext4_chunk_trans_blocks(inode, max_blocks);
4597 mutex_lock(&inode->i_mutex);
4598 ret = inode_newsize_ok(inode, (len + offset));
4599 if (ret) {
4600 mutex_unlock(&inode->i_mutex);
4601 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4602 return ret;
4603 }
4604 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; 4907 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
4605 if (mode & FALLOC_FL_KEEP_SIZE) 4908 if (mode & FALLOC_FL_KEEP_SIZE)
4606 flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4909 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4607 /*
4608 * Don't normalize the request if it can fit in one extent so
4609 * that it doesn't get unnecessarily split into multiple
4610 * extents.
4611 */
4612 if (len <= EXT_UNINIT_MAX_LEN << blkbits)
4613 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4614 4910
4615retry: 4911 mutex_lock(&inode->i_mutex);
4616 while (ret >= 0 && ret < max_blocks) {
4617 map.m_lblk = map.m_lblk + ret;
4618 map.m_len = max_blocks = max_blocks - ret;
4619 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4620 credits);
4621 if (IS_ERR(handle)) {
4622 ret = PTR_ERR(handle);
4623 break;
4624 }
4625 ret = ext4_map_blocks(handle, inode, &map, flags);
4626 if (ret <= 0) {
4627#ifdef EXT4FS_DEBUG
4628 ext4_warning(inode->i_sb,
4629 "inode #%lu: block %u: len %u: "
4630 "ext4_ext_map_blocks returned %d",
4631 inode->i_ino, map.m_lblk,
4632 map.m_len, ret);
4633#endif
4634 ext4_mark_inode_dirty(handle, inode);
4635 ret2 = ext4_journal_stop(handle);
4636 break;
4637 }
4638 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4639 blkbits) >> blkbits))
4640 new_size = offset + len;
4641 else
4642 new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4643 4912
4644 ext4_falloc_update_inode(inode, mode, new_size, 4913 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4645 (map.m_flags & EXT4_MAP_NEW)); 4914 offset + len > i_size_read(inode)) {
4646 ext4_mark_inode_dirty(handle, inode); 4915 new_size = offset + len;
4647 if ((file->f_flags & O_SYNC) && ret >= max_blocks) 4916 ret = inode_newsize_ok(inode, new_size);
4648 ext4_handle_sync(handle); 4917 if (ret)
4649 ret2 = ext4_journal_stop(handle); 4918 goto out;
4650 if (ret2)
4651 break;
4652 } 4919 }
4653 if (ret == -ENOSPC && 4920
4654 ext4_should_retry_alloc(inode->i_sb, &retries)) { 4921 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
4655 ret = 0; 4922 if (ret)
4656 goto retry; 4923 goto out;
4924
4925 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
4926 if (IS_ERR(handle))
4927 goto out;
4928
4929 tv = inode->i_ctime = ext4_current_time(inode);
4930
4931 if (new_size) {
4932 if (new_size > i_size_read(inode)) {
4933 i_size_write(inode, new_size);
4934 inode->i_mtime = tv;
4935 }
4936 if (new_size > EXT4_I(inode)->i_disksize)
4937 ext4_update_i_disksize(inode, new_size);
4938 } else {
4939 /*
4940 * Mark that we allocate beyond EOF so the subsequent truncate
4941 * can proceed even if the new size is the same as i_size.
4942 */
4943 if ((offset + len) > i_size_read(inode))
4944 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4657 } 4945 }
4946 ext4_mark_inode_dirty(handle, inode);
4947 if (file->f_flags & O_SYNC)
4948 ext4_handle_sync(handle);
4949
4950 ext4_journal_stop(handle);
4951out:
4658 mutex_unlock(&inode->i_mutex); 4952 mutex_unlock(&inode->i_mutex);
4659 trace_ext4_fallocate_exit(inode, offset, max_blocks, 4953 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4660 ret > 0 ? ret2 : ret); 4954 return ret;
4661 return ret > 0 ? ret2 : ret;
4662} 4955}
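
After this rewrite ext4_fallocate() itself only validates the mode, hands
PUNCH_HOLE/COLLAPSE_RANGE/ZERO_RANGE to their dedicated handlers, and for
plain preallocation calls ext4_alloc_file_blocks() followed by a short
transaction that updates i_size/i_disksize. The user-visible difference
between preallocating with and without FALLOC_FL_KEEP_SIZE is easy to observe;
a small sketch, with "prealloc" as a placeholder file name:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static void report(int fd, const char *label)
{
        struct stat st;

        if (fstat(fd, &st) == 0)
                printf("%s: st_size=%lld st_blocks=%lld\n", label,
                       (long long)st.st_size, (long long)st.st_blocks);
}

int main(void)
{
        int fd = open("prealloc", O_RDWR | O_CREAT | O_TRUNC, 0644);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* KEEP_SIZE: blocks are reserved but i_size stays 0 */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
                perror("fallocate KEEP_SIZE");
        report(fd, "after KEEP_SIZE");

        /* mode 0: i_size is extended to cover the preallocated range */
        if (fallocate(fd, 0, 0, 1 << 20))
                perror("fallocate");
        report(fd, "after mode 0");

        close(fd);
        unlink("prealloc");
        return 0;
}
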
4663 4956
4664/* 4957/*
@@ -4869,3 +5162,304 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4869 ext4_es_lru_add(inode); 5162 ext4_es_lru_add(inode);
4870 return error; 5163 return error;
4871} 5164}
5165
5166/*
5167 * ext4_access_path:
5168 * Function to access the path buffer for marking it dirty.
5169 * It also checks if there are sufficient credits left in the journal handle
5170 * to update path.
5171 */
5172static int
5173ext4_access_path(handle_t *handle, struct inode *inode,
5174 struct ext4_ext_path *path)
5175{
5176 int credits, err;
5177
5178 if (!ext4_handle_valid(handle))
5179 return 0;
5180
5181 /*
5182 * Check if need to extend journal credits
5183 * 3 for leaf, sb, and inode plus 2 (bmap and group
5184 * descriptor) for each block group; assume two block
5185 * groups
5186 */
5187 if (handle->h_buffer_credits < 7) {
5188 credits = ext4_writepage_trans_blocks(inode);
5189 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
5190 /* EAGAIN is success */
5191 if (err && err != -EAGAIN)
5192 return err;
5193 }
5194
5195 err = ext4_ext_get_access(handle, inode, path);
5196 return err;
5197}
5198
5199/*
5200 * ext4_ext_shift_path_extents:
5201 * Shift the extents of a path structure lying between path[depth].p_ext
5202 * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
5203 * from starting block for each extent.
5204 */
5205static int
5206ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5207 struct inode *inode, handle_t *handle,
5208 ext4_lblk_t *start)
5209{
5210 int depth, err = 0;
5211 struct ext4_extent *ex_start, *ex_last;
5212 bool update = 0;
5213 depth = path->p_depth;
5214
5215 while (depth >= 0) {
5216 if (depth == path->p_depth) {
5217 ex_start = path[depth].p_ext;
5218 if (!ex_start)
5219 return -EIO;
5220
5221 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5222 if (!ex_last)
5223 return -EIO;
5224
5225 err = ext4_access_path(handle, inode, path + depth);
5226 if (err)
5227 goto out;
5228
5229 if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5230 update = 1;
5231
5232 *start = ex_last->ee_block +
5233 ext4_ext_get_actual_len(ex_last);
5234
5235 while (ex_start <= ex_last) {
5236 ex_start->ee_block -= shift;
5237 if (ex_start >
5238 EXT_FIRST_EXTENT(path[depth].p_hdr)) {
5239 if (ext4_ext_try_to_merge_right(inode,
5240 path, ex_start - 1))
5241 ex_last--;
5242 }
5243 ex_start++;
5244 }
5245 err = ext4_ext_dirty(handle, inode, path + depth);
5246 if (err)
5247 goto out;
5248
5249 if (--depth < 0 || !update)
5250 break;
5251 }
5252
5253 /* Update index too */
5254 err = ext4_access_path(handle, inode, path + depth);
5255 if (err)
5256 goto out;
5257
5258 path[depth].p_idx->ei_block -= shift;
5259 err = ext4_ext_dirty(handle, inode, path + depth);
5260 if (err)
5261 goto out;
5262
5263 /* We are done if the current index is not the starting index */
5264 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5265 break;
5266
5267 depth--;
5268 }
5269
5270out:
5271 return err;
5272}
5273
5274/*
5275 * ext4_ext_shift_extents:
5276 * All the extents which lie in the range from start to the last allocated
5277 * block for the file are shifted downwards by shift blocks.
5278 * On success, 0 is returned, error otherwise.
5279 */
5280static int
5281ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5282 ext4_lblk_t start, ext4_lblk_t shift)
5283{
5284 struct ext4_ext_path *path;
5285 int ret = 0, depth;
5286 struct ext4_extent *extent;
5287 ext4_lblk_t stop_block, current_block;
5288 ext4_lblk_t ex_start, ex_end;
5289
5290 /* Let path point to the last extent */
5291 path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
5292 if (IS_ERR(path))
5293 return PTR_ERR(path);
5294
5295 depth = path->p_depth;
5296 extent = path[depth].p_ext;
5297 if (!extent) {
5298 ext4_ext_drop_refs(path);
5299 kfree(path);
5300 return ret;
5301 }
5302
5303 stop_block = extent->ee_block + ext4_ext_get_actual_len(extent);
5304 ext4_ext_drop_refs(path);
5305 kfree(path);
5306
5307 /* Nothing to shift if the hole is at the end of the file */
5308 if (start >= stop_block)
5309 return ret;
5310
5311 /*
5312 * Don't start shifting extents until we make sure the hole is big
5313 * enough to accommodate the shift.
5314 */
5315 path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
5316 depth = path->p_depth;
5317 extent = path[depth].p_ext;
5318 ex_start = extent->ee_block;
5319 ex_end = extent->ee_block + ext4_ext_get_actual_len(extent);
5320 ext4_ext_drop_refs(path);
5321 kfree(path);
5322
5323 if ((start == ex_start && shift > ex_start) ||
5324 (shift > start - ex_end))
5325 return -EINVAL;
5326
5327 /* It's safe to start updating extents */
5328 while (start < stop_block) {
5329 path = ext4_ext_find_extent(inode, start, NULL, 0);
5330 if (IS_ERR(path))
5331 return PTR_ERR(path);
5332 depth = path->p_depth;
5333 extent = path[depth].p_ext;
5334 current_block = extent->ee_block;
5335 if (start > current_block) {
5336 /* Hole, move to the next extent */
5337 ret = mext_next_extent(inode, path, &extent);
5338 if (ret != 0) {
5339 ext4_ext_drop_refs(path);
5340 kfree(path);
5341 if (ret == 1)
5342 ret = 0;
5343 break;
5344 }
5345 }
5346 ret = ext4_ext_shift_path_extents(path, shift, inode,
5347 handle, &start);
5348 ext4_ext_drop_refs(path);
5349 kfree(path);
5350 if (ret)
5351 break;
5352 }
5353
5354 return ret;
5355}
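
ext4_ext_shift_extents() first checks that the hole in front of the range is
at least `shift` blocks wide, then walks the tree extent by extent,
subtracting the shift from each extent's (and each index's) logical start and
merging neighbours that become contiguous. A simplified in-memory model of the
operation, using a flat sorted array instead of the on-disk extent B+tree
(illustrative only, not ext4 code):

#include <stdio.h>

struct extent { unsigned lblk, len, pblk; };

/* Shift every extent starting at or after 'start' down by 'shift'
 * logical blocks, then merge neighbours that became both logically
 * and physically contiguous.  Returns the new extent count. */
static int shift_extents(struct extent *ex, int n,
                         unsigned start, unsigned shift)
{
        int i, out;

        for (i = 0; i < n; i++)
                if (ex[i].lblk >= start)
                        ex[i].lblk -= shift;

        for (out = 0, i = 1; i < n; i++) {
                struct extent *prev = &ex[out], *cur = &ex[i];

                if (prev->lblk + prev->len == cur->lblk &&
                    prev->pblk + prev->len == cur->pblk)
                        prev->len += cur->len;          /* merge right */
                else
                        ex[++out] = *cur;
        }
        return n ? out + 1 : 0;
}

int main(void)
{
        struct extent ex[] = {
                { 0, 16, 100 }, { 32, 16, 116 }, { 64, 8, 200 },
        };
        int n = shift_extents(ex, 3, 32, 16);   /* collapse blocks 16..31 */

        for (int i = 0; i < n; i++)
                printf("[%u..%u) -> pblk %u\n", ex[i].lblk,
                       ex[i].lblk + ex[i].len, ex[i].pblk);
        return 0;
}
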
5356
5357/*
5358 * ext4_collapse_range:
5359 * This implements fallocate(2)'s collapse-range functionality for ext4.
5360 * Returns 0 on success and non-zero on error.
5361 */
5362int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5363{
5364 struct super_block *sb = inode->i_sb;
5365 ext4_lblk_t punch_start, punch_stop;
5366 handle_t *handle;
5367 unsigned int credits;
5368 loff_t new_size;
5369 int ret;
5370
5371 BUG_ON(offset + len > i_size_read(inode));
5372
5373 /* Collapse range works only on fs block size aligned offsets. */
5374 if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
5375 len & (EXT4_BLOCK_SIZE(sb) - 1))
5376 return -EINVAL;
5377
5378 if (!S_ISREG(inode->i_mode))
5379 return -EOPNOTSUPP;
5380
5381 trace_ext4_collapse_range(inode, offset, len);
5382
5383 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5384 punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5385
5386 /* Write out all dirty pages */
5387 ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1);
5388 if (ret)
5389 return ret;
5390
5391 /* Take mutex lock */
5392 mutex_lock(&inode->i_mutex);
5393
5394 /* It's not possible to punch a hole in an append-only file */
5395 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
5396 ret = -EPERM;
5397 goto out_mutex;
5398 }
5399
5400 if (IS_SWAPFILE(inode)) {
5401 ret = -ETXTBSY;
5402 goto out_mutex;
5403 }
5404
5405 /* Currently just for extent based files */
5406 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5407 ret = -EOPNOTSUPP;
5408 goto out_mutex;
5409 }
5410
5411 truncate_pagecache_range(inode, offset, -1);
5412
5413 /* Wait for existing dio to complete */
5414 ext4_inode_block_unlocked_dio(inode);
5415 inode_dio_wait(inode);
5416
5417 credits = ext4_writepage_trans_blocks(inode);
5418 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5419 if (IS_ERR(handle)) {
5420 ret = PTR_ERR(handle);
5421 goto out_dio;
5422 }
5423
5424 down_write(&EXT4_I(inode)->i_data_sem);
5425 ext4_discard_preallocations(inode);
5426
5427 ret = ext4_es_remove_extent(inode, punch_start,
5428 EXT_MAX_BLOCKS - punch_start - 1);
5429 if (ret) {
5430 up_write(&EXT4_I(inode)->i_data_sem);
5431 goto out_stop;
5432 }
5433
5434 ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5435 if (ret) {
5436 up_write(&EXT4_I(inode)->i_data_sem);
5437 goto out_stop;
5438 }
5439
5440 ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5441 punch_stop - punch_start);
5442 if (ret) {
5443 up_write(&EXT4_I(inode)->i_data_sem);
5444 goto out_stop;
5445 }
5446
5447 new_size = i_size_read(inode) - len;
5448 truncate_setsize(inode, new_size);
5449 EXT4_I(inode)->i_disksize = new_size;
5450
5451 ext4_discard_preallocations(inode);
5452 up_write(&EXT4_I(inode)->i_data_sem);
5453 if (IS_SYNC(inode))
5454 ext4_handle_sync(handle);
5455 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
5456 ext4_mark_inode_dirty(handle, inode);
5457
5458out_stop:
5459 ext4_journal_stop(handle);
5460out_dio:
5461 ext4_inode_resume_unlocked_dio(inode);
5462out_mutex:
5463 mutex_unlock(&inode->i_mutex);
5464 return ret;
5465}
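
ext4_collapse_range() deletes the blocks in [offset, offset + len) and shifts
everything behind them downwards, shrinking the file by len bytes; that is why
both offset and len must be multiples of the filesystem block size and the
caller must keep offset + len within i_size (note the BUG_ON above). From user
space this is fallocate(2) with FALLOC_FL_COLLAPSE_RANGE; a minimal sketch,
where "bigfile" is a placeholder and st_blksize merely approximates the fs
block size:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        struct stat st;
        int fd = open("bigfile", O_RDWR);

        if (fd < 0 || fstat(fd, &st)) {
                perror("open/fstat");
                return 1;
        }
        /* offset and len must both be block aligned, and the collapsed
         * range must end before EOF, otherwise the call fails */
        if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE,
                      st.st_blksize, 4 * (off_t)st.st_blksize))
                perror("FALLOC_FL_COLLAPSE_RANGE");
        close(fd);
        return 0;
}
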
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 3981ff783950..0a014a7194b2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode)
184 while (node) { 184 while (node) {
185 struct extent_status *es; 185 struct extent_status *es;
186 es = rb_entry(node, struct extent_status, rb_node); 186 es = rb_entry(node, struct extent_status, rb_node);
187 printk(KERN_DEBUG " [%u/%u) %llu %llx", 187 printk(KERN_DEBUG " [%u/%u) %llu %x",
188 es->es_lblk, es->es_len, 188 es->es_lblk, es->es_len,
189 ext4_es_pblock(es), ext4_es_status(es)); 189 ext4_es_pblock(es), ext4_es_status(es));
190 node = rb_next(node); 190 node = rb_next(node);
@@ -445,8 +445,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
445 pr_warn("ES insert assertion failed for " 445 pr_warn("ES insert assertion failed for "
446 "inode: %lu we can find an extent " 446 "inode: %lu we can find an extent "
447 "at block [%d/%d/%llu/%c], but we " 447 "at block [%d/%d/%llu/%c], but we "
448 "want to add an delayed/hole extent " 448 "want to add a delayed/hole extent "
449 "[%d/%d/%llu/%llx]\n", 449 "[%d/%d/%llu/%x]\n",
450 inode->i_ino, ee_block, ee_len, 450 inode->i_ino, ee_block, ee_len,
451 ee_start, ee_status ? 'u' : 'w', 451 ee_start, ee_status ? 'u' : 'w',
452 es->es_lblk, es->es_len, 452 es->es_lblk, es->es_len,
@@ -486,8 +486,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
486 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { 486 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
487 pr_warn("ES insert assertion failed for inode: %lu " 487 pr_warn("ES insert assertion failed for inode: %lu "
488 "can't find an extent at block %d but we want " 488 "can't find an extent at block %d but we want "
489 "to add an written/unwritten extent " 489 "to add a written/unwritten extent "
490 "[%d/%d/%llu/%llx]\n", inode->i_ino, 490 "[%d/%d/%llu/%x]\n", inode->i_ino,
491 es->es_lblk, es->es_lblk, es->es_len, 491 es->es_lblk, es->es_lblk, es->es_len,
492 ext4_es_pblock(es), ext4_es_status(es)); 492 ext4_es_pblock(es), ext4_es_status(es));
493 } 493 }
@@ -524,7 +524,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
524 */ 524 */
525 pr_warn("ES insert assertion failed for inode: %lu " 525 pr_warn("ES insert assertion failed for inode: %lu "
526 "We can find blocks but we want to add a " 526 "We can find blocks but we want to add a "
527 "delayed/hole extent [%d/%d/%llu/%llx]\n", 527 "delayed/hole extent [%d/%d/%llu/%x]\n",
528 inode->i_ino, es->es_lblk, es->es_len, 528 inode->i_ino, es->es_lblk, es->es_len,
529 ext4_es_pblock(es), ext4_es_status(es)); 529 ext4_es_pblock(es), ext4_es_status(es));
530 return; 530 return;
@@ -554,7 +554,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
554 if (ext4_es_is_written(es)) { 554 if (ext4_es_is_written(es)) {
555 pr_warn("ES insert assertion failed for inode: %lu " 555 pr_warn("ES insert assertion failed for inode: %lu "
556 "We can't find the block but we want to add " 556 "We can't find the block but we want to add "
557 "an written extent [%d/%d/%llu/%llx]\n", 557 "a written extent [%d/%d/%llu/%x]\n",
558 inode->i_ino, es->es_lblk, es->es_len, 558 inode->i_ino, es->es_lblk, es->es_len,
559 ext4_es_pblock(es), ext4_es_status(es)); 559 ext4_es_pblock(es), ext4_es_status(es));
560 return; 560 return;
@@ -658,8 +658,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
658 658
659 newes.es_lblk = lblk; 659 newes.es_lblk = lblk;
660 newes.es_len = len; 660 newes.es_len = len;
661 ext4_es_store_pblock(&newes, pblk); 661 ext4_es_store_pblock_status(&newes, pblk, status);
662 ext4_es_store_status(&newes, status);
663 trace_ext4_es_insert_extent(inode, &newes); 662 trace_ext4_es_insert_extent(inode, &newes);
664 663
665 ext4_es_insert_extent_check(inode, &newes); 664 ext4_es_insert_extent_check(inode, &newes);
@@ -699,8 +698,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
699 698
700 newes.es_lblk = lblk; 699 newes.es_lblk = lblk;
701 newes.es_len = len; 700 newes.es_len = len;
702 ext4_es_store_pblock(&newes, pblk); 701 ext4_es_store_pblock_status(&newes, pblk, status);
703 ext4_es_store_status(&newes, status);
704 trace_ext4_es_cache_extent(inode, &newes); 702 trace_ext4_es_cache_extent(inode, &newes);
705 703
706 if (!len) 704 if (!len)
@@ -812,13 +810,13 @@ retry:
812 810
813 newes.es_lblk = end + 1; 811 newes.es_lblk = end + 1;
814 newes.es_len = len2; 812 newes.es_len = len2;
813 block = 0x7FDEADBEEF;
815 if (ext4_es_is_written(&orig_es) || 814 if (ext4_es_is_written(&orig_es) ||
816 ext4_es_is_unwritten(&orig_es)) { 815 ext4_es_is_unwritten(&orig_es))
817 block = ext4_es_pblock(&orig_es) + 816 block = ext4_es_pblock(&orig_es) +
818 orig_es.es_len - len2; 817 orig_es.es_len - len2;
819 ext4_es_store_pblock(&newes, block); 818 ext4_es_store_pblock_status(&newes, block,
820 } 819 ext4_es_status(&orig_es));
821 ext4_es_store_status(&newes, ext4_es_status(&orig_es));
822 err = __es_insert_extent(inode, &newes); 820 err = __es_insert_extent(inode, &newes);
823 if (err) { 821 if (err) {
824 es->es_lblk = orig_es.es_lblk; 822 es->es_lblk = orig_es.es_lblk;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 167f4ab8ecc3..f1b62a419920 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es,
129 (es->es_pblk & ~ES_MASK)); 129 (es->es_pblk & ~ES_MASK));
130} 130}
131 131
132static inline void ext4_es_store_pblock_status(struct extent_status *es,
133 ext4_fsblk_t pb,
134 unsigned int status)
135{
136 es->es_pblk = (((ext4_fsblk_t)
137 (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
138 (pb & ~ES_MASK));
139}
140
132extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); 141extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
133extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); 142extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
134extern void ext4_es_lru_add(struct inode *inode); 143extern void ext4_es_lru_add(struct inode *inode);
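
The new ext4_es_store_pblock_status() helper collapses the old two-step
ext4_es_store_pblock() + ext4_es_store_status() sequence into a single
assignment, so es_pblk is never read back or observed in a half-updated state
between the two stores. A stand-alone model of the packing; the shift and mask
are illustrative values, not the real ext4 layout:

#include <stdint.h>
#include <stdio.h>

/* Toy layout: status flags in the top 4 bits, block number below */
#define ES_SHIFT 60
#define ES_MASK  (~0ULL << ES_SHIFT)

static inline uint64_t pack_pblock_status(uint64_t pb, unsigned status)
{
        /* one write, no intermediate mix of old and new bits */
        return ((uint64_t)status << ES_SHIFT) | (pb & ~ES_MASK);
}

static inline uint64_t unpack_pblock(uint64_t v) { return v & ~ES_MASK; }
static inline unsigned unpack_status(uint64_t v) { return v >> ES_SHIFT; }

int main(void)
{
        uint64_t es = pack_pblock_status(123456, 0x8);

        printf("pblk=%llu status=%#x\n",
               (unsigned long long)unpack_pblock(es), unpack_status(es));
        return 0;
}
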
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1a5073959f32..6db7f7db7777 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -153,7 +153,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
153 ssize_t err; 153 ssize_t err;
154 154
155 err = generic_write_sync(file, iocb->ki_pos - ret, ret); 155 err = generic_write_sync(file, iocb->ki_pos - ret, ret);
156 if (err < 0 && ret > 0) 156 if (err < 0)
157 ret = err; 157 ret = err;
158 } 158 }
159 blk_finish_plug(&plug); 159 blk_finish_plug(&plug);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 24bfd7ff3049..5b0d2c7d5408 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -215,7 +215,7 @@ void ext4_evict_inode(struct inode *inode)
215 jbd2_complete_transaction(journal, commit_tid); 215 jbd2_complete_transaction(journal, commit_tid);
216 filemap_write_and_wait(&inode->i_data); 216 filemap_write_and_wait(&inode->i_data);
217 } 217 }
218 truncate_inode_pages(&inode->i_data, 0); 218 truncate_inode_pages_final(&inode->i_data);
219 219
220 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); 220 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
221 goto no_delete; 221 goto no_delete;
@@ -226,7 +226,7 @@ void ext4_evict_inode(struct inode *inode)
226 226
227 if (ext4_should_order_data(inode)) 227 if (ext4_should_order_data(inode))
228 ext4_begin_ordered_truncate(inode, 0); 228 ext4_begin_ordered_truncate(inode, 0);
229 truncate_inode_pages(&inode->i_data, 0); 229 truncate_inode_pages_final(&inode->i_data);
230 230
231 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); 231 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
232 if (is_bad_inode(inode)) 232 if (is_bad_inode(inode))
@@ -504,6 +504,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
504{ 504{
505 struct extent_status es; 505 struct extent_status es;
506 int retval; 506 int retval;
507 int ret = 0;
507#ifdef ES_AGGRESSIVE_TEST 508#ifdef ES_AGGRESSIVE_TEST
508 struct ext4_map_blocks orig_map; 509 struct ext4_map_blocks orig_map;
509 510
@@ -515,6 +516,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
515 "logical block %lu\n", inode->i_ino, flags, map->m_len, 516 "logical block %lu\n", inode->i_ino, flags, map->m_len,
516 (unsigned long) map->m_lblk); 517 (unsigned long) map->m_lblk);
517 518
519 /*
520 * ext4_map_blocks returns an int, and m_len is an unsigned int
521 */
522 if (unlikely(map->m_len > INT_MAX))
523 map->m_len = INT_MAX;
524
518 /* Lookup extent status tree firstly */ 525 /* Lookup extent status tree firstly */
519 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 526 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
520 ext4_es_lru_add(inode); 527 ext4_es_lru_add(inode);
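
The clamp added above exists because ext4_map_blocks() reports the number of
mapped blocks through an int return value while map->m_len is an unsigned int:
a request longer than INT_MAX blocks would otherwise come back as a negative
number and look like an error code. A stand-alone illustration of the pattern:

#include <limits.h>
#include <stdio.h>

/* Reports the mapped length through an int, like ext4_map_blocks(),
 * so an unsigned request must never exceed INT_MAX. */
static int map_blocks(unsigned int *len)
{
        if (*len > INT_MAX)     /* clamp instead of overflowing the int */
                *len = INT_MAX;
        return (int)*len;       /* pretend the whole range was mapped */
}

int main(void)
{
        unsigned int len = 0x90000000u;         /* > INT_MAX */

        printf("mapped %d of %u blocks\n", map_blocks(&len), len);
        return 0;
}
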
@@ -553,7 +560,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
553 EXT4_GET_BLOCKS_KEEP_SIZE); 560 EXT4_GET_BLOCKS_KEEP_SIZE);
554 } 561 }
555 if (retval > 0) { 562 if (retval > 0) {
556 int ret;
557 unsigned int status; 563 unsigned int status;
558 564
559 if (unlikely(retval != map->m_len)) { 565 if (unlikely(retval != map->m_len)) {
@@ -580,7 +586,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
580 586
581found: 587found:
582 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 588 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
583 int ret = check_block_validity(inode, map); 589 ret = check_block_validity(inode, map);
584 if (ret != 0) 590 if (ret != 0)
585 return ret; 591 return ret;
586 } 592 }
@@ -597,7 +603,13 @@ found:
597 * with buffer head unmapped. 603 * with buffer head unmapped.
598 */ 604 */
599 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 605 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
600 return retval; 606 /*
607 * If we need to convert extent to unwritten
608 * we continue and do the actual work in
609 * ext4_ext_map_blocks()
610 */
611 if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
612 return retval;
601 613
602 /* 614 /*
603 * Here we clear m_flags because after allocating an new extent, 615 * Here we clear m_flags because after allocating an new extent,
@@ -653,7 +665,6 @@ found:
653 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 665 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
654 666
655 if (retval > 0) { 667 if (retval > 0) {
656 int ret;
657 unsigned int status; 668 unsigned int status;
658 669
659 if (unlikely(retval != map->m_len)) { 670 if (unlikely(retval != map->m_len)) {
@@ -688,7 +699,7 @@ found:
688has_zeroout: 699has_zeroout:
689 up_write((&EXT4_I(inode)->i_data_sem)); 700 up_write((&EXT4_I(inode)->i_data_sem));
690 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 701 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
691 int ret = check_block_validity(inode, map); 702 ret = check_block_validity(inode, map);
692 if (ret != 0) 703 if (ret != 0)
693 return ret; 704 return ret;
694 } 705 }
@@ -3313,33 +3324,13 @@ void ext4_set_aops(struct inode *inode)
3313} 3324}
3314 3325
3315/* 3326/*
3316 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3317 * up to the end of the block which corresponds to `from'.
3318 * This is required during truncate. We need to physically zero the tail end
3319 * of that block so it doesn't yield old data if the file is later grown.
3320 */
3321int ext4_block_truncate_page(handle_t *handle,
3322 struct address_space *mapping, loff_t from)
3323{
3324 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3325 unsigned length;
3326 unsigned blocksize;
3327 struct inode *inode = mapping->host;
3328
3329 blocksize = inode->i_sb->s_blocksize;
3330 length = blocksize - (offset & (blocksize - 1));
3331
3332 return ext4_block_zero_page_range(handle, mapping, from, length);
3333}
3334
3335/*
3336 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3327 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3337 * starting from file offset 'from'. The range to be zero'd must 3328 * starting from file offset 'from'. The range to be zero'd must
3338 * be contained within one block. If the specified range exceeds 3329 * be contained within one block. If the specified range exceeds
3339 * the end of the block it will be shortened to the end of the block 3330 * the end of the block it will be shortened to the end of the block
3340 * that corresponds to 'from' 3331 * that corresponds to 'from'
3341 */ 3332 */
3342int ext4_block_zero_page_range(handle_t *handle, 3333static int ext4_block_zero_page_range(handle_t *handle,
3343 struct address_space *mapping, loff_t from, loff_t length) 3334 struct address_space *mapping, loff_t from, loff_t length)
3344{ 3335{
3345 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3336 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -3429,6 +3420,26 @@ unlock:
3429 return err; 3420 return err;
3430} 3421}
3431 3422
3423/*
3424 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3425 * up to the end of the block which corresponds to `from'.
3426 * This is required during truncate. We need to physically zero the tail end
3427 * of that block so it doesn't yield old data if the file is later grown.
3428 */
3429int ext4_block_truncate_page(handle_t *handle,
3430 struct address_space *mapping, loff_t from)
3431{
3432 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3433 unsigned length;
3434 unsigned blocksize;
3435 struct inode *inode = mapping->host;
3436
3437 blocksize = inode->i_sb->s_blocksize;
3438 length = blocksize - (offset & (blocksize - 1));
3439
3440 return ext4_block_zero_page_range(handle, mapping, from, length);
3441}
3442
3432int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 3443int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3433 loff_t lstart, loff_t length) 3444 loff_t lstart, loff_t length)
3434{ 3445{
@@ -3502,7 +3513,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3502 if (!S_ISREG(inode->i_mode)) 3513 if (!S_ISREG(inode->i_mode))
3503 return -EOPNOTSUPP; 3514 return -EOPNOTSUPP;
3504 3515
3505 trace_ext4_punch_hole(inode, offset, length); 3516 trace_ext4_punch_hole(inode, offset, length, 0);
3506 3517
3507 /* 3518 /*
3508 * Write out all dirty pages to avoid race conditions 3519 * Write out all dirty pages to avoid race conditions
@@ -3609,6 +3620,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3609 up_write(&EXT4_I(inode)->i_data_sem); 3620 up_write(&EXT4_I(inode)->i_data_sem);
3610 if (IS_SYNC(inode)) 3621 if (IS_SYNC(inode))
3611 ext4_handle_sync(handle); 3622 ext4_handle_sync(handle);
3623
3624 /* Now release the pages again to reduce race window */
3625 if (last_block_offset > first_block_offset)
3626 truncate_pagecache_range(inode, first_block_offset,
3627 last_block_offset);
3628
3612 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3629 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3613 ext4_mark_inode_dirty(handle, inode); 3630 ext4_mark_inode_dirty(handle, inode);
3614out_stop: 3631out_stop:
@@ -3682,7 +3699,7 @@ void ext4_truncate(struct inode *inode)
3682 3699
3683 /* 3700 /*
3684 * There is a possibility that we're either freeing the inode 3701 * There is a possibility that we're either freeing the inode
3685 * or it completely new indode. In those cases we might not 3702 * or it's a completely new inode. In those cases we might not
3686 * have i_mutex locked because it's not necessary. 3703 * have i_mutex locked because it's not necessary.
3687 */ 3704 */
3688 if (!(inode->i_state & (I_NEW|I_FREEING))) 3705 if (!(inode->i_state & (I_NEW|I_FREEING)))
@@ -3934,8 +3951,8 @@ void ext4_set_inode_flags(struct inode *inode)
3934 new_fl |= S_NOATIME; 3951 new_fl |= S_NOATIME;
3935 if (flags & EXT4_DIRSYNC_FL) 3952 if (flags & EXT4_DIRSYNC_FL)
3936 new_fl |= S_DIRSYNC; 3953 new_fl |= S_DIRSYNC;
3937 set_mask_bits(&inode->i_flags, 3954 inode_set_flags(inode, new_fl,
3938 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); 3955 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
3939} 3956}
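
inode_set_flags(), which the patch switches to, performs the same atomic
masked update as the open-coded set_mask_bits() it replaces: a
compare-and-swap loop, so concurrent changes to the other i_flags bits are
never lost the way a plain read-modify-write could lose them. A user-space
sketch of the idea with C11 atomics (set_mask_bits/inode_set_flags are kernel
helpers; this only shows the pattern):

#include <stdatomic.h>
#include <stdio.h>

/* Replace the bits selected by 'mask' with 'bits' without losing
 * concurrent updates to the remaining bits: a classic CAS loop. */
static void set_mask_bits(_Atomic unsigned long *flags,
                          unsigned long mask, unsigned long bits)
{
        unsigned long old = atomic_load(flags);
        unsigned long new;

        do {
                new = (old & ~mask) | (bits & mask);
        } while (!atomic_compare_exchange_weak(flags, &old, new));
}

int main(void)
{
        _Atomic unsigned long flags = 0xf0;

        set_mask_bits(&flags, 0x0f, 0x05);      /* touch only the low nibble */
        printf("flags = %#lx\n", atomic_load(&flags));  /* prints 0xf5 */
        return 0;
}
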
3940 3957
3941/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 3958/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4154,11 +4171,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4154 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4171 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4155 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4172 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4156 4173
4157 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4174 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
4158 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4175 inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4159 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4176 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4160 inode->i_version |= 4177 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4161 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4178 inode->i_version |=
4179 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4180 }
4162 } 4181 }
4163 4182
4164 ret = 0; 4183 ret = 0;
@@ -4328,8 +4347,7 @@ static int ext4_do_update_inode(handle_t *handle,
4328 goto out_brelse; 4347 goto out_brelse;
4329 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4348 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4330 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 4349 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
4331 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4350 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
4332 cpu_to_le32(EXT4_OS_HURD))
4333 raw_inode->i_file_acl_high = 4351 raw_inode->i_file_acl_high =
4334 cpu_to_le16(ei->i_file_acl >> 32); 4352 cpu_to_le16(ei->i_file_acl >> 32);
4335 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4353 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
@@ -4374,12 +4392,15 @@ static int ext4_do_update_inode(handle_t *handle,
4374 raw_inode->i_block[block] = ei->i_data[block]; 4392 raw_inode->i_block[block] = ei->i_data[block];
4375 } 4393 }
4376 4394
4377 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4395 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
4378 if (ei->i_extra_isize) { 4396 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4379 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4397 if (ei->i_extra_isize) {
4380 raw_inode->i_version_hi = 4398 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4381 cpu_to_le32(inode->i_version >> 32); 4399 raw_inode->i_version_hi =
4382 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4400 cpu_to_le32(inode->i_version >> 32);
4401 raw_inode->i_extra_isize =
4402 cpu_to_le16(ei->i_extra_isize);
4403 }
4383 } 4404 }
4384 4405
4385 ext4_inode_csum_set(inode, raw_inode, ei); 4406 ext4_inode_csum_set(inode, raw_inode, ei);
@@ -4446,7 +4467,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
4446 return -EIO; 4467 return -EIO;
4447 } 4468 }
4448 4469
4449 if (wbc->sync_mode != WB_SYNC_ALL) 4470 /*
4471 * No need to force transaction in WB_SYNC_NONE mode. Also
4472 * ext4_sync_fs() will force the commit after everything is
4473 * written.
4474 */
4475 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
4450 return 0; 4476 return 0;
4451 4477
4452 err = ext4_force_commit(inode->i_sb); 4478 err = ext4_force_commit(inode->i_sb);
@@ -4456,7 +4482,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
4456 err = __ext4_get_inode_loc(inode, &iloc, 0); 4482 err = __ext4_get_inode_loc(inode, &iloc, 0);
4457 if (err) 4483 if (err)
4458 return err; 4484 return err;
4459 if (wbc->sync_mode == WB_SYNC_ALL) 4485 /*
4486 * sync(2) will flush the whole buffer cache. No need to do
4487 * it here separately for each inode.
4488 */
4489 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4460 sync_dirty_buffer(iloc.bh); 4490 sync_dirty_buffer(iloc.bh);
4461 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 4491 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
4462 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, 4492 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a2a837f00407..0f2252ec274d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -104,21 +104,15 @@ static long swap_inode_boot_loader(struct super_block *sb,
104 struct ext4_inode_info *ei_bl; 104 struct ext4_inode_info *ei_bl;
105 struct ext4_sb_info *sbi = EXT4_SB(sb); 105 struct ext4_sb_info *sbi = EXT4_SB(sb);
106 106
107 if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { 107 if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
108 err = -EINVAL; 108 return -EINVAL;
109 goto swap_boot_out;
110 }
111 109
112 if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { 110 if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
113 err = -EPERM; 111 return -EPERM;
114 goto swap_boot_out;
115 }
116 112
117 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); 113 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
118 if (IS_ERR(inode_bl)) { 114 if (IS_ERR(inode_bl))
119 err = PTR_ERR(inode_bl); 115 return PTR_ERR(inode_bl);
120 goto swap_boot_out;
121 }
122 ei_bl = EXT4_I(inode_bl); 116 ei_bl = EXT4_I(inode_bl);
123 117
124 filemap_flush(inode->i_mapping); 118 filemap_flush(inode->i_mapping);
@@ -193,20 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb,
193 ext4_mark_inode_dirty(handle, inode); 187 ext4_mark_inode_dirty(handle, inode);
194 } 188 }
195 } 189 }
196
197 ext4_journal_stop(handle); 190 ext4_journal_stop(handle);
198
199 ext4_double_up_write_data_sem(inode, inode_bl); 191 ext4_double_up_write_data_sem(inode, inode_bl);
200 192
201journal_err_out: 193journal_err_out:
202 ext4_inode_resume_unlocked_dio(inode); 194 ext4_inode_resume_unlocked_dio(inode);
203 ext4_inode_resume_unlocked_dio(inode_bl); 195 ext4_inode_resume_unlocked_dio(inode_bl);
204
205 unlock_two_nondirectories(inode, inode_bl); 196 unlock_two_nondirectories(inode, inode_bl);
206
207 iput(inode_bl); 197 iput(inode_bl);
208
209swap_boot_out:
210 return err; 198 return err;
211} 199}
212 200
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 04a5c7504be9..a888cac76e9c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1808,6 +1808,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1808 ext4_lock_group(ac->ac_sb, group); 1808 ext4_lock_group(ac->ac_sb, group);
1809 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, 1809 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1810 ac->ac_g_ex.fe_len, &ex); 1810 ac->ac_g_ex.fe_len, &ex);
1811 ex.fe_logical = 0xDEADFA11; /* debug value */
1811 1812
1812 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1813 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1813 ext4_fsblk_t start; 1814 ext4_fsblk_t start;
@@ -1936,7 +1937,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1936 */ 1937 */
1937 break; 1938 break;
1938 } 1939 }
1939 1940 ex.fe_logical = 0xDEADC0DE; /* debug value */
1940 ext4_mb_measure_extent(ac, &ex, e4b); 1941 ext4_mb_measure_extent(ac, &ex, e4b);
1941 1942
1942 i += ex.fe_len; 1943 i += ex.fe_len;
@@ -1977,6 +1978,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1977 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); 1978 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
1978 if (max >= sbi->s_stripe) { 1979 if (max >= sbi->s_stripe) {
1979 ac->ac_found++; 1980 ac->ac_found++;
1981 ex.fe_logical = 0xDEADF00D; /* debug value */
1980 ac->ac_b_ex = ex; 1982 ac->ac_b_ex = ex;
1981 ext4_mb_use_best_found(ac, e4b); 1983 ext4_mb_use_best_found(ac, e4b);
1982 break; 1984 break;
@@ -4006,8 +4008,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4006 (unsigned long)ac->ac_b_ex.fe_len, 4008 (unsigned long)ac->ac_b_ex.fe_len,
4007 (unsigned long)ac->ac_b_ex.fe_logical, 4009 (unsigned long)ac->ac_b_ex.fe_logical,
4008 (int)ac->ac_criteria); 4010 (int)ac->ac_criteria);
4009 ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", 4011 ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4010 ac->ac_ex_scanned, ac->ac_found);
4011 ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); 4012 ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4012 ngroups = ext4_get_groups_count(sb); 4013 ngroups = ext4_get_groups_count(sb);
4013 for (i = 0; i < ngroups; i++) { 4014 for (i = 0; i < ngroups; i++) {
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 08481ee84cd5..d634e183b4d4 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug;
48 } \ 48 } \
49 } while (0) 49 } while (0)
50#else 50#else
51#define mb_debug(n, fmt, a...) 51#define mb_debug(n, fmt, a...) no_printk(fmt, ## a)
52#endif 52#endif
53 53
54#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 54#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
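
Redefining the disabled mb_debug() as no_printk(fmt, ##a) instead of an empty
macro keeps the compiler type-checking the format string and marks the
arguments as used, while still generating no output. The same trick in plain
GNU C (the do/while wrapper and ## variadic pasting follow kernel
conventions):

#include <stdio.h>

/* Generates no output, but keeps printf-style format checking and
 * suppresses "unused variable" warnings, like the kernel's no_printk */
#define no_printk(fmt, ...) \
        do { if (0) printf(fmt, ##__VA_ARGS__); } while (0)

#ifdef DEBUG
#define mb_debug(fmt, ...) printf(fmt, ##__VA_ARGS__)
#else
#define mb_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif

int main(void)
{
        int groups = 4;

        mb_debug("scanning %d groups\n", groups);  /* compiled out w/o DEBUG */
        return 0;
}
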
@@ -175,8 +175,6 @@ struct ext4_allocation_context {
175 /* copy of the best found extent taken before preallocation efforts */ 175 /* copy of the best found extent taken before preallocation efforts */
176 struct ext4_free_extent ac_f_ex; 176 struct ext4_free_extent ac_f_ex;
177 177
178 /* number of iterations done. we have to track to limit searching */
179 unsigned long ac_ex_scanned;
180 __u16 ac_groups_scanned; 178 __u16 ac_groups_scanned;
181 __u16 ac_found; 179 __u16 ac_found;
182 __u16 ac_tail; 180 __u16 ac_tail;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 773b503bd18c..58ee7dc87669 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
76 * ext4_ext_path structure refers to the last extent, or a negative error 76 * ext4_ext_path structure refers to the last extent, or a negative error
77 * value on failure. 77 * value on failure.
78 */ 78 */
79static int 79int
80mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 80mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
81 struct ext4_extent **extent) 81 struct ext4_extent **extent)
82{ 82{
@@ -861,8 +861,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
861 } 861 }
862 if (!buffer_mapped(bh)) { 862 if (!buffer_mapped(bh)) {
863 zero_user(page, block_start, blocksize); 863 zero_user(page, block_start, blocksize);
864 if (!err) 864 set_buffer_uptodate(bh);
865 set_buffer_uptodate(bh);
866 continue; 865 continue;
867 } 866 }
868 } 867 }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d050e043e884..1cb84f78909e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3000,6 +3000,154 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
3000 return ext4_get_first_inline_block(inode, parent_de, retval); 3000 return ext4_get_first_inline_block(inode, parent_de, retval);
3001} 3001}
3002 3002
3003struct ext4_renament {
3004 struct inode *dir;
3005 struct dentry *dentry;
3006 struct inode *inode;
3007 bool is_dir;
3008 int dir_nlink_delta;
3009
3010 /* entry for "dentry" */
3011 struct buffer_head *bh;
3012 struct ext4_dir_entry_2 *de;
3013 int inlined;
3014
3015 /* entry for ".." in inode if it's a directory */
3016 struct buffer_head *dir_bh;
3017 struct ext4_dir_entry_2 *parent_de;
3018 int dir_inlined;
3019};
3020
3021static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
3022{
3023 int retval;
3024
3025 ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
3026 &retval, &ent->parent_de,
3027 &ent->dir_inlined);
3028 if (!ent->dir_bh)
3029 return retval;
3030 if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
3031 return -EIO;
3032 BUFFER_TRACE(ent->dir_bh, "get_write_access");
3033 return ext4_journal_get_write_access(handle, ent->dir_bh);
3034}
3035
3036static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
3037 unsigned dir_ino)
3038{
3039 int retval;
3040
3041 ent->parent_de->inode = cpu_to_le32(dir_ino);
3042 BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
3043 if (!ent->dir_inlined) {
3044 if (is_dx(ent->inode)) {
3045 retval = ext4_handle_dirty_dx_node(handle,
3046 ent->inode,
3047 ent->dir_bh);
3048 } else {
3049 retval = ext4_handle_dirty_dirent_node(handle,
3050 ent->inode,
3051 ent->dir_bh);
3052 }
3053 } else {
3054 retval = ext4_mark_inode_dirty(handle, ent->inode);
3055 }
3056 if (retval) {
3057 ext4_std_error(ent->dir->i_sb, retval);
3058 return retval;
3059 }
3060 return 0;
3061}
3062
3063static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
3064 unsigned ino, unsigned file_type)
3065{
3066 int retval;
3067
3068 BUFFER_TRACE(ent->bh, "get write access");
3069 retval = ext4_journal_get_write_access(handle, ent->bh);
3070 if (retval)
3071 return retval;
3072 ent->de->inode = cpu_to_le32(ino);
3073 if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
3074 EXT4_FEATURE_INCOMPAT_FILETYPE))
3075 ent->de->file_type = file_type;
3076 ent->dir->i_version++;
3077 ent->dir->i_ctime = ent->dir->i_mtime =
3078 ext4_current_time(ent->dir);
3079 ext4_mark_inode_dirty(handle, ent->dir);
3080 BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
3081 if (!ent->inlined) {
3082 retval = ext4_handle_dirty_dirent_node(handle,
3083 ent->dir, ent->bh);
3084 if (unlikely(retval)) {
3085 ext4_std_error(ent->dir->i_sb, retval);
3086 return retval;
3087 }
3088 }
3089 brelse(ent->bh);
3090 ent->bh = NULL;
3091
3092 return 0;
3093}
3094
3095static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
3096 const struct qstr *d_name)
3097{
3098 int retval = -ENOENT;
3099 struct buffer_head *bh;
3100 struct ext4_dir_entry_2 *de;
3101
3102 bh = ext4_find_entry(dir, d_name, &de, NULL);
3103 if (bh) {
3104 retval = ext4_delete_entry(handle, dir, de, bh);
3105 brelse(bh);
3106 }
3107 return retval;
3108}
3109
3110static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
3111{
3112 int retval;
3113 /*
3114 * ent->de could have moved from under us during htree split, so make
3115 * sure that we are deleting the right entry. We might also be pointing
3116 * to a stale entry in the unused part of ent->bh so just checking inum
3117 * and the name isn't enough.
3118 */
3119 if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
3120 ent->de->name_len != ent->dentry->d_name.len ||
3121 strncmp(ent->de->name, ent->dentry->d_name.name,
3122 ent->de->name_len)) {
3123 retval = ext4_find_delete_entry(handle, ent->dir,
3124 &ent->dentry->d_name);
3125 } else {
3126 retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
3127 if (retval == -ENOENT) {
3128 retval = ext4_find_delete_entry(handle, ent->dir,
3129 &ent->dentry->d_name);
3130 }
3131 }
3132
3133 if (retval) {
3134 ext4_warning(ent->dir->i_sb,
3135 "Deleting old file (%lu), %d, error=%d",
3136 ent->dir->i_ino, ent->dir->i_nlink, retval);
3137 }
3138}
3139
3140static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
3141{
3142 if (ent->dir_nlink_delta) {
3143 if (ent->dir_nlink_delta == -1)
3144 ext4_dec_count(handle, ent->dir);
3145 else
3146 ext4_inc_count(handle, ent->dir);
3147 ext4_mark_inode_dirty(handle, ent->dir);
3148 }
3149}
3150
3003/* 3151/*
3004 * Anybody can rename anything with this: the permission checks are left to the 3152 * Anybody can rename anything with this: the permission checks are left to the
3005 * higher-level routines. 3153 * higher-level routines.
@@ -3012,198 +3160,267 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3012 struct inode *new_dir, struct dentry *new_dentry) 3160 struct inode *new_dir, struct dentry *new_dentry)
3013{ 3161{
3014 handle_t *handle = NULL; 3162 handle_t *handle = NULL;
3015 struct inode *old_inode, *new_inode; 3163 struct ext4_renament old = {
3016 struct buffer_head *old_bh, *new_bh, *dir_bh; 3164 .dir = old_dir,
3017 struct ext4_dir_entry_2 *old_de, *new_de; 3165 .dentry = old_dentry,
3166 .inode = old_dentry->d_inode,
3167 };
3168 struct ext4_renament new = {
3169 .dir = new_dir,
3170 .dentry = new_dentry,
3171 .inode = new_dentry->d_inode,
3172 };
3018 int retval; 3173 int retval;
3019 int inlined = 0, new_inlined = 0;
3020 struct ext4_dir_entry_2 *parent_de;
3021 3174
3022 dquot_initialize(old_dir); 3175 dquot_initialize(old.dir);
3023 dquot_initialize(new_dir); 3176 dquot_initialize(new.dir);
3024
3025 old_bh = new_bh = dir_bh = NULL;
3026 3177
3027 /* Initialize quotas early so that eventual writes go 3178 /* Initialize quotas early so that eventual writes go
3028 * in a separate transaction */ 3179 * in a separate transaction */
3029 if (new_dentry->d_inode) 3180 if (new.inode)
3030 dquot_initialize(new_dentry->d_inode); 3181 dquot_initialize(new.inode);
3031 3182
3032 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); 3183 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
3033 /* 3184 /*
3034 * The check for the inode number is _not_ due to possible IO errors. 3185 * The check for the inode number is _not_ due to possible IO errors.
3035 * We might rmdir the source, keep it as pwd of some process 3186 * We might rmdir the source, keep it as pwd of some process
3036 * and merrily kill the link to whatever was created under the 3187 * and merrily kill the link to whatever was created under the
3037 * same name. Goodbye sticky bit ;-< 3188 * same name. Goodbye sticky bit ;-<
3038 */ 3189 */
3039 old_inode = old_dentry->d_inode;
3040 retval = -ENOENT; 3190 retval = -ENOENT;
3041 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) 3191 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
3042 goto end_rename; 3192 goto end_rename;
3043 3193
3044 new_inode = new_dentry->d_inode; 3194 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3045 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, 3195 &new.de, &new.inlined);
3046 &new_de, &new_inlined); 3196 if (new.bh) {
3047 if (new_bh) { 3197 if (!new.inode) {
3048 if (!new_inode) { 3198 brelse(new.bh);
3049 brelse(new_bh); 3199 new.bh = NULL;
3050 new_bh = NULL;
3051 } 3200 }
3052 } 3201 }
3053 if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) 3202 if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
3054 ext4_alloc_da_blocks(old_inode); 3203 ext4_alloc_da_blocks(old.inode);
3055 3204
3056 handle = ext4_journal_start(old_dir, EXT4_HT_DIR, 3205 handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
3057 (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + 3206 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3058 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); 3207 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3059 if (IS_ERR(handle)) 3208 if (IS_ERR(handle))
3060 return PTR_ERR(handle); 3209 return PTR_ERR(handle);
3061 3210
3062 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 3211 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3063 ext4_handle_sync(handle); 3212 ext4_handle_sync(handle);
3064 3213
3065 if (S_ISDIR(old_inode->i_mode)) { 3214 if (S_ISDIR(old.inode->i_mode)) {
3066 if (new_inode) { 3215 if (new.inode) {
3067 retval = -ENOTEMPTY; 3216 retval = -ENOTEMPTY;
3068 if (!empty_dir(new_inode)) 3217 if (!empty_dir(new.inode))
3218 goto end_rename;
3219 } else {
3220 retval = -EMLINK;
3221 if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
3069 goto end_rename; 3222 goto end_rename;
3070 } 3223 }
3071 retval = -EIO; 3224 retval = ext4_rename_dir_prepare(handle, &old);
3072 dir_bh = ext4_get_first_dir_block(handle, old_inode,
3073 &retval, &parent_de,
3074 &inlined);
3075 if (!dir_bh)
3076 goto end_rename;
3077 if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
3078 goto end_rename;
3079 retval = -EMLINK;
3080 if (!new_inode && new_dir != old_dir &&
3081 EXT4_DIR_LINK_MAX(new_dir))
3082 goto end_rename;
3083 BUFFER_TRACE(dir_bh, "get_write_access");
3084 retval = ext4_journal_get_write_access(handle, dir_bh);
3085 if (retval) 3225 if (retval)
3086 goto end_rename; 3226 goto end_rename;
3087 } 3227 }
3088 if (!new_bh) { 3228 if (!new.bh) {
3089 retval = ext4_add_entry(handle, new_dentry, old_inode); 3229 retval = ext4_add_entry(handle, new.dentry, old.inode);
3090 if (retval) 3230 if (retval)
3091 goto end_rename; 3231 goto end_rename;
3092 } else { 3232 } else {
3093 BUFFER_TRACE(new_bh, "get write access"); 3233 retval = ext4_setent(handle, &new,
3094 retval = ext4_journal_get_write_access(handle, new_bh); 3234 old.inode->i_ino, old.de->file_type);
3095 if (retval) 3235 if (retval)
3096 goto end_rename; 3236 goto end_rename;
3097 new_de->inode = cpu_to_le32(old_inode->i_ino);
3098 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
3099 EXT4_FEATURE_INCOMPAT_FILETYPE))
3100 new_de->file_type = old_de->file_type;
3101 new_dir->i_version++;
3102 new_dir->i_ctime = new_dir->i_mtime =
3103 ext4_current_time(new_dir);
3104 ext4_mark_inode_dirty(handle, new_dir);
3105 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
3106 if (!new_inlined) {
3107 retval = ext4_handle_dirty_dirent_node(handle,
3108 new_dir, new_bh);
3109 if (unlikely(retval)) {
3110 ext4_std_error(new_dir->i_sb, retval);
3111 goto end_rename;
3112 }
3113 }
3114 brelse(new_bh);
3115 new_bh = NULL;
3116 } 3237 }
3117 3238
3118 /* 3239 /*
3119 * Like most other Unix systems, set the ctime for inodes on a 3240 * Like most other Unix systems, set the ctime for inodes on a
3120 * rename. 3241 * rename.
3121 */ 3242 */
3122 old_inode->i_ctime = ext4_current_time(old_inode); 3243 old.inode->i_ctime = ext4_current_time(old.inode);
3123 ext4_mark_inode_dirty(handle, old_inode); 3244 ext4_mark_inode_dirty(handle, old.inode);
3124 3245
3125 /* 3246 /*
3126 * ok, that's it 3247 * ok, that's it
3127 */ 3248 */
3128 if (le32_to_cpu(old_de->inode) != old_inode->i_ino || 3249 ext4_rename_delete(handle, &old);
3129 old_de->name_len != old_dentry->d_name.len || 3250
3130 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || 3251 if (new.inode) {
3131 (retval = ext4_delete_entry(handle, old_dir, 3252 ext4_dec_count(handle, new.inode);
3132 old_de, old_bh)) == -ENOENT) { 3253 new.inode->i_ctime = ext4_current_time(new.inode);
3133 /* old_de could have moved from under us during htree split, so
3134 * make sure that we are deleting the right entry. We might
3135 * also be pointing to a stale entry in the unused part of
3136 * old_bh so just checking inum and the name isn't enough. */
3137 struct buffer_head *old_bh2;
3138 struct ext4_dir_entry_2 *old_de2;
3139
3140 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
3141 &old_de2, NULL);
3142 if (old_bh2) {
3143 retval = ext4_delete_entry(handle, old_dir,
3144 old_de2, old_bh2);
3145 brelse(old_bh2);
3146 }
3147 } 3254 }
3148 if (retval) { 3255 old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
3149 ext4_warning(old_dir->i_sb, 3256 ext4_update_dx_flag(old.dir);
3150 "Deleting old file (%lu), %d, error=%d", 3257 if (old.dir_bh) {
3151 old_dir->i_ino, old_dir->i_nlink, retval); 3258 retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
3152 } 3259 if (retval)
3153
3154 if (new_inode) {
3155 ext4_dec_count(handle, new_inode);
3156 new_inode->i_ctime = ext4_current_time(new_inode);
3157 }
3158 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
3159 ext4_update_dx_flag(old_dir);
3160 if (dir_bh) {
3161 parent_de->inode = cpu_to_le32(new_dir->i_ino);
3162 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
3163 if (!inlined) {
3164 if (is_dx(old_inode)) {
3165 retval = ext4_handle_dirty_dx_node(handle,
3166 old_inode,
3167 dir_bh);
3168 } else {
3169 retval = ext4_handle_dirty_dirent_node(handle,
3170 old_inode, dir_bh);
3171 }
3172 } else {
3173 retval = ext4_mark_inode_dirty(handle, old_inode);
3174 }
3175 if (retval) {
3176 ext4_std_error(old_dir->i_sb, retval);
3177 goto end_rename; 3260 goto end_rename;
3178 } 3261
3179 ext4_dec_count(handle, old_dir); 3262 ext4_dec_count(handle, old.dir);
3180 if (new_inode) { 3263 if (new.inode) {
3181 /* checked empty_dir above, can't have another parent, 3264 /* checked empty_dir above, can't have another parent,
3182 * ext4_dec_count() won't work for many-linked dirs */ 3265 * ext4_dec_count() won't work for many-linked dirs */
3183 clear_nlink(new_inode); 3266 clear_nlink(new.inode);
3184 } else { 3267 } else {
3185 ext4_inc_count(handle, new_dir); 3268 ext4_inc_count(handle, new.dir);
3186 ext4_update_dx_flag(new_dir); 3269 ext4_update_dx_flag(new.dir);
3187 ext4_mark_inode_dirty(handle, new_dir); 3270 ext4_mark_inode_dirty(handle, new.dir);
3188 } 3271 }
3189 } 3272 }
3190 ext4_mark_inode_dirty(handle, old_dir); 3273 ext4_mark_inode_dirty(handle, old.dir);
3191 if (new_inode) { 3274 if (new.inode) {
3192 ext4_mark_inode_dirty(handle, new_inode); 3275 ext4_mark_inode_dirty(handle, new.inode);
3193 if (!new_inode->i_nlink) 3276 if (!new.inode->i_nlink)
3194 ext4_orphan_add(handle, new_inode); 3277 ext4_orphan_add(handle, new.inode);
3195 } 3278 }
3196 retval = 0; 3279 retval = 0;
3197 3280
3198end_rename: 3281end_rename:
3199 brelse(dir_bh); 3282 brelse(old.dir_bh);
3200 brelse(old_bh); 3283 brelse(old.bh);
3201 brelse(new_bh); 3284 brelse(new.bh);
3202 if (handle) 3285 if (handle)
3203 ext4_journal_stop(handle); 3286 ext4_journal_stop(handle);
3204 return retval; 3287 return retval;
3205} 3288}
3206 3289
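The hunks above read against the ext4_renament helper introduced earlier in this series; its authoritative definition lives higher up in fs/ext4/namei.c and is not shown here. A sketch of the layout, reconstructed from the uses in this function:

    /* Sketch reconstructed from usage above; see namei.c for the real one. */
    struct ext4_renament {
    	struct inode		*dir;
    	struct dentry		*dentry;
    	struct inode		*inode;
    	bool			is_dir;
    	int			dir_nlink_delta;

    	/* entry for "dentry" */
    	struct buffer_head	*bh;
    	struct ext4_dir_entry_2	*de;
    	int			inlined;

    	/* entry for ".." of "inode", if it is a directory */
    	struct buffer_head	*dir_bh;
    	struct ext4_dir_entry_2	*parent_de;
    	int			dir_inlined;
    };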
+static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
+			     struct inode *new_dir, struct dentry *new_dentry)
+{
+	handle_t *handle = NULL;
+	struct ext4_renament old = {
+		.dir = old_dir,
+		.dentry = old_dentry,
+		.inode = old_dentry->d_inode,
+	};
+	struct ext4_renament new = {
+		.dir = new_dir,
+		.dentry = new_dentry,
+		.inode = new_dentry->d_inode,
+	};
+	u8 new_file_type;
+	int retval;
+
+	dquot_initialize(old.dir);
+	dquot_initialize(new.dir);
+
+	old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
+				 &old.de, &old.inlined);
+	/*
+	 * Check for inode number is _not_ due to possible IO errors.
+	 * We might rmdir the source, keep it as pwd of some process
+	 * and merrily kill the link to whatever was created under the
+	 * same name. Goodbye sticky bit ;-<
+	 */
+	retval = -ENOENT;
+	if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
+		goto end_rename;
+
+	new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+				 &new.de, &new.inlined);
+
+	/* RENAME_EXCHANGE case: old *and* new must both exist */
+	if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
+		goto end_rename;
+
+	handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
+		(2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+		 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
+		ext4_handle_sync(handle);
+
+	if (S_ISDIR(old.inode->i_mode)) {
+		old.is_dir = true;
+		retval = ext4_rename_dir_prepare(handle, &old);
+		if (retval)
+			goto end_rename;
+	}
+	if (S_ISDIR(new.inode->i_mode)) {
+		new.is_dir = true;
+		retval = ext4_rename_dir_prepare(handle, &new);
+		if (retval)
+			goto end_rename;
+	}
+
+	/*
+	 * Other than the special case of overwriting a directory, parents'
+	 * nlink only needs to be modified if this is a cross directory rename.
+	 */
+	if (old.dir != new.dir && old.is_dir != new.is_dir) {
+		old.dir_nlink_delta = old.is_dir ? -1 : 1;
+		new.dir_nlink_delta = -old.dir_nlink_delta;
+		retval = -EMLINK;
+		if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
+		    (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
+			goto end_rename;
+	}
+
+	new_file_type = new.de->file_type;
+	retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
+	if (retval)
+		goto end_rename;
+
+	retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
+	if (retval)
+		goto end_rename;
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+	 * rename.
+	 */
+	old.inode->i_ctime = ext4_current_time(old.inode);
+	new.inode->i_ctime = ext4_current_time(new.inode);
+	ext4_mark_inode_dirty(handle, old.inode);
+	ext4_mark_inode_dirty(handle, new.inode);
+
+	if (old.dir_bh) {
+		retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+		if (retval)
+			goto end_rename;
+	}
+	if (new.dir_bh) {
+		retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
+		if (retval)
+			goto end_rename;
+	}
+	ext4_update_dir_count(handle, &old);
+	ext4_update_dir_count(handle, &new);
+	retval = 0;
+
+end_rename:
+	brelse(old.dir_bh);
+	brelse(new.dir_bh);
+	brelse(old.bh);
+	brelse(new.bh);
+	if (handle)
+		ext4_journal_stop(handle);
+	return retval;
+}
+
+static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
+			unsigned int flags)
+{
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	if (flags & RENAME_EXCHANGE) {
+		return ext4_cross_rename(old_dir, old_dentry,
+					 new_dir, new_dentry);
+	}
+	/*
+	 * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE"
+	 * is equivalent to regular rename.
+	 */
+	return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
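From userspace, the two flag cases routed above are reached through the renameat2 system call. A minimal sketch, assuming a kernel with renameat2 wired up; glibc of this era lacks a wrapper, so the raw syscall is used, and the flag values are supplied as fallbacks in case the headers predate them:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef RENAME_NOREPLACE
    #define RENAME_NOREPLACE (1 << 0)
    #endif
    #ifndef RENAME_EXCHANGE
    #define RENAME_EXCHANGE  (1 << 1)
    #endif

    int main(void)
    {
    	/* Fails with EEXIST instead of silently replacing "b". */
    	if (syscall(SYS_renameat2, AT_FDCWD, "a", AT_FDCWD, "b",
    		    RENAME_NOREPLACE) == -1)
    		perror("RENAME_NOREPLACE");

    	/* Atomically swaps "a" and "b"; both must already exist. */
    	if (syscall(SYS_renameat2, AT_FDCWD, "a", AT_FDCWD, "b",
    		    RENAME_EXCHANGE) == -1)
    		perror("RENAME_EXCHANGE");
    	return 0;
    }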
 /*
  * directories can handle most operations...
  */
@@ -3218,6 +3435,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.mknod		= ext4_mknod,
 	.tmpfile	= ext4_tmpfile,
 	.rename		= ext4_rename,
+	.rename2	= ext4_rename2,
 	.setattr	= ext4_setattr,
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 710fed2377d4..f3c667091618 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -59,6 +59,7 @@ static struct kset *ext4_kset;
 static struct ext4_lazy_init *ext4_li_info;
 static struct mutex ext4_li_mtx;
 static struct ext4_features *ext4_feat;
+static int ext4_mballoc_ready;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
@@ -845,6 +846,10 @@ static void ext4_put_super(struct super_block *sb)
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
+	if (sbi->s_mb_cache) {
+		ext4_xattr_destroy_cache(sbi->s_mb_cache);
+		sbi->s_mb_cache = NULL;
+	}
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 	sb->s_fs_info = NULL;
@@ -940,7 +945,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
 					     sizeof(struct ext4_inode_info),
@@ -3575,6 +3580,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			 "feature flags set on rev 0 fs, "
 			 "running e2fsck is recommended");
 
+	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
+		set_opt2(sb, HURD_COMPAT);
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+					      EXT4_FEATURE_INCOMPAT_64BIT)) {
+			ext4_msg(sb, KERN_ERR,
+				 "The Hurd can't support 64-bit file systems");
+			goto failed_mount;
+		}
+	}
+
 	if (IS_EXT2_SB(sb)) {
 		if (ext2_feature_set_ok(sb))
 			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
@@ -4010,6 +4025,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
 
 no_journal:
+	if (ext4_mballoc_ready) {
+		sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
+		if (!sbi->s_mb_cache) {
+			ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+			goto failed_mount_wq;
+		}
+	}
+
 	/*
 	 * Get the # of file system overhead blocks from the
 	 * superblock if present.
@@ -4835,6 +4858,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	if (*flags & MS_RDONLY) {
+		err = sync_filesystem(sb);
+		if (err < 0)
+			goto restore_opts;
 		err = dquot_suspend(sb, -1);
 		if (err < 0)
 			goto restore_opts;
@@ -5516,11 +5542,9 @@ static int __init ext4_init_fs(void)
 
 	err = ext4_init_mballoc();
 	if (err)
-		goto out3;
-
-	err = ext4_init_xattr();
-	if (err)
 		goto out2;
+	else
+		ext4_mballoc_ready = 1;
 	err = init_inodecache();
 	if (err)
 		goto out1;
@@ -5536,10 +5560,9 @@ out:
 	unregister_as_ext3();
 	destroy_inodecache();
 out1:
-	ext4_exit_xattr();
-out2:
+	ext4_mballoc_ready = 0;
 	ext4_exit_mballoc();
-out3:
+out2:
 	ext4_exit_feat_adverts();
 out4:
 	if (ext4_proc_root)
@@ -5562,7 +5585,6 @@ static void __exit ext4_exit_fs(void)
 	unregister_as_ext3();
 	unregister_filesystem(&ext4_fs_type);
 	destroy_inodecache();
-	ext4_exit_xattr();
 	ext4_exit_mballoc();
 	ext4_exit_feat_adverts();
 	remove_proc_entry("fs/ext4", NULL);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e175e94116ac..1f5cf5880718 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -81,7 +81,7 @@
 # define ea_bdebug(bh, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
-static void ext4_xattr_cache_insert(struct buffer_head *);
+static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
 static struct buffer_head *ext4_xattr_cache_find(struct inode *,
 						 struct ext4_xattr_header *,
 						 struct mb_cache_entry **);
@@ -90,8 +90,6 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,
 static int ext4_xattr_list(struct dentry *dentry, char *buffer,
 			   size_t buffer_size);
 
-static struct mb_cache *ext4_xattr_cache;
-
 static const struct xattr_handler *ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		= &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 	NULL
 };
 
+#define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
+				inode->i_sb->s_fs_info)->s_mb_cache)
+
 static __le32 ext4_xattr_block_csum(struct inode *inode,
 				    sector_t block_nr,
 				    struct ext4_xattr_header *hdr)
@@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 	struct ext4_xattr_entry *entry;
 	size_t size;
 	int error;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
 		  name_index, name, buffer, (long)buffer_size);
@@ -286,7 +288,7 @@ bad_block:
 		error = -EIO;
 		goto cleanup;
 	}
-	ext4_xattr_cache_insert(bh);
+	ext4_xattr_cache_insert(ext4_mb_cache, bh);
 	entry = BFIRST(bh);
 	error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
 	if (error == -EIO)
@@ -409,6 +411,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	struct inode *inode = dentry->d_inode;
 	struct buffer_head *bh = NULL;
 	int error;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
 		  buffer, (long)buffer_size);
@@ -430,7 +433,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 		error = -EIO;
 		goto cleanup;
 	}
-	ext4_xattr_cache_insert(bh);
+	ext4_xattr_cache_insert(ext4_mb_cache, bh);
 	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
 
 cleanup:
@@ -526,8 +529,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 {
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
-	ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+	ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
 	error = ext4_journal_get_write_access(handle, bh);
 	if (error)
 		goto out;
@@ -567,12 +571,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
 				    size_t *min_offs, void *base, int *total)
 {
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
-		*total += EXT4_XATTR_LEN(last->e_name_len);
 		if (!last->e_value_block && last->e_value_size) {
 			size_t offs = le16_to_cpu(last->e_value_offs);
 			if (offs < *min_offs)
 				*min_offs = offs;
 		}
+		if (total)
+			*total += EXT4_XATTR_LEN(last->e_name_len);
 	}
 	return (*min_offs - ((void *)last - base) - sizeof(__u32));
 }
@@ -745,13 +750,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 	struct ext4_xattr_search *s = &bs->s;
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
 	if (i->value && i->value_len > sb->s_blocksize)
 		return -ENOSPC;
 	if (s->base) {
-		ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+		ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
 					bs->bh->b_blocknr);
 		error = ext4_journal_get_write_access(handle, bs->bh);
 		if (error)
@@ -769,7 +775,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			if (!IS_LAST_ENTRY(s->first))
 				ext4_xattr_rehash(header(s->base),
 						  s->here);
-			ext4_xattr_cache_insert(bs->bh);
+			ext4_xattr_cache_insert(ext4_mb_cache,
+						bs->bh);
 		}
 		unlock_buffer(bs->bh);
 		if (error == -EIO)
@@ -905,7 +912,7 @@ getblk_failed:
 			memcpy(new_bh->b_data, s->base, new_bh->b_size);
 			set_buffer_uptodate(new_bh);
 			unlock_buffer(new_bh);
-			ext4_xattr_cache_insert(new_bh);
+			ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
 			error = ext4_handle_dirty_xattr_block(handle,
 							      inode, new_bh);
 			if (error)
@@ -1228,7 +1235,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	struct ext4_xattr_block_find *bs = NULL;
 	char *buffer = NULL, *b_entry_name = NULL;
 	size_t min_offs, free;
-	int total_ino, total_blk;
+	int total_ino;
 	void *base, *start, *end;
 	int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
 	int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
@@ -1286,8 +1293,7 @@ retry:
 		first = BFIRST(bh);
 		end = bh->b_data + bh->b_size;
 		min_offs = end - base;
-		free = ext4_xattr_free_space(first, &min_offs, base,
-					     &total_blk);
+		free = ext4_xattr_free_space(first, &min_offs, base, NULL);
 		if (free < new_extra_isize) {
 			if (!tried_min_extra_isize && s_min_extra_isize) {
 				tried_min_extra_isize++;
@@ -1495,13 +1501,13 @@ ext4_xattr_put_super(struct super_block *sb)
  * Returns 0, or a negative error number on failure.
  */
 static void
-ext4_xattr_cache_insert(struct buffer_head *bh)
+ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
 {
 	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
 	struct mb_cache_entry *ce;
 	int error;
 
-	ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS);
+	ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
 	if (!ce) {
 		ea_bdebug(bh, "out of memory");
 		return;
@@ -1573,12 +1579,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
 {
 	__u32 hash = le32_to_cpu(header->h_hash);
 	struct mb_cache_entry *ce;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 	if (!header->h_hash)
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-	ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
+	ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
 				       hash);
 	while (ce) {
 		struct buffer_head *bh;
@@ -1676,19 +1683,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
 
 #undef BLOCK_HASH_SHIFT
 
-int __init
-ext4_init_xattr(void)
+#define	HASH_BUCKET_BITS	10
+
+struct mb_cache *
+ext4_xattr_create_cache(char *name)
 {
-	ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
-	if (!ext4_xattr_cache)
-		return -ENOMEM;
-	return 0;
+	return mb_cache_create(name, HASH_BUCKET_BITS);
 }
 
-void
-ext4_exit_xattr(void)
+void ext4_xattr_destroy_cache(struct mb_cache *cache)
 {
-	if (ext4_xattr_cache)
-		mb_cache_destroy(ext4_xattr_cache);
-	ext4_xattr_cache = NULL;
+	if (cache)
+		mb_cache_destroy(cache);
 }
 
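The thrust of these xattr.c hunks is easier to see outside diff context: mb_cache entries are keyed by (block device, block number), so a per-superblock cache holds exactly one filesystem's xattr blocks and can be torn down at unmount, instead of one module-global cache mixing every mounted ext4. A minimal sketch of the accessor pattern the patch settles on, mirroring the EXT4_GET_MB_CACHE() macro above (the helper name here is hypothetical):

    /* Sketch only: same dereference EXT4_GET_MB_CACHE() performs. */
    static struct mb_cache *example_mb_cache_of(struct inode *inode)
    {
    	struct ext4_sb_info *sbi = inode->i_sb->s_fs_info;

    	/* created in ext4_fill_super(), freed in ext4_put_super() */
    	return sbi->s_mb_cache;
    }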
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 819d6398833f..29bedf5589f6 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -110,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *);
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
 
-extern int __init ext4_init_xattr(void);
-extern void ext4_exit_xattr(void);
-
 extern const struct xattr_handler *ext4_xattr_handlers[];
 
 extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
@@ -124,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 				struct ext4_xattr_info *i,
 				struct ext4_xattr_ibody_find *is);
 
+extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern void ext4_xattr_destroy_cache(struct mb_cache *);
+
 #ifdef CONFIG_EXT4_FS_SECURITY
 extern int ext4_init_security(handle_t *handle, struct inode *inode,
 			      struct inode *dir, const struct qstr *qstr);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 4d67ed736dca..28cea76d78c6 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -260,7 +260,7 @@ void f2fs_evict_inode(struct inode *inode)
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
 	trace_f2fs_evict_inode(inode);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
 			inode->i_ino == F2FS_META_INO(sbi))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1a85f83abd53..856bdf994c0a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -568,6 +568,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	struct f2fs_mount_info org_mount_opt;
 	int err, active_logs;
 
+	sync_filesystem(sb);
+
 	/*
 	 * Save the old mount options in case we
 	 * need to restore them.
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 854b578f6695..b3361fe2bcb5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(fat_build_inode);
 
 static void fat_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	if (!inode->i_nlink) {
 		inode->i_size = 0;
 		fat_truncate_blocks(inode, 0);
@@ -635,6 +635,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	*flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
 
+	sync_filesystem(sb);
+
 	/* make sure we update state on remount. */
 	new_rdonly = *flags & MS_RDONLY;
 	if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ef6866592a0f..9ead1596399a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -272,9 +272,19 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	case F_SETFL:
 		err = setfl(fd, filp, arg);
 		break;
+#if BITS_PER_LONG != 32
+	/* 32-bit arches must use fcntl64() */
+	case F_GETLKP:
+#endif
 	case F_GETLK:
-		err = fcntl_getlk(filp, (struct flock __user *) arg);
+		err = fcntl_getlk(filp, cmd, (struct flock __user *) arg);
 		break;
+#if BITS_PER_LONG != 32
+	/* 32-bit arches must use fcntl64() */
+	case F_SETLKP:
+	case F_SETLKPW:
+#endif
+	/* Fallthrough */
 	case F_SETLK:
 	case F_SETLKW:
 		err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg);
@@ -388,17 +398,20 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 		goto out1;
 
 	switch (cmd) {
-	case F_GETLK64:
-		err = fcntl_getlk64(f.file, (struct flock64 __user *) arg);
-		break;
-	case F_SETLK64:
-	case F_SETLKW64:
-		err = fcntl_setlk64(fd, f.file, cmd,
-				(struct flock64 __user *) arg);
-		break;
-	default:
-		err = do_fcntl(fd, cmd, arg, f.file);
-		break;
+	case F_GETLK64:
+	case F_GETLKP:
+		err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg);
+		break;
+	case F_SETLK64:
+	case F_SETLKW64:
+	case F_SETLKP:
+	case F_SETLKPW:
+		err = fcntl_setlk64(fd, f.file, cmd,
+				(struct flock64 __user *) arg);
+		break;
+	default:
+		err = do_fcntl(fd, cmd, arg, f.file);
+		break;
 	}
 out1:
 	fdput(f);
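The new F_GETLKP/F_SETLKP/F_SETLKPW commands are file-private locks, whose ownership follows the open file description rather than the process (they were later renamed to the F_OFD_* constants). A userspace sketch, assuming the command values from the original patch (36-38) when the installed headers lack them:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    #ifndef F_SETLKP
    #define F_GETLKP	36	/* values assumed from the patch series */
    #define F_SETLKP	37
    #define F_SETLKPW	38
    #endif

    int main(void)
    {
    	struct flock fl;
    	int fd = open("lockfile", O_RDWR | O_CREAT, 0644);

    	if (fd == -1)
    		return 1;
    	memset(&fl, 0, sizeof(fl));	/* l_pid must stay 0 for these commands */
    	fl.l_type = F_WRLCK;		/* whole-file write lock */
    	fl.l_whence = SEEK_SET;

    	/* Blocks until granted; the lock belongs to this fd, not the pid. */
    	if (fcntl(fd, F_SETLKPW, &fl) == -1)
    		perror("F_SETLKPW");

    	fl.l_type = F_UNLCK;
    	fcntl(fd, F_SETLKP, &fl);
    	close(fd);
    	return 0;
    }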
diff --git a/fs/file.c b/fs/file.c
index eb56a13dab3e..b61293badfb1 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -497,7 +497,7 @@ repeat:
 	error = fd;
 #if 1
 	/* Sanity check */
-	if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
+	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
 		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
 		rcu_assign_pointer(fdt->fd[fd], NULL);
 	}
diff --git a/fs/file_table.c b/fs/file_table.c
index 5b24008ea4f6..01071c4d752e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -235,7 +235,7 @@ static void __fput(struct file *file)
 	 * in the file cleanup chain.
 	 */
 	eventpoll_release(file);
-	locks_remove_flock(file);
+	locks_remove_file(file);
 
 	if (unlikely(file->f_flags & FASYNC)) {
 		if (file->f_op->fasync)
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 92567d95ba6a..5797d45a78cb 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -121,6 +121,7 @@ int unregister_filesystem(struct file_system_type * fs)
 
 EXPORT_SYMBOL(unregister_filesystem);
 
+#ifdef CONFIG_SYSFS_SYSCALL
 static int fs_index(const char __user * __name)
 {
 	struct file_system_type * tmp;
@@ -199,6 +200,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
 	}
 	return retval;
 }
+#endif
 
 int __init get_filesystem_list(char *buf)
 {
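The code now gated by CONFIG_SYSFS_SYSCALL implements the obsolete sysfs(2) syscall (unrelated to the sysfs filesystem), which enumerates registered filesystem types. A sketch of what a caller loses when the option is disabled; uses only the raw syscall since libc wrappers are uncommon:

    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
    	char name[64];
    	/* option 3: number of registered filesystem types */
    	long n = syscall(SYS_sysfs, 3);

    	/* option 2: translate index -> type name into the buffer */
    	for (long i = 0; i < n; i++)
    		if (syscall(SYS_sysfs, 2, i, name) == 0)
    			printf("%s\n", name);
    	return 0;
    }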
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index f47df72cef17..363e3ae25f6b 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -354,7 +354,7 @@ static void vxfs_i_callback(struct rcu_head *head)
 void
 vxfs_evict_inode(struct inode *ip)
 {
-	truncate_inode_pages(&ip->i_data, 0);
+	truncate_inode_pages_final(&ip->i_data);
 	clear_inode(ip);
 	call_rcu(&ip->i_rcu, vxfs_i_callback);
 }
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 25d4099a4aea..99c7f0a37af4 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -192,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
  * vxfs_lookup - lookup pathname component
  * @dip:	dir in which we lookup
  * @dp:		dentry we lookup
- * @nd:		lookup nameidata
+ * @flags:	lookup flags
  *
  * Description:
  *   vxfs_lookup tries to lookup the pathname component described
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index e37eb274e492..7ca8c75d50d3 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
 
 static int vxfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_RDONLY;
 	return 0;
 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d754e3cf99a8..be568b7311d6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -89,16 +89,31 @@ static inline struct inode *wb_inode(struct list_head *head)
 #define CREATE_TRACE_POINTS
 #include <trace/events/writeback.h>
 
+EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
+
+static void bdi_wakeup_thread(struct backing_dev_info *bdi)
+{
+	spin_lock_bh(&bdi->wb_lock);
+	if (test_bit(BDI_registered, &bdi->state))
+		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+	spin_unlock_bh(&bdi->wb_lock);
+}
+
 static void bdi_queue_work(struct backing_dev_info *bdi,
 			   struct wb_writeback_work *work)
 {
 	trace_writeback_queue(bdi, work);
 
 	spin_lock_bh(&bdi->wb_lock);
+	if (!test_bit(BDI_registered, &bdi->state)) {
+		if (work->done)
+			complete(work->done);
+		goto out_unlock;
+	}
 	list_add_tail(&work->list, &bdi->work_list);
-	spin_unlock_bh(&bdi->wb_lock);
-
 	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+out_unlock:
+	spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
@@ -114,7 +129,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
 		trace_writeback_nowork(bdi);
-		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+		bdi_wakeup_thread(bdi);
 		return;
 	}
 
@@ -161,7 +176,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
 	 * writeback as soon as there is no other work to do.
 	 */
 	trace_writeback_wake_background(bdi);
-	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+	bdi_wakeup_thread(bdi);
 }
 
 /*
@@ -1017,7 +1032,7 @@ void bdi_writeback_workfn(struct work_struct *work)
 	current->flags |= PF_SWAPWRITE;
 
 	if (likely(!current_is_workqueue_rescuer() ||
-		   list_empty(&bdi->bdi_list))) {
+		   !test_bit(BDI_registered, &bdi->state))) {
 		/*
 		 * The normal path.  Keep writing back @bdi until its
 		 * work_list is empty.  Note that this path is also taken
@@ -1039,10 +1054,10 @@ void bdi_writeback_workfn(struct work_struct *work)
 		trace_writeback_pages_written(pages_written);
 	}
 
-	if (!list_empty(&bdi->work_list) ||
-	    (wb_has_dirty_io(wb) && dirty_writeback_interval))
-		queue_delayed_work(bdi_wq, &wb->dwork,
-			msecs_to_jiffies(dirty_writeback_interval * 10));
+	if (!list_empty(&bdi->work_list))
+		mod_delayed_work(bdi_wq, &wb->dwork, 0);
+	else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+		bdi_wakeup_thread_delayed(bdi);
 
 	current->flags &= ~PF_SWAPWRITE;
 }
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b96a49b37d66..23e363f38302 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -95,7 +95,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
 	struct iovec iov = { .iov_base = buf, .iov_len = count };
 	struct fuse_io_priv io = { .async = 0, .file = file };
 
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 0);
+	return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE);
 }
 
 static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -109,7 +109,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
 	 * No locking or generic_write_checks(), the server is
 	 * responsible for locking and sanity checks.
 	 */
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 1);
+	return fuse_direct_io(&io, &iov, 1, count, &pos,
+			      FUSE_DIO_WRITE | FUSE_DIO_CUSE);
 }
 
 static int cuse_open(struct inode *inode, struct file *file)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 1d1292c581c3..5b4e035b364c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -839,6 +839,14 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
 			  struct kstat *stat)
 {
 	unsigned int blkbits;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	/* see the comment in fuse_change_attributes() */
+	if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
+		attr->size = i_size_read(inode);
+		attr->mtime = inode->i_mtime.tv_sec;
+		attr->mtimensec = inode->i_mtime.tv_nsec;
+	}
 
 	stat->dev = inode->i_sb->s_dev;
 	stat->ino = attr->ino;
@@ -1477,12 +1485,16 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
 			       FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
 }
 
-static bool update_mtime(unsigned ivalid)
+static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
 {
 	/* Always update if mtime is explicitly set */
 	if (ivalid & ATTR_MTIME_SET)
 		return true;
 
+	/* Or if kernel i_mtime is the official one */
+	if (trust_local_mtime)
+		return true;
+
 	/* If it's an open(O_TRUNC) or an ftruncate(), don't update */
 	if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))
 		return false;
@@ -1491,7 +1503,8 @@ static bool update_mtime(unsigned ivalid)
 	return true;
 }
 
-static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
+static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
+			   bool trust_local_mtime)
 {
 	unsigned ivalid = iattr->ia_valid;
 
@@ -1510,11 +1523,11 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
 		if (!(ivalid & ATTR_ATIME_SET))
 			arg->valid |= FATTR_ATIME_NOW;
 	}
-	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) {
+	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) {
 		arg->valid |= FATTR_MTIME;
 		arg->mtime = iattr->ia_mtime.tv_sec;
 		arg->mtimensec = iattr->ia_mtime.tv_nsec;
-		if (!(ivalid & ATTR_MTIME_SET))
+		if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime)
 			arg->valid |= FATTR_MTIME_NOW;
 	}
 }
@@ -1563,6 +1576,63 @@ void fuse_release_nowrite(struct inode *inode)
 	spin_unlock(&fc->lock);
 }
 
+static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
+			      struct inode *inode,
+			      struct fuse_setattr_in *inarg_p,
+			      struct fuse_attr_out *outarg_p)
+{
+	req->in.h.opcode = FUSE_SETATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*inarg_p);
+	req->in.args[0].value = inarg_p;
+	req->out.numargs = 1;
+	if (fc->minor < 9)
+		req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+	else
+		req->out.args[0].size = sizeof(*outarg_p);
+	req->out.args[0].value = outarg_p;
+}
+
+/*
+ * Flush inode->i_mtime to the server
+ */
+int fuse_flush_mtime(struct file *file, bool nofail)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = NULL;
+	struct fuse_setattr_in inarg;
+	struct fuse_attr_out outarg;
+	int err;
+
+	if (nofail) {
+		req = fuse_get_req_nofail_nopages(fc, file);
+	} else {
+		req = fuse_get_req_nopages(fc);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+	}
+
+	memset(&inarg, 0, sizeof(inarg));
+	memset(&outarg, 0, sizeof(outarg));
+
+	inarg.valid |= FATTR_MTIME;
+	inarg.mtime = inode->i_mtime.tv_sec;
+	inarg.mtimensec = inode->i_mtime.tv_nsec;
+
+	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
+	fuse_request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+
+	if (!err)
+		clear_bit(FUSE_I_MTIME_DIRTY, &fi->state);
+
+	return err;
+}
+
 /*
  * Set attributes, and at the same time refresh them.
  *
@@ -1580,8 +1650,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	struct fuse_setattr_in inarg;
 	struct fuse_attr_out outarg;
 	bool is_truncate = false;
+	bool is_wb = fc->writeback_cache;
 	loff_t oldsize;
 	int err;
+	bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode);
 
 	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
 		attr->ia_valid |= ATTR_FORCE;
@@ -1610,7 +1682,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 
 	memset(&inarg, 0, sizeof(inarg));
 	memset(&outarg, 0, sizeof(outarg));
-	iattr_to_fattr(attr, &inarg);
+	iattr_to_fattr(attr, &inarg, trust_local_mtime);
 	if (file) {
 		struct fuse_file *ff = file->private_data;
 		inarg.valid |= FATTR_FH;
@@ -1621,17 +1693,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 		inarg.valid |= FATTR_LOCKOWNER;
 		inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
 	}
-	req->in.h.opcode = FUSE_SETATTR;
-	req->in.h.nodeid = get_node_id(inode);
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(inarg);
-	req->in.args[0].value = &inarg;
-	req->out.numargs = 1;
-	if (fc->minor < 9)
-		req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
-	else
-		req->out.args[0].size = sizeof(outarg);
-	req->out.args[0].value = &outarg;
+	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
 	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
@@ -1648,10 +1710,18 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	}
 
 	spin_lock(&fc->lock);
+	/* the kernel maintains i_mtime locally */
+	if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) {
+		inode->i_mtime = attr->ia_mtime;
+		clear_bit(FUSE_I_MTIME_DIRTY, &fi->state);
+	}
+
 	fuse_change_attributes_common(inode, &outarg.attr,
 				      attr_timeout(&outarg));
 	oldsize = inode->i_size;
-	i_size_write(inode, outarg.attr.size);
+	/* see the comment in fuse_change_attributes() */
+	if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+		i_size_write(inode, outarg.attr.size);
 
 	if (is_truncate) {
 		/* NOTE: this may release/reacquire fc->lock */
@@ -1663,7 +1733,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	 * Only call invalidate_inode_pages2() after removing
 	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
 	 */
-	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+	if ((is_truncate || !is_wb) &&
+	    S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
 		truncate_pagecache(inode, outarg.attr.size);
 		invalidate_inode_pages2(inode->i_mapping);
 	}
@@ -1875,6 +1946,17 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 	return err;
 }
 
+static int fuse_update_time(struct inode *inode, struct timespec *now,
+			    int flags)
+{
+	if (flags & S_MTIME) {
+		inode->i_mtime = *now;
+		set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state);
+		BUG_ON(!S_ISREG(inode->i_mode));
+	}
+	return 0;
+}
+
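Taken together, the hunks above make the kernel's i_mtime authoritative for regular files in writeback_cache mode and only push it to the server at sync points. A condensed sketch of the lifecycle, not part of the patch itself, using the functions it introduces:

    /* Hypothetical condensed view of the writeback_cache mtime flow. */
    static void example_mtime_lifecycle(struct file *file, struct timespec now)
    {
    	struct inode *inode = file_inode(file);

    	/* 1. page dirtying: the VFS calls ->update_time(), which only marks */
    	fuse_update_time(inode, &now, S_MTIME);

    	/* 2. fsync()/release() notice the dirty bit and push a FUSE_SETATTR
    	 *    carrying FATTR_MTIME, clearing the bit on success */
    	if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state))
    		fuse_flush_mtime(file, false);
    }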
 static const struct inode_operations fuse_dir_inode_operations = {
 	.lookup		= fuse_lookup,
 	.mkdir		= fuse_mkdir,
@@ -1914,6 +1996,7 @@ static const struct inode_operations fuse_common_inode_operations = {
 	.getxattr	= fuse_getxattr,
 	.listxattr	= fuse_listxattr,
 	.removexattr	= fuse_removexattr,
+	.update_time	= fuse_update_time,
 };
 
 static const struct inode_operations fuse_symlink_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 77bcc303c3ae..65df7d8be4f5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -188,6 +188,22 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
188} 188}
189EXPORT_SYMBOL_GPL(fuse_do_open); 189EXPORT_SYMBOL_GPL(fuse_do_open);
190 190
191static void fuse_link_write_file(struct file *file)
192{
193 struct inode *inode = file_inode(file);
194 struct fuse_conn *fc = get_fuse_conn(inode);
195 struct fuse_inode *fi = get_fuse_inode(inode);
196 struct fuse_file *ff = file->private_data;
197 /*
198 * file may be written through mmap, so chain it onto the
199 * inodes's write_file list
200 */
201 spin_lock(&fc->lock);
202 if (list_empty(&ff->write_entry))
203 list_add(&ff->write_entry, &fi->write_files);
204 spin_unlock(&fc->lock);
205}
206
191void fuse_finish_open(struct inode *inode, struct file *file) 207void fuse_finish_open(struct inode *inode, struct file *file)
192{ 208{
193 struct fuse_file *ff = file->private_data; 209 struct fuse_file *ff = file->private_data;
@@ -208,6 +224,8 @@ void fuse_finish_open(struct inode *inode, struct file *file)
208 spin_unlock(&fc->lock); 224 spin_unlock(&fc->lock);
209 fuse_invalidate_attr(inode); 225 fuse_invalidate_attr(inode);
210 } 226 }
227 if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
228 fuse_link_write_file(file);
211} 229}
212 230
213int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 231int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -292,6 +310,15 @@ static int fuse_open(struct inode *inode, struct file *file)
292 310
293static int fuse_release(struct inode *inode, struct file *file) 311static int fuse_release(struct inode *inode, struct file *file)
294{ 312{
313 struct fuse_conn *fc = get_fuse_conn(inode);
314
315 /* see fuse_vma_close() for !writeback_cache case */
316 if (fc->writeback_cache)
317 filemap_write_and_wait(file->f_mapping);
318
319 if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state))
320 fuse_flush_mtime(file, true);
321
295 fuse_release_common(file, FUSE_RELEASE); 322 fuse_release_common(file, FUSE_RELEASE);
296 323
297 /* return value is ignored by VFS */ 324 /* return value is ignored by VFS */
@@ -333,12 +360,13 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
333} 360}
334 361
335/* 362/*
336 * Check if page is under writeback 363 * Check if any page in a range is under writeback
337 * 364 *
338 * This is currently done by walking the list of writepage requests 365 * This is currently done by walking the list of writepage requests
339 * for the inode, which can be pretty inefficient. 366 * for the inode, which can be pretty inefficient.
340 */ 367 */
341static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) 368static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
369 pgoff_t idx_to)
342{ 370{
343 struct fuse_conn *fc = get_fuse_conn(inode); 371 struct fuse_conn *fc = get_fuse_conn(inode);
344 struct fuse_inode *fi = get_fuse_inode(inode); 372 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -351,8 +379,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
351 379
352 BUG_ON(req->inode != inode); 380 BUG_ON(req->inode != inode);
353 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; 381 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
354 if (curr_index <= index && 382 if (idx_from < curr_index + req->num_pages &&
355 index < curr_index + req->num_pages) { 383 curr_index <= idx_to) {
356 found = true; 384 found = true;
357 break; 385 break;
358 } 386 }
@@ -362,6 +390,11 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
362 return found; 390 return found;
363} 391}
364 392
393static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
394{
395 return fuse_range_is_writeback(inode, index, index);
396}
397
365/* 398/*
366 * Wait for page writeback to be completed. 399 * Wait for page writeback to be completed.
367 * 400 *
@@ -376,6 +409,21 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
376 return 0; 409 return 0;
377} 410}
378 411
412/*
413 * Wait for all pending writepages on the inode to finish.
414 *
415 * This is currently done by blocking further writes with FUSE_NOWRITE
416 * and waiting for all sent writes to complete.
417 *
418 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
419 * could conflict with truncation.
420 */
421static void fuse_sync_writes(struct inode *inode)
422{
423 fuse_set_nowrite(inode);
424 fuse_release_nowrite(inode);
425}
426
379static int fuse_flush(struct file *file, fl_owner_t id) 427static int fuse_flush(struct file *file, fl_owner_t id)
380{ 428{
381 struct inode *inode = file_inode(file); 429 struct inode *inode = file_inode(file);
@@ -391,6 +439,14 @@ static int fuse_flush(struct file *file, fl_owner_t id)
391 if (fc->no_flush) 439 if (fc->no_flush)
392 return 0; 440 return 0;
393 441
442 err = filemap_write_and_wait(file->f_mapping);
443 if (err)
444 return err;
445
446 mutex_lock(&inode->i_mutex);
447 fuse_sync_writes(inode);
448 mutex_unlock(&inode->i_mutex);
449
394 req = fuse_get_req_nofail_nopages(fc, file); 450 req = fuse_get_req_nofail_nopages(fc, file);
395 memset(&inarg, 0, sizeof(inarg)); 451 memset(&inarg, 0, sizeof(inarg));
396 inarg.fh = ff->fh; 452 inarg.fh = ff->fh;
@@ -411,21 +467,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
411 return err; 467 return err;
412} 468}
413 469
414/*
415 * Wait for all pending writepages on the inode to finish.
416 *
417 * This is currently done by blocking further writes with FUSE_NOWRITE
418 * and waiting for all sent writes to complete.
419 *
420 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
421 * could conflict with truncation.
422 */
423static void fuse_sync_writes(struct inode *inode)
424{
425 fuse_set_nowrite(inode);
426 fuse_release_nowrite(inode);
427}
428
429int fuse_fsync_common(struct file *file, loff_t start, loff_t end, 470int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
430 int datasync, int isdir) 471 int datasync, int isdir)
431{ 472{
@@ -459,6 +500,12 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
459 500
460 fuse_sync_writes(inode); 501 fuse_sync_writes(inode);
461 502
503 if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) {
504 int err = fuse_flush_mtime(file, false);
505 if (err)
506 goto out;
507 }
508
462 req = fuse_get_req_nopages(fc); 509 req = fuse_get_req_nopages(fc);
463 if (IS_ERR(req)) { 510 if (IS_ERR(req)) {
464 err = PTR_ERR(req); 511 err = PTR_ERR(req);
@@ -655,7 +702,33 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
655 spin_unlock(&fc->lock); 702 spin_unlock(&fc->lock);
656} 703}
657 704
658static int fuse_readpage(struct file *file, struct page *page) 705static void fuse_short_read(struct fuse_req *req, struct inode *inode,
706 u64 attr_ver)
707{
708 size_t num_read = req->out.args[0].size;
709 struct fuse_conn *fc = get_fuse_conn(inode);
710
711 if (fc->writeback_cache) {
712 /*
713 * A hole in the file. Some data after the hole is already in the page
714 * cache, but has not reached the client fs yet, so the hole is not
715 * present there.
716 */
717 int i;
718 int start_idx = num_read >> PAGE_CACHE_SHIFT;
719 size_t off = num_read & (PAGE_CACHE_SIZE - 1);
720
721 for (i = start_idx; i < req->num_pages; i++) {
722 zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
723 off = 0;
724 }
725 } else {
726 loff_t pos = page_offset(req->pages[0]) + num_read;
727 fuse_read_update_size(inode, pos, attr_ver);
728 }
729}
730
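The writeback_cache branch above zeroes everything from byte num_read onward across the remaining pages: the first partial page from offset num_read modulo the page size, and every later page entirely. A standalone model of that arithmetic, with plain buffers standing in for page cache pages:

/* Sketch of the fuse_short_read() zeroing: only num_read bytes of a
 * multi-page read came back, so clear the rest of the pages. */
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

static void zero_tail(unsigned char pages[][PAGE_SIZE], int num_pages,
                      size_t num_read)
{
        int start_idx = num_read / PAGE_SIZE;   /* first partial page */
        size_t off = num_read % PAGE_SIZE;      /* valid bytes within it */

        for (int i = start_idx; i < num_pages; i++) {
                memset(&pages[i][off], 0, PAGE_SIZE - off);
                off = 0;                        /* later pages: whole page */
        }
}

int main(void)
{
        static unsigned char pages[3][PAGE_SIZE];
        memset(pages, 0xff, sizeof(pages));
        zero_tail(pages, 3, 5000);    /* page 1 zeroed from byte 904 on */
        printf("page1[903]=%u page1[904]=%u\n", pages[1][903], pages[1][904]);
        return 0;
}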
731static int fuse_do_readpage(struct file *file, struct page *page)
659{ 732{
660 struct fuse_io_priv io = { .async = 0, .file = file }; 733 struct fuse_io_priv io = { .async = 0, .file = file };
661 struct inode *inode = page->mapping->host; 734 struct inode *inode = page->mapping->host;
@@ -667,10 +740,6 @@ static int fuse_readpage(struct file *file, struct page *page)
667 u64 attr_ver; 740 u64 attr_ver;
668 int err; 741 int err;
669 742
670 err = -EIO;
671 if (is_bad_inode(inode))
672 goto out;
673
674 /* 743 /*
675 * Page writeback can extend beyond the lifetime of the 744 * Page writeback can extend beyond the lifetime of the
676 * page-cache page, so make sure we read a properly synced 745 * page-cache page, so make sure we read a properly synced
@@ -679,9 +748,8 @@ static int fuse_readpage(struct file *file, struct page *page)
679 fuse_wait_on_page_writeback(inode, page->index); 748 fuse_wait_on_page_writeback(inode, page->index);
680 749
681 req = fuse_get_req(fc, 1); 750 req = fuse_get_req(fc, 1);
682 err = PTR_ERR(req);
683 if (IS_ERR(req)) 751 if (IS_ERR(req))
684 goto out; 752 return PTR_ERR(req);
685 753
686 attr_ver = fuse_get_attr_version(fc); 754 attr_ver = fuse_get_attr_version(fc);
687 755
@@ -692,18 +760,32 @@ static int fuse_readpage(struct file *file, struct page *page)
692 req->page_descs[0].length = count; 760 req->page_descs[0].length = count;
693 num_read = fuse_send_read(req, &io, pos, count, NULL); 761 num_read = fuse_send_read(req, &io, pos, count, NULL);
694 err = req->out.h.error; 762 err = req->out.h.error;
695 fuse_put_request(fc, req);
696 763
697 if (!err) { 764 if (!err) {
698 /* 765 /*
699 * Short read means EOF. If file size is larger, truncate it 766 * Short read means EOF. If file size is larger, truncate it
700 */ 767 */
701 if (num_read < count) 768 if (num_read < count)
702 fuse_read_update_size(inode, pos + num_read, attr_ver); 769 fuse_short_read(req, inode, attr_ver);
703 770
704 SetPageUptodate(page); 771 SetPageUptodate(page);
705 } 772 }
706 773
774 fuse_put_request(fc, req);
775
776 return err;
777}
778
779static int fuse_readpage(struct file *file, struct page *page)
780{
781 struct inode *inode = page->mapping->host;
782 int err;
783
784 err = -EIO;
785 if (is_bad_inode(inode))
786 goto out;
787
788 err = fuse_do_readpage(file, page);
707 fuse_invalidate_atime(inode); 789 fuse_invalidate_atime(inode);
708 out: 790 out:
709 unlock_page(page); 791 unlock_page(page);
@@ -726,13 +808,9 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
726 /* 808 /*
727 * Short read means EOF. If file size is larger, truncate it 809 * Short read means EOF. If file size is larger, truncate it
728 */ 810 */
729 if (!req->out.h.error && num_read < count) { 811 if (!req->out.h.error && num_read < count)
730 loff_t pos; 812 fuse_short_read(req, inode, req->misc.read.attr_ver);
731 813
732 pos = page_offset(req->pages[0]) + num_read;
733 fuse_read_update_size(inode, pos,
734 req->misc.read.attr_ver);
735 }
736 fuse_invalidate_atime(inode); 814 fuse_invalidate_atime(inode);
737 } 815 }
738 816
@@ -922,16 +1000,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
922 return req->misc.write.out.size; 1000 return req->misc.write.out.size;
923} 1001}
924 1002
925void fuse_write_update_size(struct inode *inode, loff_t pos) 1003bool fuse_write_update_size(struct inode *inode, loff_t pos)
926{ 1004{
927 struct fuse_conn *fc = get_fuse_conn(inode); 1005 struct fuse_conn *fc = get_fuse_conn(inode);
928 struct fuse_inode *fi = get_fuse_inode(inode); 1006 struct fuse_inode *fi = get_fuse_inode(inode);
1007 bool ret = false;
929 1008
930 spin_lock(&fc->lock); 1009 spin_lock(&fc->lock);
931 fi->attr_version = ++fc->attr_version; 1010 fi->attr_version = ++fc->attr_version;
932 if (pos > inode->i_size) 1011 if (pos > inode->i_size) {
933 i_size_write(inode, pos); 1012 i_size_write(inode, pos);
1013 ret = true;
1014 }
934 spin_unlock(&fc->lock); 1015 spin_unlock(&fc->lock);
1016
1017 return ret;
935} 1018}
936 1019
937static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, 1020static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
@@ -1116,6 +1199,15 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1116 struct iov_iter i; 1199 struct iov_iter i;
1117 loff_t endbyte = 0; 1200 loff_t endbyte = 0;
1118 1201
1202 if (get_fuse_conn(inode)->writeback_cache) {
1203 /* Update size (EOF optimization) and mode (SUID clearing) */
1204 err = fuse_update_attributes(mapping->host, NULL, file, NULL);
1205 if (err)
1206 return err;
1207
1208 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1209 }
1210
1119 WARN_ON(iocb->ki_pos != pos); 1211 WARN_ON(iocb->ki_pos != pos);
1120 1212
1121 ocount = 0; 1213 ocount = 0;
@@ -1289,13 +1381,18 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
1289 1381
1290ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, 1382ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1291 unsigned long nr_segs, size_t count, loff_t *ppos, 1383 unsigned long nr_segs, size_t count, loff_t *ppos,
1292 int write) 1384 int flags)
1293{ 1385{
1386 int write = flags & FUSE_DIO_WRITE;
1387 int cuse = flags & FUSE_DIO_CUSE;
1294 struct file *file = io->file; 1388 struct file *file = io->file;
1389 struct inode *inode = file->f_mapping->host;
1295 struct fuse_file *ff = file->private_data; 1390 struct fuse_file *ff = file->private_data;
1296 struct fuse_conn *fc = ff->fc; 1391 struct fuse_conn *fc = ff->fc;
1297 size_t nmax = write ? fc->max_write : fc->max_read; 1392 size_t nmax = write ? fc->max_write : fc->max_read;
1298 loff_t pos = *ppos; 1393 loff_t pos = *ppos;
1394 pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
1395 pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
1299 ssize_t res = 0; 1396 ssize_t res = 0;
1300 struct fuse_req *req; 1397 struct fuse_req *req;
1301 struct iov_iter ii; 1398 struct iov_iter ii;
@@ -1309,6 +1406,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1309 if (IS_ERR(req)) 1406 if (IS_ERR(req))
1310 return PTR_ERR(req); 1407 return PTR_ERR(req);
1311 1408
1409 if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
1410 if (!write)
1411 mutex_lock(&inode->i_mutex);
1412 fuse_sync_writes(inode);
1413 if (!write)
1414 mutex_unlock(&inode->i_mutex);
1415 }
1416
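The idx_from/idx_to computation above converts the byte range [pos, pos + count - 1] into an inclusive page-index range before checking it against pending writeback. A tiny standalone illustration, assuming the common 4 KiB page size:

/* Byte range -> page index range, as added to fuse_direct_io(). */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        long long pos = 5000, count = 9000;          /* bytes 5000..13999 */
        unsigned long idx_from = pos >> PAGE_SHIFT;              /* 1 */
        unsigned long idx_to = (pos + count - 1) >> PAGE_SHIFT;  /* 3 */

        printf("pages %lu..%lu must not be under writeback\n",
               idx_from, idx_to);
        return 0;
}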
1312 while (count) { 1417 while (count) {
1313 size_t nres; 1418 size_t nres;
1314 fl_owner_t owner = current->files; 1419 fl_owner_t owner = current->files;
@@ -1397,7 +1502,8 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
1397 1502
1398 res = generic_write_checks(file, ppos, &count, 0); 1503 res = generic_write_checks(file, ppos, &count, 0);
1399 if (!res) 1504 if (!res)
1400 res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1); 1505 res = fuse_direct_io(io, iov, nr_segs, count, ppos,
1506 FUSE_DIO_WRITE);
1401 1507
1402 fuse_invalidate_attr(inode); 1508 fuse_invalidate_attr(inode);
1403 1509
@@ -1885,6 +1991,77 @@ out:
1885 return err; 1991 return err;
1886} 1992}
1887 1993
1994/*
1995 * It would be worth ensuring that space is reserved on disk for the write,
1996 * but how to implement that without killing performance needs more thought.
1997 */
1998static int fuse_write_begin(struct file *file, struct address_space *mapping,
1999 loff_t pos, unsigned len, unsigned flags,
2000 struct page **pagep, void **fsdata)
2001{
2002 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2003 struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode);
2004 struct page *page;
2005 loff_t fsize;
2006 int err = -ENOMEM;
2007
2008 WARN_ON(!fc->writeback_cache);
2009
2010 page = grab_cache_page_write_begin(mapping, index, flags);
2011 if (!page)
2012 goto error;
2013
2014 fuse_wait_on_page_writeback(mapping->host, page->index);
2015
2016 if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
2017 goto success;
2018 /*
2019 * Check if the start of this page comes after the end of the file, in which
2020 * case the readpage can be optimized away.
2021 */
2022 fsize = i_size_read(mapping->host);
2023 if (fsize <= (pos & PAGE_CACHE_MASK)) {
2024 size_t off = pos & ~PAGE_CACHE_MASK;
2025 if (off)
2026 zero_user_segment(page, 0, off);
2027 goto success;
2028 }
2029 err = fuse_do_readpage(file, page);
2030 if (err)
2031 goto cleanup;
2032success:
2033 *pagep = page;
2034 return 0;
2035
2036cleanup:
2037 unlock_page(page);
2038 page_cache_release(page);
2039error:
2040 return err;
2041}
2042
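fuse_write_begin() skips the read-modify-write read in two cases: the page is already up to date (or will be fully overwritten), or the page starts past EOF so its old contents are irrelevant and only the head needs zeroing. The decision logic, modelled standalone with illustrative names; fuse_write_end() below completes the trick by zeroing the unwritten tail before marking the page up to date:

/* When must a partial page write read the old contents first? */
#include <stdbool.h>
#include <assert.h>

#define PAGE_SIZE 4096u
#define PAGE_MASK (~(unsigned long long)(PAGE_SIZE - 1))

static bool need_read(bool uptodate, unsigned len,
                      unsigned long long pos, unsigned long long fsize)
{
        if (uptodate || len == PAGE_SIZE)
                return false;        /* cached, or fully overwritten */
        if (fsize <= (pos & PAGE_MASK))
                return false;        /* page starts past EOF: just zero */
        return true;
}

int main(void)
{
        assert(!need_read(true, 10, 0, 100000));     /* already cached */
        assert(!need_read(false, PAGE_SIZE, 0, 1));  /* full-page write */
        assert(!need_read(false, 10, 8192, 4096));   /* write past EOF */
        assert(need_read(false, 10, 0, 100));        /* partial, has data */
        return 0;
}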
2043static int fuse_write_end(struct file *file, struct address_space *mapping,
2044 loff_t pos, unsigned len, unsigned copied,
2045 struct page *page, void *fsdata)
2046{
2047 struct inode *inode = page->mapping->host;
2048
2049 if (!PageUptodate(page)) {
2050 /* Zero any unwritten bytes at the end of the page */
2051 size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
2052 if (endoff)
2053 zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
2054 SetPageUptodate(page);
2055 }
2056
2057 fuse_write_update_size(inode, pos + copied);
2058 set_page_dirty(page);
2059 unlock_page(page);
2060 page_cache_release(page);
2061
2062 return copied;
2063}
2064
1888static int fuse_launder_page(struct page *page) 2065static int fuse_launder_page(struct page *page)
1889{ 2066{
1890 int err = 0; 2067 int err = 0;
@@ -1946,20 +2123,9 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
1946 2123
1947static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 2124static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
1948{ 2125{
1949 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { 2126 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1950 struct inode *inode = file_inode(file); 2127 fuse_link_write_file(file);
1951 struct fuse_conn *fc = get_fuse_conn(inode); 2128
1952 struct fuse_inode *fi = get_fuse_inode(inode);
1953 struct fuse_file *ff = file->private_data;
1954 /*
1955 * file may be written through mmap, so chain it onto the
1956 * inodes's write_file list
1957 */
1958 spin_lock(&fc->lock);
1959 if (list_empty(&ff->write_entry))
1960 list_add(&ff->write_entry, &fi->write_files);
1961 spin_unlock(&fc->lock);
1962 }
1963 file_accessed(file); 2129 file_accessed(file);
1964 vma->vm_ops = &fuse_file_vm_ops; 2130 vma->vm_ops = &fuse_file_vm_ops;
1965 return 0; 2131 return 0;
@@ -2606,7 +2772,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
2606{ 2772{
2607 spin_lock(&fc->lock); 2773 spin_lock(&fc->lock);
2608 if (RB_EMPTY_NODE(&ff->polled_node)) { 2774 if (RB_EMPTY_NODE(&ff->polled_node)) {
2609 struct rb_node **link, *parent; 2775 struct rb_node **link, *uninitialized_var(parent);
2610 2776
2611 link = fuse_find_polled_node(fc, ff->kh, &parent); 2777 link = fuse_find_polled_node(fc, ff->kh, &parent);
2612 BUG_ON(*link); 2778 BUG_ON(*link);
@@ -2850,8 +3016,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
2850 goto out; 3016 goto out;
2851 3017
2852 /* we could have extended the file */ 3018 /* we could have extended the file */
2853 if (!(mode & FALLOC_FL_KEEP_SIZE)) 3019 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
2854 fuse_write_update_size(inode, offset + length); 3020 bool changed = fuse_write_update_size(inode, offset + length);
3021
3022 if (changed && fc->writeback_cache) {
3023 struct fuse_inode *fi = get_fuse_inode(inode);
3024
3025 inode->i_mtime = current_fs_time(inode->i_sb);
3026 set_bit(FUSE_I_MTIME_DIRTY, &fi->state);
3027 }
3028 }
2855 3029
2856 if (mode & FALLOC_FL_PUNCH_HOLE) 3030 if (mode & FALLOC_FL_PUNCH_HOLE)
2857 truncate_pagecache_range(inode, offset, offset + length - 1); 3031 truncate_pagecache_range(inode, offset, offset + length - 1);
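Returning a bool from fuse_write_update_size() lets fallocate dirty the mtime only when the file actually grew. A standalone sketch of that contract (the size variable stands in for i_size; locking omitted):

/* Report whether the size update actually extended the file. */
#include <stdbool.h>
#include <stdio.h>

static long long i_size;

static bool write_update_size(long long pos)
{
        if (pos > i_size) {
                i_size = pos;
                return true;
        }
        return false;
}

int main(void)
{
        i_size = 4096;
        printf("%d %d\n", write_update_size(8192), write_update_size(100));
        /* prints "1 0": only the first call should touch mtime */
        return 0;
}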
@@ -2915,6 +3089,8 @@ static const struct address_space_operations fuse_file_aops = {
2915 .set_page_dirty = __set_page_dirty_nobuffers, 3089 .set_page_dirty = __set_page_dirty_nobuffers,
2916 .bmap = fuse_bmap, 3090 .bmap = fuse_bmap,
2917 .direct_IO = fuse_direct_IO, 3091 .direct_IO = fuse_direct_IO,
3092 .write_begin = fuse_write_begin,
3093 .write_end = fuse_write_end,
2918}; 3094};
2919 3095
2920void fuse_init_file_inode(struct inode *inode) 3096void fuse_init_file_inode(struct inode *inode)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2da5db2c8bdb..a257ed8ebee6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -119,6 +119,8 @@ enum {
119 FUSE_I_INIT_RDPLUS, 119 FUSE_I_INIT_RDPLUS,
120 /** An operation changing file size is in progress */ 120 /** An operation changing file size is in progress */
121 FUSE_I_SIZE_UNSTABLE, 121 FUSE_I_SIZE_UNSTABLE,
122 /** i_mtime has been updated locally; a flush to userspace is needed */
123 FUSE_I_MTIME_DIRTY,
122}; 124};
123 125
124struct fuse_conn; 126struct fuse_conn;
@@ -480,6 +482,9 @@ struct fuse_conn {
480 /** Set if bdi is valid */ 482 /** Set if bdi is valid */
481 unsigned bdi_initialized:1; 483 unsigned bdi_initialized:1;
482 484
485 /** write-back cache policy (default is write-through) */
486 unsigned writeback_cache:1;
487
483 /* 488 /*
484 * The following bitfields are only for optimization purposes 489 * The following bitfields are only for optimization purposes
485 * and hence races in setting them will not cause malfunction 490 * and hence races in setting them will not cause malfunction
@@ -863,9 +868,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
863 868
864int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 869int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
865 bool isdir); 870 bool isdir);
871
872/**
873 * fuse_direct_io() flags
874 */
875
876/** If set, it is WRITE; otherwise it is READ */
877#define FUSE_DIO_WRITE (1 << 0)
878
879/** CUSE passes fuse_direct_io() a file whose f_mapping->host is not from FUSE */
880#define FUSE_DIO_CUSE (1 << 1)
881
866ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, 882ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
867 unsigned long nr_segs, size_t count, loff_t *ppos, 883 unsigned long nr_segs, size_t count, loff_t *ppos,
868 int write); 884 int flags);
869long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, 885long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
870 unsigned int flags); 886 unsigned int flags);
871long fuse_ioctl_common(struct file *file, unsigned int cmd, 887long fuse_ioctl_common(struct file *file, unsigned int cmd,
@@ -873,7 +889,9 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
873unsigned fuse_file_poll(struct file *file, poll_table *wait); 889unsigned fuse_file_poll(struct file *file, poll_table *wait);
874int fuse_dev_release(struct inode *inode, struct file *file); 890int fuse_dev_release(struct inode *inode, struct file *file);
875 891
876void fuse_write_update_size(struct inode *inode, loff_t pos); 892bool fuse_write_update_size(struct inode *inode, loff_t pos);
893
894int fuse_flush_mtime(struct file *file, bool nofail);
877 895
878int fuse_do_setattr(struct inode *inode, struct iattr *attr, 896int fuse_do_setattr(struct inode *inode, struct iattr *attr,
879 struct file *file); 897 struct file *file);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d468643a68b2..8d611696fcad 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -123,7 +123,7 @@ static void fuse_destroy_inode(struct inode *inode)
123 123
124static void fuse_evict_inode(struct inode *inode) 124static void fuse_evict_inode(struct inode *inode)
125{ 125{
126 truncate_inode_pages(&inode->i_data, 0); 126 truncate_inode_pages_final(&inode->i_data);
127 clear_inode(inode); 127 clear_inode(inode);
128 if (inode->i_sb->s_flags & MS_ACTIVE) { 128 if (inode->i_sb->s_flags & MS_ACTIVE) {
129 struct fuse_conn *fc = get_fuse_conn(inode); 129 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode)
135 135
136static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) 136static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
137{ 137{
138 sync_filesystem(sb);
138 if (*flags & MS_MANDLOCK) 139 if (*flags & MS_MANDLOCK)
139 return -EINVAL; 140 return -EINVAL;
140 141
@@ -170,8 +171,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
170 inode->i_blocks = attr->blocks; 171 inode->i_blocks = attr->blocks;
171 inode->i_atime.tv_sec = attr->atime; 172 inode->i_atime.tv_sec = attr->atime;
172 inode->i_atime.tv_nsec = attr->atimensec; 173 inode->i_atime.tv_nsec = attr->atimensec;
173 inode->i_mtime.tv_sec = attr->mtime; 174 /* mtime from server may be stale due to local buffered write */
174 inode->i_mtime.tv_nsec = attr->mtimensec; 175 if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
176 inode->i_mtime.tv_sec = attr->mtime;
177 inode->i_mtime.tv_nsec = attr->mtimensec;
178 }
175 inode->i_ctime.tv_sec = attr->ctime; 179 inode->i_ctime.tv_sec = attr->ctime;
176 inode->i_ctime.tv_nsec = attr->ctimensec; 180 inode->i_ctime.tv_nsec = attr->ctimensec;
177 181
@@ -197,6 +201,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
197{ 201{
198 struct fuse_conn *fc = get_fuse_conn(inode); 202 struct fuse_conn *fc = get_fuse_conn(inode);
199 struct fuse_inode *fi = get_fuse_inode(inode); 203 struct fuse_inode *fi = get_fuse_inode(inode);
204 bool is_wb = fc->writeback_cache;
200 loff_t oldsize; 205 loff_t oldsize;
201 struct timespec old_mtime; 206 struct timespec old_mtime;
202 207
@@ -211,10 +216,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
211 fuse_change_attributes_common(inode, attr, attr_valid); 216 fuse_change_attributes_common(inode, attr, attr_valid);
212 217
213 oldsize = inode->i_size; 218 oldsize = inode->i_size;
214 i_size_write(inode, attr->size); 219 /*
220 * When writeback_cache is enabled, cached writes beyond EOF extend the
221 * local i_size without keeping the userspace server in sync. So the
222 * attr->size coming from the server can be stale; we cannot trust it.
223 */
224 if (!is_wb || !S_ISREG(inode->i_mode))
225 i_size_write(inode, attr->size);
215 spin_unlock(&fc->lock); 226 spin_unlock(&fc->lock);
216 227
217 if (S_ISREG(inode->i_mode)) { 228 if (!is_wb && S_ISREG(inode->i_mode)) {
218 bool inval = false; 229 bool inval = false;
219 230
220 if (oldsize != attr->size) { 231 if (oldsize != attr->size) {
@@ -243,6 +254,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
243{ 254{
244 inode->i_mode = attr->mode & S_IFMT; 255 inode->i_mode = attr->mode & S_IFMT;
245 inode->i_size = attr->size; 256 inode->i_size = attr->size;
257 inode->i_mtime.tv_sec = attr->mtime;
258 inode->i_mtime.tv_nsec = attr->mtimensec;
246 if (S_ISREG(inode->i_mode)) { 259 if (S_ISREG(inode->i_mode)) {
247 fuse_init_common(inode); 260 fuse_init_common(inode);
248 fuse_init_file_inode(inode); 261 fuse_init_file_inode(inode);
@@ -289,7 +302,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
289 return NULL; 302 return NULL;
290 303
291 if ((inode->i_state & I_NEW)) { 304 if ((inode->i_state & I_NEW)) {
292 inode->i_flags |= S_NOATIME|S_NOCMTIME; 305 inode->i_flags |= S_NOATIME;
306 if (!fc->writeback_cache || !S_ISREG(inode->i_mode))
307 inode->i_flags |= S_NOCMTIME;
293 inode->i_generation = generation; 308 inode->i_generation = generation;
294 inode->i_data.backing_dev_info = &fc->bdi; 309 inode->i_data.backing_dev_info = &fc->bdi;
295 fuse_init_inode(inode, attr); 310 fuse_init_inode(inode, attr);
@@ -873,6 +888,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
873 } 888 }
874 if (arg->flags & FUSE_ASYNC_DIO) 889 if (arg->flags & FUSE_ASYNC_DIO)
875 fc->async_dio = 1; 890 fc->async_dio = 1;
891 if (arg->flags & FUSE_WRITEBACK_CACHE)
892 fc->writeback_cache = 1;
876 } else { 893 } else {
877 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 894 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
878 fc->no_lock = 1; 895 fc->no_lock = 1;
@@ -900,7 +917,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
900 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | 917 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
901 FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | 918 FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
902 FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | 919 FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
903 FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO; 920 FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
921 FUSE_WRITEBACK_CACHE;
904 req->in.h.opcode = FUSE_INIT; 922 req->in.h.opcode = FUSE_INIT;
905 req->in.numargs = 1; 923 req->in.numargs = 1;
906 req->in.args[0].size = sizeof(*arg); 924 req->in.args[0].size = sizeof(*arg);
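The writeback cache is negotiated at FUSE_INIT time: the kernel advertises FUSE_WRITEBACK_CACHE in the offered flag mask and enables fc->writeback_cache only if the server's reply echoes it. A minimal model of that handshake; the flag's bit value is assumed here to match the protocol header this series targets:

/* Sketch of the FUSE_INIT capability handshake for writeback caching. */
#include <stdio.h>

#define FUSE_WRITEBACK_CACHE (1 << 16)   /* bit value assumed for the sketch */

struct conn { unsigned writeback_cache : 1; };

static void process_init_reply(struct conn *fc, unsigned reply_flags)
{
        if (reply_flags & FUSE_WRITEBACK_CACHE)
                fc->writeback_cache = 1;
}

int main(void)
{
        struct conn fc = { 0 };
        unsigned offered = FUSE_WRITEBACK_CACHE /* | ... other caps */;
        unsigned reply = offered;        /* server accepts everything */

        process_init_reply(&fc, reply);
        printf("writeback_cache=%u\n", fc.writeback_cache);
        return 0;
}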
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index ba9456685f47..3088e2a38e30 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -64,18 +64,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
64 return acl; 64 return acl;
65} 65}
66 66
67static int gfs2_set_mode(struct inode *inode, umode_t mode)
68{
69 int error = 0;
70
71 if (mode != inode->i_mode) {
72 inode->i_mode = mode;
73 mark_inode_dirty(inode);
74 }
75
76 return error;
77}
78
79int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) 67int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
80{ 68{
81 int error; 69 int error;
@@ -85,8 +73,8 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
85 73
86 BUG_ON(name == NULL); 74 BUG_ON(name == NULL);
87 75
88 if (acl->a_count > GFS2_ACL_MAX_ENTRIES) 76 if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
89 return -EINVAL; 77 return -E2BIG;
90 78
91 if (type == ACL_TYPE_ACCESS) { 79 if (type == ACL_TYPE_ACCESS) {
92 umode_t mode = inode->i_mode; 80 umode_t mode = inode->i_mode;
@@ -98,9 +86,10 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
98 if (error == 0) 86 if (error == 0)
99 acl = NULL; 87 acl = NULL;
100 88
101 error = gfs2_set_mode(inode, mode); 89 if (mode != inode->i_mode) {
102 if (error) 90 inode->i_mode = mode;
103 return error; 91 mark_inode_dirty(inode);
92 }
104 } 93 }
105 94
106 if (acl) { 95 if (acl) {
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 301260c999ba..2d65ec4cd4be 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -14,7 +14,7 @@
14 14
15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" 15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" 16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
17#define GFS2_ACL_MAX_ENTRIES 25 17#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
18 18
19extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); 19extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
20extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); 20extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
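The new macro scales the ACL entry cap with the filesystem block size instead of hard-coding 25: (300 << sb_bsize_shift) >> 12 gives 300 entries at the default 4 KiB block size and proportionally fewer on smaller blocks. A worked example:

/* Worked example of the GFS2_ACL_MAX_ENTRIES(sdp) scaling. */
#include <stdio.h>

int main(void)
{
        for (unsigned shift = 9; shift <= 12; shift++)
                printf("block %5u bytes -> max %3u ACL entries\n",
                       1u << shift, (300u << shift) >> 12);
        return 0;
}
/* 512B->37, 1KiB->75, 2KiB->150, 4KiB->300 (the old fixed limit was 25). */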
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 49436fa7cd4f..ce62dcac90b6 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -21,6 +21,7 @@
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/aio.h> 23#include <linux/aio.h>
24#include <trace/events/writeback.h>
24 25
25#include "gfs2.h" 26#include "gfs2.h"
26#include "incore.h" 27#include "incore.h"
@@ -230,13 +231,11 @@ static int gfs2_writepages(struct address_space *mapping,
230static int gfs2_write_jdata_pagevec(struct address_space *mapping, 231static int gfs2_write_jdata_pagevec(struct address_space *mapping,
231 struct writeback_control *wbc, 232 struct writeback_control *wbc,
232 struct pagevec *pvec, 233 struct pagevec *pvec,
233 int nr_pages, pgoff_t end) 234 int nr_pages, pgoff_t end,
235 pgoff_t *done_index)
234{ 236{
235 struct inode *inode = mapping->host; 237 struct inode *inode = mapping->host;
236 struct gfs2_sbd *sdp = GFS2_SB(inode); 238 struct gfs2_sbd *sdp = GFS2_SB(inode);
237 loff_t i_size = i_size_read(inode);
238 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
239 unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
240 unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); 239 unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
241 int i; 240 int i;
242 int ret; 241 int ret;
@@ -248,40 +247,83 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
248 for(i = 0; i < nr_pages; i++) { 247 for(i = 0; i < nr_pages; i++) {
249 struct page *page = pvec->pages[i]; 248 struct page *page = pvec->pages[i];
250 249
250 /*
251 * At this point, the page may be truncated or
252 * invalidated (changing page->mapping to NULL), or
253 * even swizzled back from swapper_space to tmpfs file
254 * mapping. However, page->index will not change
255 * because we have a reference on the page.
256 */
257 if (page->index > end) {
258 /*
259 * can't be range_cyclic (1st pass) because
260 * end == -1 in that case.
261 */
262 ret = 1;
263 break;
264 }
265
266 *done_index = page->index;
267
251 lock_page(page); 268 lock_page(page);
252 269
253 if (unlikely(page->mapping != mapping)) { 270 if (unlikely(page->mapping != mapping)) {
271continue_unlock:
254 unlock_page(page); 272 unlock_page(page);
255 continue; 273 continue;
256 } 274 }
257 275
258 if (!wbc->range_cyclic && page->index > end) { 276 if (!PageDirty(page)) {
259 ret = 1; 277 /* someone wrote it for us */
260 unlock_page(page); 278 goto continue_unlock;
261 continue;
262 } 279 }
263 280
264 if (wbc->sync_mode != WB_SYNC_NONE) 281 if (PageWriteback(page)) {
265 wait_on_page_writeback(page); 282 if (wbc->sync_mode != WB_SYNC_NONE)
266 283 wait_on_page_writeback(page);
267 if (PageWriteback(page) || 284 else
268 !clear_page_dirty_for_io(page)) { 285 goto continue_unlock;
269 unlock_page(page);
270 continue;
271 } 286 }
272 287
273 /* Is the page fully outside i_size? (truncate in progress) */ 288 BUG_ON(PageWriteback(page));
274 if (page->index > end_index || (page->index == end_index && !offset)) { 289 if (!clear_page_dirty_for_io(page))
275 page->mapping->a_ops->invalidatepage(page, 0, 290 goto continue_unlock;
276 PAGE_CACHE_SIZE); 291
277 unlock_page(page); 292 trace_wbc_writepage(wbc, mapping->backing_dev_info);
278 continue;
279 }
280 293
281 ret = __gfs2_jdata_writepage(page, wbc); 294 ret = __gfs2_jdata_writepage(page, wbc);
295 if (unlikely(ret)) {
296 if (ret == AOP_WRITEPAGE_ACTIVATE) {
297 unlock_page(page);
298 ret = 0;
299 } else {
300
301 /*
302 * done_index is set past this page,
303 * so media errors will not choke
304 * background writeout for the entire
305 * file. This has consequences for
306 * range_cyclic semantics (ie. it may
307 * not be suitable for data integrity
308 * writeout).
309 */
310 *done_index = page->index + 1;
311 ret = 1;
312 break;
313 }
314 }
282 315
283 if (ret || (--(wbc->nr_to_write) <= 0)) 316 /*
317 * We stop writing back only if we are not doing
318 * integrity sync. In case of integrity sync we have to
319 * keep going until we have written all the pages
320 * we tagged for writeback prior to entering this loop.
321 */
322 if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) {
284 ret = 1; 323 ret = 1;
324 break;
325 }
326
285 } 327 }
286 gfs2_trans_end(sdp); 328 gfs2_trans_end(sdp);
287 return ret; 329 return ret;
@@ -306,51 +348,69 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
306 int done = 0; 348 int done = 0;
307 struct pagevec pvec; 349 struct pagevec pvec;
308 int nr_pages; 350 int nr_pages;
351 pgoff_t uninitialized_var(writeback_index);
309 pgoff_t index; 352 pgoff_t index;
310 pgoff_t end; 353 pgoff_t end;
311 int scanned = 0; 354 pgoff_t done_index;
355 int cycled;
312 int range_whole = 0; 356 int range_whole = 0;
357 int tag;
313 358
314 pagevec_init(&pvec, 0); 359 pagevec_init(&pvec, 0);
315 if (wbc->range_cyclic) { 360 if (wbc->range_cyclic) {
316 index = mapping->writeback_index; /* Start from prev offset */ 361 writeback_index = mapping->writeback_index; /* prev offset */
362 index = writeback_index;
363 if (index == 0)
364 cycled = 1;
365 else
366 cycled = 0;
317 end = -1; 367 end = -1;
318 } else { 368 } else {
319 index = wbc->range_start >> PAGE_CACHE_SHIFT; 369 index = wbc->range_start >> PAGE_CACHE_SHIFT;
320 end = wbc->range_end >> PAGE_CACHE_SHIFT; 370 end = wbc->range_end >> PAGE_CACHE_SHIFT;
321 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 371 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
322 range_whole = 1; 372 range_whole = 1;
323 scanned = 1; 373 cycled = 1; /* ignore range_cyclic tests */
324 } 374 }
375 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
376 tag = PAGECACHE_TAG_TOWRITE;
377 else
378 tag = PAGECACHE_TAG_DIRTY;
325 379
326retry: 380retry:
327 while (!done && (index <= end) && 381 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
328 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 382 tag_pages_for_writeback(mapping, index, end);
329 PAGECACHE_TAG_DIRTY, 383 done_index = index;
330 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 384 while (!done && (index <= end)) {
331 scanned = 1; 385 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
332 ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); 386 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
387 if (nr_pages == 0)
388 break;
389
390 ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index);
333 if (ret) 391 if (ret)
334 done = 1; 392 done = 1;
335 if (ret > 0) 393 if (ret > 0)
336 ret = 0; 394 ret = 0;
337
338 pagevec_release(&pvec); 395 pagevec_release(&pvec);
339 cond_resched(); 396 cond_resched();
340 } 397 }
341 398
342 if (!scanned && !done) { 399 if (!cycled && !done) {
343 /* 400 /*
401 * range_cyclic:
344 * We hit the last page and there is more work to be done: wrap 402 * We hit the last page and there is more work to be done: wrap
345 * back to the start of the file 403 * back to the start of the file
346 */ 404 */
347 scanned = 1; 405 cycled = 1;
348 index = 0; 406 index = 0;
407 end = writeback_index - 1;
349 goto retry; 408 goto retry;
350 } 409 }
351 410
352 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 411 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
353 mapping->writeback_index = index; 412 mapping->writeback_index = done_index;
413
354 return ret; 414 return ret;
355} 415}
356 416
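The rewritten loop adopts the write_cache_pages() pattern: remember where the previous pass stopped (writeback_index), scan from there towards EOF, then wrap around once to cover the pages before the starting point. The wrap-around skeleton, with page scanning and done_index tracking stubbed out:

/* Sketch of the range_cyclic two-pass scan in gfs2_write_cache_jdata(). */
#include <stdio.h>

static unsigned long writeback_index = 7;   /* where the last pass stopped */

static void write_range(unsigned long from, unsigned long to)
{
        printf("writing pages %lu..%lu\n", from, to);
}

int main(void)
{
        unsigned long index = writeback_index;
        unsigned long end = (unsigned long)-1;  /* -1 == up to EOF */
        int cycled = (index == 0);              /* nothing to wrap to */

        for (;;) {
                write_range(index, end);
                if (cycled)
                        break;
                /* second pass: wrap to cover [0, writeback_index - 1] */
                cycled = 1;
                end = writeback_index - 1;
                index = 0;
        }
        return 0;
}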
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index fe0500c0af7a..c62d4b9f51dc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1328,6 +1328,121 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
1328} 1328}
1329 1329
1330/** 1330/**
1331 * gfs2_free_journal_extents - Free cached journal bmap info
1332 * @jd: The journal
1333 *
1334 */
1335
1336void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1337{
1338 struct gfs2_journal_extent *jext;
1339
1340 while(!list_empty(&jd->extent_list)) {
1341 jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1342 list_del(&jext->list);
1343 kfree(jext);
1344 }
1345}
1346
1347/**
1348 * gfs2_add_jextent - Add or merge a new extent to extent cache
1349 * @jd: The journal descriptor
1350 * @lblock: The logical block at start of new extent
1351 * @dblock: The physical block at start of new extent
1352 * @blocks: Size of extent in fs blocks
1353 *
1354 * Returns: 0 on success or -ENOMEM
1355 */
1356
1357static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1358{
1359 struct gfs2_journal_extent *jext;
1360
1361 if (!list_empty(&jd->extent_list)) {
1362 jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1363 if ((jext->dblock + jext->blocks) == dblock) {
1364 jext->blocks += blocks;
1365 return 0;
1366 }
1367 }
1368
1369 jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1370 if (jext == NULL)
1371 return -ENOMEM;
1372 jext->dblock = dblock;
1373 jext->lblock = lblock;
1374 jext->blocks = blocks;
1375 list_add_tail(&jext->list, &jd->extent_list);
1376 jd->nr_extents++;
1377 return 0;
1378}
1379
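gfs2_add_jextent() grows the previous extent whenever the new mapping continues it physically, which is why a freshly made journal usually collapses to a single extent. The merge rule in standalone form, with a fixed-size table standing in for the kernel's linked list:

/* Standalone version of the extent-merge rule in gfs2_add_jextent(). */
#include <stdio.h>

struct jext { unsigned long long lblock, dblock, blocks; };

static void add_extent(struct jext *tbl, int *n, unsigned long long lblock,
                       unsigned long long dblock, unsigned long long blocks)
{
        if (*n > 0 && tbl[*n - 1].dblock + tbl[*n - 1].blocks == dblock) {
                tbl[*n - 1].blocks += blocks;   /* contiguous: merge */
                return;
        }
        tbl[(*n)++] = (struct jext){ lblock, dblock, blocks };
}

int main(void)
{
        struct jext tbl[8];
        int n = 0;

        add_extent(tbl, &n, 0, 1000, 16);
        add_extent(tbl, &n, 16, 1016, 8);   /* merges: 1000 + 16 == 1016 */
        add_extent(tbl, &n, 24, 5000, 4);   /* discontiguous: new extent */
        printf("%d extents; first covers %llu blocks\n", n, tbl[0].blocks);
        return 0;
}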
1380/**
1381 * gfs2_map_journal_extents - Cache journal bmap info
1382 * @sdp: The super block
1383 * @jd: The journal to map
1384 *
1385 * Create a reusable "extent" mapping from all logical
1386 * blocks to all physical blocks for the given journal. This will save
1387 * us time when writing journal blocks. Most journals will have only one
1388 * extent that maps all their logical blocks. That's because gfs2.mkfs
1389 * arranges the journal blocks sequentially to maximize performance.
1390 * So the extent would map the first block for the entire file length.
1391 * However, gfs2_jadd can happen while file activity is happening, so
1392 * those journals may not be sequential. Less likely is the case where
1393 * users have created their own journals by mounting the metafs and
1394 * laying them out by hand, but it is still possible. These journals might have
1395 * several extents.
1396 *
1397 * Returns: 0 on success, or error on failure
1398 */
1399
1400int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1401{
1402 u64 lblock = 0;
1403 u64 lblock_stop;
1404 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
1405 struct buffer_head bh;
1406 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1407 u64 size;
1408 int rc;
1409
1410 lblock_stop = i_size_read(jd->jd_inode) >> shift;
1411 size = (lblock_stop - lblock) << shift;
1412 jd->nr_extents = 0;
1413 WARN_ON(!list_empty(&jd->extent_list));
1414
1415 do {
1416 bh.b_state = 0;
1417 bh.b_blocknr = 0;
1418 bh.b_size = size;
1419 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
1420 if (rc || !buffer_mapped(&bh))
1421 goto fail;
1422 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
1423 if (rc)
1424 goto fail;
1425 size -= bh.b_size;
1426 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1427 } while(size > 0);
1428
1429 fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
1430 jd->nr_extents);
1431 return 0;
1432
1433fail:
1434 fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
1435 rc, jd->jd_jid,
1436 (unsigned long long)(i_size_read(jd->jd_inode) - size),
1437 jd->nr_extents);
1438 fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
1439 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
1440 bh.b_state, (unsigned long long)bh.b_size);
1441 gfs2_free_journal_extents(jd);
1442 return rc;
1443}
1444
1445/**
1331 * gfs2_write_alloc_required - figure out if a write will require an allocation 1446 * gfs2_write_alloc_required - figure out if a write will require an allocation
1332 * @ip: the file being written to 1447 * @ip: the file being written to
1333 * @offset: the offset to write to 1448 * @offset: the offset to write to
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 42fea03e2bd9..81ded5e2aaa2 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -55,5 +55,7 @@ extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
55extern int gfs2_file_dealloc(struct gfs2_inode *ip); 55extern int gfs2_file_dealloc(struct gfs2_inode *ip);
56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
57 unsigned int len); 57 unsigned int len);
58extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
59extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
58 60
59#endif /* __BMAP_DOT_H__ */ 61#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index fa32655449c8..1a349f9a9685 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -53,6 +53,8 @@
53 * but never before the maximum hash table size has been reached. 53 * but never before the maximum hash table size has been reached.
54 */ 54 */
55 55
56#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
57
56#include <linux/slab.h> 58#include <linux/slab.h>
57#include <linux/spinlock.h> 59#include <linux/spinlock.h>
58#include <linux/buffer_head.h> 60#include <linux/buffer_head.h>
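Defining pr_fmt before the includes makes every pr_warn()/pr_err() in the file carry the "gfs2: " prefix automatically, which is what lets the call sites below drop the hand-written "GFS2:" strings. A compilable demonstration with the printk machinery stubbed out by printf:

/* How the added pr_fmt() definition changes the log output. */
#include <stdio.h>

#define KBUILD_MODNAME "gfs2"
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#define pr_warn(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
        /* Prints "gfs2: wanted 3 got 5" -- the prefix comes for free. */
        pr_warn("wanted %u got %u\n", 3, 5);
        return 0;
}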
@@ -507,8 +509,8 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
507 goto error; 509 goto error;
508 return 0; 510 return 0;
509error: 511error:
510 printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg, 512 pr_warn("%s: %s (%s)\n",
511 first ? "first in block" : "not first in block"); 513 __func__, msg, first ? "first in block" : "not first in block");
512 return -EIO; 514 return -EIO;
513} 515}
514 516
@@ -531,8 +533,7 @@ static int gfs2_dirent_offset(const void *buf)
531 } 533 }
532 return offset; 534 return offset;
533wrong_type: 535wrong_type:
534 printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n", 536 pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type));
535 be32_to_cpu(h->mh_type));
536 return -1; 537 return -1;
537} 538}
538 539
@@ -728,7 +729,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
728 729
729 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); 730 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
730 if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { 731 if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
731 /* printk(KERN_INFO "block num=%llu\n", leaf_no); */ 732 /* pr_info("block num=%llu\n", leaf_no); */
732 error = -EIO; 733 error = -EIO;
733 } 734 }
734 735
@@ -1006,7 +1007,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
1006 len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); 1007 len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
1007 half_len = len >> 1; 1008 half_len = len >> 1;
1008 if (!half_len) { 1009 if (!half_len) {
1009 printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); 1010 pr_warn("i_depth %u lf_depth %u index %u\n",
1011 dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
1010 gfs2_consist_inode(dip); 1012 gfs2_consist_inode(dip);
1011 error = -EIO; 1013 error = -EIO;
1012 goto fail_brelse; 1014 goto fail_brelse;
@@ -1684,6 +1686,14 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1684 return 0; 1686 return 0;
1685} 1687}
1686 1688
1689static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip)
1690{
1691 u64 where = ip->i_no_addr + 1;
1692 if (ip->i_eattr == where)
1693 return 1;
1694 return 0;
1695}
1696
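The hint is 1 exactly when the inode's xattr block is the next disk block after the dinode, the layout produced by the combined dinode+xattr allocation in gfs2_create_inode() later in this diff; a reader of the dirent then knows one extra block of read-ahead will also pick up the xattrs. Standalone:

/* Sketch of the de_rahead hint computed by gfs2_inode_ra_len(). */
#include <stdio.h>

static unsigned short ra_len(unsigned long long no_addr,
                             unsigned long long eattr)
{
        /* read ahead one extra block iff the xattr block follows the dinode */
        return eattr == no_addr + 1 ? 1 : 0;
}

int main(void)
{
        printf("%u %u\n", ra_len(100, 101), ra_len(100, 0));   /* 1 0 */
        return 0;
}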
1687/** 1697/**
1688 * gfs2_dir_add - Add new filename into directory 1698 * gfs2_dir_add - Add new filename into directory
1689 * @inode: The directory inode 1699 * @inode: The directory inode
@@ -1721,6 +1731,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1721 dent = gfs2_init_dirent(inode, dent, name, bh); 1731 dent = gfs2_init_dirent(inode, dent, name, bh);
1722 gfs2_inum_out(nip, dent); 1732 gfs2_inum_out(nip, dent);
1723 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); 1733 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
1734 dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip));
1724 tv = CURRENT_TIME; 1735 tv = CURRENT_TIME;
1725 if (ip->i_diskflags & GFS2_DIF_EXHASH) { 1736 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1726 leaf = (struct gfs2_leaf *)bh->b_data; 1737 leaf = (struct gfs2_leaf *)bh->b_data;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index efc078f0ee4e..6c794085abac 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -811,6 +811,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
811 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); 811 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
812 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; 812 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
813 loff_t max_chunk_size = UINT_MAX & bsize_mask; 813 loff_t max_chunk_size = UINT_MAX & bsize_mask;
814 struct gfs2_holder gh;
815
814 next = (next + 1) << sdp->sd_sb.sb_bsize_shift; 816 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
815 817
816 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 818 /* We only support the FALLOC_FL_KEEP_SIZE mode */
@@ -831,8 +833,10 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
831 if (error) 833 if (error)
832 return error; 834 return error;
833 835
834 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); 836 mutex_lock(&inode->i_mutex);
835 error = gfs2_glock_nq(&ip->i_gh); 837
838 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
839 error = gfs2_glock_nq(&gh);
836 if (unlikely(error)) 840 if (unlikely(error))
837 goto out_uninit; 841 goto out_uninit;
838 842
@@ -900,9 +904,10 @@ out_trans_fail:
900out_qunlock: 904out_qunlock:
901 gfs2_quota_unlock(ip); 905 gfs2_quota_unlock(ip);
902out_unlock: 906out_unlock:
903 gfs2_glock_dq(&ip->i_gh); 907 gfs2_glock_dq(&gh);
904out_uninit: 908out_uninit:
905 gfs2_holder_uninit(&ip->i_gh); 909 gfs2_holder_uninit(&gh);
910 mutex_unlock(&inode->i_mutex);
906 return error; 911 return error;
907} 912}
908 913
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ca0be6c69a26..aec7f73832f0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -468,7 +470,7 @@ retry:
468 do_xmote(gl, gh, LM_ST_UNLOCKED); 470 do_xmote(gl, gh, LM_ST_UNLOCKED);
469 break; 471 break;
470 default: /* Everything else */ 472 default: /* Everything else */
471 printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state); 473 pr_err("wanted %u got %u\n", gl->gl_target, state);
472 GLOCK_BUG_ON(gl, 1); 474 GLOCK_BUG_ON(gl, 1);
473 } 475 }
474 spin_unlock(&gl->gl_spin); 476 spin_unlock(&gl->gl_spin);
@@ -542,7 +544,7 @@ __acquires(&gl->gl_spin)
542 /* lock_dlm */ 544 /* lock_dlm */
543 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); 545 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
544 if (ret) { 546 if (ret) {
545 printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret); 547 pr_err("lm_lock ret %d\n", ret);
546 GLOCK_BUG_ON(gl, 1); 548 GLOCK_BUG_ON(gl, 1);
547 } 549 }
548 } else { /* lock_nolock */ 550 } else { /* lock_nolock */
@@ -935,7 +937,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
935 vaf.fmt = fmt; 937 vaf.fmt = fmt;
936 vaf.va = &args; 938 vaf.va = &args;
937 939
938 printk(KERN_ERR " %pV", &vaf); 940 pr_err("%pV", &vaf);
939 } 941 }
940 942
941 va_end(args); 943 va_end(args);
@@ -1010,13 +1012,13 @@ do_cancel:
1010 return; 1012 return;
1011 1013
1012trap_recursive: 1014trap_recursive:
1013 printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip); 1015 pr_err("original: %pSR\n", (void *)gh2->gh_ip);
1014 printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); 1016 pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid));
1015 printk(KERN_ERR "lock type: %d req lock state : %d\n", 1017 pr_err("lock type: %d req lock state : %d\n",
1016 gh2->gh_gl->gl_name.ln_type, gh2->gh_state); 1018 gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
1017 printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip); 1019 pr_err("new: %pSR\n", (void *)gh->gh_ip);
1018 printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); 1020 pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid));
1019 printk(KERN_ERR "lock type: %d req lock state : %d\n", 1021 pr_err("lock type: %d req lock state : %d\n",
1020 gh->gh_gl->gl_name.ln_type, gh->gh_state); 1022 gh->gh_gl->gl_name.ln_type, gh->gh_state);
1021 gfs2_dump_glock(NULL, gl); 1023 gfs2_dump_glock(NULL, gl);
1022 BUG(); 1024 BUG();
@@ -1045,9 +1047,13 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
1045 1047
1046 spin_lock(&gl->gl_spin); 1048 spin_lock(&gl->gl_spin);
1047 add_to_queue(gh); 1049 add_to_queue(gh);
1048 if ((LM_FLAG_NOEXP & gh->gh_flags) && 1050 if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
1049 test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) 1051 test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
1050 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1052 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1053 gl->gl_lockref.count++;
1054 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1055 gl->gl_lockref.count--;
1056 }
1051 run_queue(gl, 1); 1057 run_queue(gl, 1);
1052 spin_unlock(&gl->gl_spin); 1058 spin_unlock(&gl->gl_spin);
1053 1059
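The unlikely() branch also fixes a lifetime issue: the glock must hold a reference on behalf of the work item it queues, and give that reference back if queue_delayed_work() reports the work was already pending. The pattern, sketched with invented names (a plain counter stands in for the kernel's lockref, and the stub plays the role of queue_delayed_work(), returning 0 when already queued):

/* Pin-before-queue pattern from gfs2_glock_nq(). */
struct obj { int refcount; };

static int queue_work_stub(struct obj *o) { (void)o; return 1; }

static void kick_worker(struct obj *o)
{
        o->refcount++;                  /* ref owned by the work item */
        if (queue_work_stub(o) == 0)
                o->refcount--;          /* already queued: give ref back */
}

int main(void)
{
        struct obj o = { .refcount = 1 };
        kick_worker(&o);
        return o.refcount == 2 ? 0 : 1;
}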
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 3bf0631b5d56..54b66809e818 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -82,6 +82,8 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
82 struct gfs2_trans tr; 82 struct gfs2_trans tr;
83 83
84 memset(&tr, 0, sizeof(tr)); 84 memset(&tr, 0, sizeof(tr));
85 INIT_LIST_HEAD(&tr.tr_buf);
86 INIT_LIST_HEAD(&tr.tr_databuf);
85 tr.tr_revokes = atomic_read(&gl->gl_ail_count); 87 tr.tr_revokes = atomic_read(&gl->gl_ail_count);
86 88
87 if (!tr.tr_revokes) 89 if (!tr.tr_revokes)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index cf0e34400f71..bdf70c18610c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -52,7 +52,7 @@ struct gfs2_log_header_host {
52 */ 52 */
53 53
54struct gfs2_log_operations { 54struct gfs2_log_operations {
55 void (*lo_before_commit) (struct gfs2_sbd *sdp); 55 void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
56 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); 56 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
57 void (*lo_before_scan) (struct gfs2_jdesc *jd, 57 void (*lo_before_scan) (struct gfs2_jdesc *jd,
58 struct gfs2_log_header_host *head, int pass); 58 struct gfs2_log_header_host *head, int pass);
@@ -371,6 +371,7 @@ enum {
371 GIF_ALLOC_FAILED = 2, 371 GIF_ALLOC_FAILED = 2,
372 GIF_SW_PAGED = 3, 372 GIF_SW_PAGED = 3,
373 GIF_ORDERED = 4, 373 GIF_ORDERED = 4,
374 GIF_FREE_VFS_INODE = 5,
374}; 375};
375 376
376struct gfs2_inode { 377struct gfs2_inode {
@@ -462,11 +463,11 @@ struct gfs2_trans {
462 unsigned int tr_blocks; 463 unsigned int tr_blocks;
463 unsigned int tr_revokes; 464 unsigned int tr_revokes;
464 unsigned int tr_reserved; 465 unsigned int tr_reserved;
466 unsigned int tr_touched:1;
467 unsigned int tr_attached:1;
465 468
466 struct gfs2_holder tr_t_gh; 469 struct gfs2_holder tr_t_gh;
467 470
468 int tr_touched;
469 int tr_attached;
470 471
471 unsigned int tr_num_buf_new; 472 unsigned int tr_num_buf_new;
472 unsigned int tr_num_databuf_new; 473 unsigned int tr_num_databuf_new;
@@ -476,6 +477,8 @@ struct gfs2_trans {
476 unsigned int tr_num_revoke_rm; 477 unsigned int tr_num_revoke_rm;
477 478
478 struct list_head tr_list; 479 struct list_head tr_list;
480 struct list_head tr_databuf;
481 struct list_head tr_buf;
479 482
480 unsigned int tr_first; 483 unsigned int tr_first;
481 struct list_head tr_ail1_list; 484 struct list_head tr_ail1_list;
@@ -483,7 +486,7 @@ struct gfs2_trans {
483}; 486};
484 487
485struct gfs2_journal_extent { 488struct gfs2_journal_extent {
486 struct list_head extent_list; 489 struct list_head list;
487 490
488 unsigned int lblock; /* First logical block */ 491 unsigned int lblock; /* First logical block */
489 u64 dblock; /* First disk block */ 492 u64 dblock; /* First disk block */
@@ -493,6 +496,7 @@ struct gfs2_journal_extent {
493struct gfs2_jdesc { 496struct gfs2_jdesc {
494 struct list_head jd_list; 497 struct list_head jd_list;
495 struct list_head extent_list; 498 struct list_head extent_list;
499 unsigned int nr_extents;
496 struct work_struct jd_work; 500 struct work_struct jd_work;
497 struct inode *jd_inode; 501 struct inode *jd_inode;
498 unsigned long jd_flags; 502 unsigned long jd_flags;
@@ -500,6 +504,15 @@ struct gfs2_jdesc {
500 unsigned int jd_jid; 504 unsigned int jd_jid;
501 unsigned int jd_blocks; 505 unsigned int jd_blocks;
502 int jd_recover_error; 506 int jd_recover_error;
507 /* Replay stuff */
508
509 unsigned int jd_found_blocks;
510 unsigned int jd_found_revokes;
511 unsigned int jd_replayed_blocks;
512
513 struct list_head jd_revoke_list;
514 unsigned int jd_replay_tail;
515
503}; 516};
504 517
505struct gfs2_statfs_change_host { 518struct gfs2_statfs_change_host {
@@ -746,19 +759,12 @@ struct gfs2_sbd {
746 759
747 struct gfs2_trans *sd_log_tr; 760 struct gfs2_trans *sd_log_tr;
748 unsigned int sd_log_blks_reserved; 761 unsigned int sd_log_blks_reserved;
749 unsigned int sd_log_commited_buf;
750 unsigned int sd_log_commited_databuf;
751 int sd_log_commited_revoke; 762 int sd_log_commited_revoke;
752 763
753 atomic_t sd_log_pinned; 764 atomic_t sd_log_pinned;
754 unsigned int sd_log_num_buf;
755 unsigned int sd_log_num_revoke; 765 unsigned int sd_log_num_revoke;
756 unsigned int sd_log_num_rg;
757 unsigned int sd_log_num_databuf;
758 766
759 struct list_head sd_log_le_buf;
760 struct list_head sd_log_le_revoke; 767 struct list_head sd_log_le_revoke;
761 struct list_head sd_log_le_databuf;
762 struct list_head sd_log_le_ordered; 768 struct list_head sd_log_le_ordered;
763 spinlock_t sd_ordered_lock; 769 spinlock_t sd_ordered_lock;
764 770
@@ -786,15 +792,6 @@ struct gfs2_sbd {
786 struct list_head sd_ail1_list; 792 struct list_head sd_ail1_list;
787 struct list_head sd_ail2_list; 793 struct list_head sd_ail2_list;
788 794
789 /* Replay stuff */
790
791 struct list_head sd_revoke_list;
792 unsigned int sd_replay_tail;
793
794 unsigned int sd_found_blocks;
795 unsigned int sd_found_revokes;
796 unsigned int sd_replayed_blocks;
797
798 /* For quiescing the filesystem */ 795 /* For quiescing the filesystem */
799 struct gfs2_holder sd_freeze_gh; 796 struct gfs2_holder sd_freeze_gh;
800 797
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5c524180c98e..28cc7bf6575a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -376,12 +376,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,
376 inode->i_gid = current_fsgid(); 376 inode->i_gid = current_fsgid();
377} 377}
378 378
379static int alloc_dinode(struct gfs2_inode *ip, u32 flags) 379static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
380{ 380{
381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
382 struct gfs2_alloc_parms ap = { .target = RES_DINODE, .aflags = flags, }; 382 struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, };
383 int error; 383 int error;
384 int dblocks = 1;
385 384
386 error = gfs2_quota_lock_check(ip); 385 error = gfs2_quota_lock_check(ip);
387 if (error) 386 if (error)
@@ -391,11 +390,11 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
391 if (error) 390 if (error)
392 goto out_quota; 391 goto out_quota;
393 392
394 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0); 393 error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0);
395 if (error) 394 if (error)
396 goto out_ipreserv; 395 goto out_ipreserv;
397 396
398 error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation); 397 error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
399 ip->i_no_formal_ino = ip->i_generation; 398 ip->i_no_formal_ino = ip->i_generation;
400 ip->i_inode.i_ino = ip->i_no_addr; 399 ip->i_inode.i_ino = ip->i_no_addr;
401 ip->i_goal = ip->i_no_addr; 400 ip->i_goal = ip->i_no_addr;
@@ -428,6 +427,33 @@ static void gfs2_init_dir(struct buffer_head *dibh,
428} 427}
429 428
430/** 429/**
430 * gfs2_init_xattr - Initialise an xattr block for a new inode
431 * @ip: The inode in question
432 *
433 * This sets up an empty xattr block for a new inode, ready to
434 * take any ACLs, LSM xattrs, etc.
435 */
436
437static void gfs2_init_xattr(struct gfs2_inode *ip)
438{
439 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
440 struct buffer_head *bh;
441 struct gfs2_ea_header *ea;
442
443 bh = gfs2_meta_new(ip->i_gl, ip->i_eattr);
444 gfs2_trans_add_meta(ip->i_gl, bh);
445 gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
446 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
447
448 ea = GFS2_EA_BH2FIRST(bh);
449 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
450 ea->ea_type = GFS2_EATYPE_UNUSED;
451 ea->ea_flags = GFS2_EAFLAG_LAST;
452
453 brelse(bh);
454}
455
456/**
431 * init_dinode - Fill in a new dinode structure 457 * init_dinode - Fill in a new dinode structure
432 * @dip: The directory this inode is being created in 458 * @dip: The directory this inode is being created in
433 * @ip: The inode 459 * @ip: The inode
@@ -545,13 +571,6 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
545 return err; 571 return err;
546} 572}
547 573
548static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
549 const struct qstr *qstr)
550{
551 return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
552 &gfs2_initxattrs, NULL);
553}
554
555/** 574/**
556 * gfs2_create_inode - Create a new inode 575 * gfs2_create_inode - Create a new inode
557 * @dir: The parent directory 576 * @dir: The parent directory
@@ -578,8 +597,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
578 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 597 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
579 struct gfs2_glock *io_gl; 598 struct gfs2_glock *io_gl;
580 struct dentry *d; 599 struct dentry *d;
581 int error; 600 int error, free_vfs_inode = 0;
582 u32 aflags = 0; 601 u32 aflags = 0;
602 unsigned blocks = 1;
583 struct gfs2_diradd da = { .bh = NULL, }; 603 struct gfs2_diradd da = { .bh = NULL, };
584 604
585 if (!name->len || name->len > GFS2_FNAMESIZE) 605 if (!name->len || name->len > GFS2_FNAMESIZE)
@@ -676,10 +696,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
676 (dip->i_diskflags & GFS2_DIF_TOPDIR)) 696 (dip->i_diskflags & GFS2_DIF_TOPDIR))
677 aflags |= GFS2_AF_ORLOV; 697 aflags |= GFS2_AF_ORLOV;
678 698
679 error = alloc_dinode(ip, aflags); 699 if (default_acl || acl)
700 blocks++;
701
702 error = alloc_dinode(ip, aflags, &blocks);
680 if (error) 703 if (error)
681 goto fail_free_inode; 704 goto fail_free_inode;
682 705
706 gfs2_set_inode_blocks(inode, blocks);
707
683 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 708 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
684 if (error) 709 if (error)
685 goto fail_free_inode; 710 goto fail_free_inode;
@@ -689,10 +714,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
689 if (error) 714 if (error)
690 goto fail_free_inode; 715 goto fail_free_inode;
691 716
692 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 717 error = gfs2_trans_begin(sdp, blocks, 0);
693 if (error) 718 if (error)
694 goto fail_gunlock2; 719 goto fail_gunlock2;
695 720
721 if (blocks > 1) {
722 ip->i_eattr = ip->i_no_addr + 1;
723 gfs2_init_xattr(ip);
724 }
696 init_dinode(dip, ip, symname); 725 init_dinode(dip, ip, symname);
697 gfs2_trans_end(sdp); 726 gfs2_trans_end(sdp);
698 727
@@ -722,7 +751,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
722 if (error) 751 if (error)
723 goto fail_gunlock3; 752 goto fail_gunlock3;
724 753
725 error = gfs2_security_init(dip, ip, name); 754 error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
755 &gfs2_initxattrs, NULL);
726 if (error) 756 if (error)
727 goto fail_gunlock3; 757 goto fail_gunlock3;
728 758
@@ -758,15 +788,16 @@ fail_free_acls:
758 if (acl) 788 if (acl)
759 posix_acl_release(acl); 789 posix_acl_release(acl);
760fail_free_vfs_inode: 790fail_free_vfs_inode:
761 free_inode_nonrcu(inode); 791 free_vfs_inode = 1;
762 inode = NULL;
763fail_gunlock: 792fail_gunlock:
764 gfs2_dir_no_add(&da); 793 gfs2_dir_no_add(&da);
765 gfs2_glock_dq_uninit(ghs); 794 gfs2_glock_dq_uninit(ghs);
766 if (inode && !IS_ERR(inode)) { 795 if (inode && !IS_ERR(inode)) {
767 clear_nlink(inode); 796 clear_nlink(inode);
768 mark_inode_dirty(inode); 797 if (!free_vfs_inode)
769 set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); 798 mark_inode_dirty(inode);
799 set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
800 &GFS2_I(inode)->i_flags);
770 iput(inode); 801 iput(inode);
771 } 802 }
772fail: 803fail:
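
The reworked error path above replaces the immediate free_inode_nonrcu() call with a deferred scheme: the fail_free_vfs_inode label now only records that the half-built VFS inode must be freed, and the common fail_gunlock exit routes every case through iput(). A minimal sketch of the resulting pattern (names as in the hunk, surrounding code assumed):

    int error, free_vfs_inode = 0;
    /* ... */
    fail_free_vfs_inode:
        free_vfs_inode = 1;                  /* nothing on disk yet: just free */
    fail_gunlock:
        gfs2_dir_no_add(&da);
        gfs2_glock_dq_uninit(ghs);
        if (inode && !IS_ERR(inode)) {
            clear_nlink(inode);
            if (!free_vfs_inode)             /* only a real on-disk inode is dirtied */
                mark_inode_dirty(inode);
            set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
                    &GFS2_I(inode)->i_flags);
            iput(inode);                     /* eviction inspects the flag */
        }

The flag is consumed in super.c; see the gfs2_drop_inode and gfs2_evict_inode hunks further down.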
@@ -1263,6 +1294,10 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1263 } 1294 }
1264 1295
1265 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); 1296 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
1297 if (!tmp) {
1298 error = -ENOENT;
1299 break;
1300 }
1266 if (IS_ERR(tmp)) { 1301 if (IS_ERR(tmp)) {
1267 error = PTR_ERR(tmp); 1302 error = PTR_ERR(tmp);
1268 break; 1303 break;
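
gfs2_lookupi() follows the three-way return convention that the added check now covers in full: NULL when the entry does not exist, an ERR_PTR()-encoded errno on failure, and a valid inode otherwise. Callers therefore need all three tests, roughly:

    struct inode *tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
    if (!tmp)                    /* ".." entry missing: map to -ENOENT */
        return -ENOENT;
    if (IS_ERR(tmp))             /* hard lookup failure: propagate the errno */
        return PTR_ERR(tmp);
    /* ... use tmp, then drop the reference with iput(tmp) ... */

Without the NULL check, a missing ".." entry would have been dereferenced further down the loop.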
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 2a6ba06bee6f..c1eb555dc588 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/fs.h> 12#include <linux/fs.h>
11#include <linux/dlm.h> 13#include <linux/dlm.h>
12#include <linux/slab.h> 14#include <linux/slab.h>
@@ -176,7 +178,7 @@ static void gdlm_bast(void *arg, int mode)
176 gfs2_glock_cb(gl, LM_ST_SHARED); 178 gfs2_glock_cb(gl, LM_ST_SHARED);
177 break; 179 break;
178 default: 180 default:
179 printk(KERN_ERR "unknown bast mode %d", mode); 181 pr_err("unknown bast mode %d\n", mode);
180 BUG(); 182 BUG();
181 } 183 }
182} 184}
@@ -195,7 +197,7 @@ static int make_mode(const unsigned int lmstate)
195 case LM_ST_SHARED: 197 case LM_ST_SHARED:
196 return DLM_LOCK_PR; 198 return DLM_LOCK_PR;
197 } 199 }
198 printk(KERN_ERR "unknown LM state %d", lmstate); 200 pr_err("unknown LM state %d\n", lmstate);
199 BUG(); 201 BUG();
200 return -1; 202 return -1;
201} 203}
@@ -308,7 +310,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
308 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, 310 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
309 NULL, gl); 311 NULL, gl);
310 if (error) { 312 if (error) {
311 printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n", 313 pr_err("gdlm_unlock %x,%llx err=%d\n",
312 gl->gl_name.ln_type, 314 gl->gl_name.ln_type,
313 (unsigned long long)gl->gl_name.ln_number, error); 315 (unsigned long long)gl->gl_name.ln_number, error);
314 return; 316 return;
@@ -1102,7 +1104,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
1102 } 1104 }
1103 1105
1104 if (ls->ls_recover_submit[jid]) { 1106 if (ls->ls_recover_submit[jid]) {
1105 fs_info(sdp, "recover_slot jid %d gen %u prev %u", 1107 fs_info(sdp, "recover_slot jid %d gen %u prev %u\n",
1106 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); 1108 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
1107 } 1109 }
1108 ls->ls_recover_submit[jid] = ls->ls_recover_block; 1110 ls->ls_recover_submit[jid] = ls->ls_recover_block;
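
The pr_fmt() definition at the top of the file is what lets every conversion below drop its hand-rolled "GFS2: " prefix: printk.h expands pr_err(fmt, ...) to printk(KERN_ERR pr_fmt(fmt), ...), so the define must appear before the first direct or indirect include of linux/printk.h. The net effect:

    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

    pr_err("unknown bast mode %d\n", mode);
    /* logs: "gfs2: unknown bast mode 3" at KERN_ERR level */

The conversions also add the previously missing trailing newlines, which keeps these messages from being merged with whatever is printed next.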
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 9dcb9777a5f8..4a14d504ef83 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/freezer.h> 19#include <linux/freezer.h>
20#include <linux/bio.h> 20#include <linux/bio.h>
21#include <linux/blkdev.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/list_sort.h> 23#include <linux/list_sort.h>
23 24
@@ -145,8 +146,10 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
145{ 146{
146 struct list_head *head = &sdp->sd_ail1_list; 147 struct list_head *head = &sdp->sd_ail1_list;
147 struct gfs2_trans *tr; 148 struct gfs2_trans *tr;
149 struct blk_plug plug;
148 150
149 trace_gfs2_ail_flush(sdp, wbc, 1); 151 trace_gfs2_ail_flush(sdp, wbc, 1);
152 blk_start_plug(&plug);
150 spin_lock(&sdp->sd_ail_lock); 153 spin_lock(&sdp->sd_ail_lock);
151restart: 154restart:
152 list_for_each_entry_reverse(tr, head, tr_list) { 155 list_for_each_entry_reverse(tr, head, tr_list) {
@@ -156,6 +159,7 @@ restart:
156 goto restart; 159 goto restart;
157 } 160 }
158 spin_unlock(&sdp->sd_ail_lock); 161 spin_unlock(&sdp->sd_ail_lock);
162 blk_finish_plug(&plug);
159 trace_gfs2_ail_flush(sdp, wbc, 0); 163 trace_gfs2_ail_flush(sdp, wbc, 0);
160} 164}
161 165
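
The blk_plug added around the AIL1 walk batches the many small buffer writes issued inside the loop: plugged requests are queued per-task and dispatched, merged where possible, when the plug is released (or if the task sleeps). The bracketing idiom:

    struct blk_plug plug;

    blk_start_plug(&plug);
    /* ... submit a batch of small bios / buffer writes ... */
    blk_finish_plug(&plug);    /* flush the plugged requests to the queue */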
@@ -410,24 +414,22 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer
410static unsigned int calc_reserved(struct gfs2_sbd *sdp) 414static unsigned int calc_reserved(struct gfs2_sbd *sdp)
411{ 415{
412 unsigned int reserved = 0; 416 unsigned int reserved = 0;
413 unsigned int mbuf_limit, metabufhdrs_needed; 417 unsigned int mbuf;
414 unsigned int dbuf_limit, databufhdrs_needed; 418 unsigned int dbuf;
415 unsigned int revokes = 0; 419 struct gfs2_trans *tr = sdp->sd_log_tr;
416 420
417 mbuf_limit = buf_limit(sdp); 421 if (tr) {
418 metabufhdrs_needed = (sdp->sd_log_commited_buf + 422 mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
419 (mbuf_limit - 1)) / mbuf_limit; 423 dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
420 dbuf_limit = databuf_limit(sdp); 424 reserved = mbuf + dbuf;
421 databufhdrs_needed = (sdp->sd_log_commited_databuf + 425 /* Account for header blocks */
422 (dbuf_limit - 1)) / dbuf_limit; 426 reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp));
427 reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp));
428 }
423 429
424 if (sdp->sd_log_commited_revoke > 0) 430 if (sdp->sd_log_commited_revoke > 0)
425 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, 431 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
426 sizeof(u64)); 432 sizeof(u64));
427
428 reserved = sdp->sd_log_commited_buf + metabufhdrs_needed +
429 sdp->sd_log_commited_databuf + databufhdrs_needed +
430 revokes;
431 /* One for the overall header */ 433 /* One for the overall header */
432 if (reserved) 434 if (reserved)
433 reserved++; 435 reserved++;
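
calc_reserved() now derives the reservation directly from the cached transaction rather than from the separate sd_log_commited_* counters: payload blocks, plus one log descriptor header per buf_limit() or databuf_limit() worth of payload, plus revoke blocks, plus one overall header. A worked example (values hypothetical; 503 matches the 4k-block comment in lops.c, and databuf_limit() is roughly half that):

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    /* mbuf = 1000 new metadata blocks, dbuf = 10 new data blocks */
    unsigned int reserved = 1000 + 10;        /* payload */
    reserved += DIV_ROUND_UP(1000, 503);      /* 2 metadata descriptor headers */
    reserved += DIV_ROUND_UP(10, 251);        /* 1 data descriptor header */
    /* + revoke blocks, + 1 overall log header when reserved != 0 */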
@@ -682,36 +684,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
682 } 684 }
683 trace_gfs2_log_flush(sdp, 1); 685 trace_gfs2_log_flush(sdp, 1);
684 686
687 sdp->sd_log_flush_head = sdp->sd_log_head;
688 sdp->sd_log_flush_wrapped = 0;
685 tr = sdp->sd_log_tr; 689 tr = sdp->sd_log_tr;
686 if (tr) { 690 if (tr) {
687 sdp->sd_log_tr = NULL; 691 sdp->sd_log_tr = NULL;
688 INIT_LIST_HEAD(&tr->tr_ail1_list); 692 INIT_LIST_HEAD(&tr->tr_ail1_list);
689 INIT_LIST_HEAD(&tr->tr_ail2_list); 693 INIT_LIST_HEAD(&tr->tr_ail2_list);
694 tr->tr_first = sdp->sd_log_flush_head;
690 } 695 }
691 696
692 if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
693 printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
694 sdp->sd_log_commited_buf);
695 gfs2_assert_withdraw(sdp, 0);
696 }
697 if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) {
698 printk(KERN_INFO "GFS2: log databuf %u %u\n",
699 sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf);
700 gfs2_assert_withdraw(sdp, 0);
701 }
702 gfs2_assert_withdraw(sdp, 697 gfs2_assert_withdraw(sdp,
703 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); 698 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
704 699
705 sdp->sd_log_flush_head = sdp->sd_log_head;
706 sdp->sd_log_flush_wrapped = 0;
707 if (tr)
708 tr->tr_first = sdp->sd_log_flush_head;
709
710 gfs2_ordered_write(sdp); 700 gfs2_ordered_write(sdp);
711 lops_before_commit(sdp); 701 lops_before_commit(sdp, tr);
712 gfs2_log_flush_bio(sdp, WRITE); 702 gfs2_log_flush_bio(sdp, WRITE);
713 703
714 if (sdp->sd_log_head != sdp->sd_log_flush_head) { 704 if (sdp->sd_log_head != sdp->sd_log_flush_head) {
705 log_flush_wait(sdp);
715 log_write_header(sdp, 0); 706 log_write_header(sdp, 0);
716 } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 707 } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
717 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ 708 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
@@ -723,8 +714,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
723 gfs2_log_lock(sdp); 714 gfs2_log_lock(sdp);
724 sdp->sd_log_head = sdp->sd_log_flush_head; 715 sdp->sd_log_head = sdp->sd_log_flush_head;
725 sdp->sd_log_blks_reserved = 0; 716 sdp->sd_log_blks_reserved = 0;
726 sdp->sd_log_commited_buf = 0;
727 sdp->sd_log_commited_databuf = 0;
728 sdp->sd_log_commited_revoke = 0; 717 sdp->sd_log_commited_revoke = 0;
729 718
730 spin_lock(&sdp->sd_ail_lock); 719 spin_lock(&sdp->sd_ail_lock);
@@ -740,34 +729,54 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
740 kfree(tr); 729 kfree(tr);
741} 730}
742 731
732/**
733 * gfs2_merge_trans - Merge a new transaction into a cached transaction
734 * @old: Original transaction to be expanded
735 * @new: New transaction to be merged
736 */
737
738static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
739{
740 WARN_ON_ONCE(old->tr_attached != 1);
741
742 old->tr_num_buf_new += new->tr_num_buf_new;
743 old->tr_num_databuf_new += new->tr_num_databuf_new;
744 old->tr_num_buf_rm += new->tr_num_buf_rm;
745 old->tr_num_databuf_rm += new->tr_num_databuf_rm;
746 old->tr_num_revoke += new->tr_num_revoke;
747 old->tr_num_revoke_rm += new->tr_num_revoke_rm;
748
749 list_splice_tail_init(&new->tr_databuf, &old->tr_databuf);
750 list_splice_tail_init(&new->tr_buf, &old->tr_buf);
751}
752
743static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 753static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
744{ 754{
745 unsigned int reserved; 755 unsigned int reserved;
746 unsigned int unused; 756 unsigned int unused;
757 unsigned int maxres;
747 758
748 gfs2_log_lock(sdp); 759 gfs2_log_lock(sdp);
749 760
750 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; 761 if (sdp->sd_log_tr) {
751 sdp->sd_log_commited_databuf += tr->tr_num_databuf_new - 762 gfs2_merge_trans(sdp->sd_log_tr, tr);
752 tr->tr_num_databuf_rm; 763 } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
753 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || 764 gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
754 (((int)sdp->sd_log_commited_databuf) >= 0)); 765 sdp->sd_log_tr = tr;
766 tr->tr_attached = 1;
767 }
768
755 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 769 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
756 reserved = calc_reserved(sdp); 770 reserved = calc_reserved(sdp);
757 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 771 maxres = sdp->sd_log_blks_reserved + tr->tr_reserved;
758 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 772 gfs2_assert_withdraw(sdp, maxres >= reserved);
773 unused = maxres - reserved;
759 atomic_add(unused, &sdp->sd_log_blks_free); 774 atomic_add(unused, &sdp->sd_log_blks_free);
760 trace_gfs2_log_blocks(sdp, unused); 775 trace_gfs2_log_blocks(sdp, unused);
761 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= 776 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
762 sdp->sd_jdesc->jd_blocks); 777 sdp->sd_jdesc->jd_blocks);
763 sdp->sd_log_blks_reserved = reserved; 778 sdp->sd_log_blks_reserved = reserved;
764 779
765 if (sdp->sd_log_tr == NULL &&
766 (tr->tr_num_buf_new || tr->tr_num_databuf_new)) {
767 gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
768 sdp->sd_log_tr = tr;
769 tr->tr_attached = 1;
770 }
771 gfs2_log_unlock(sdp); 780 gfs2_log_unlock(sdp);
772} 781}
773 782
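
gfs2_merge_trans() folds an incoming transaction into the one already cached on the superblock: the new/removed counters are summed, and both buffer lists are spliced across. list_splice_tail_init() moves every entry onto the tail of the destination and leaves the source list re-initialised (empty), so the merged-away transaction can be freed without dangling list_heads. The primitive in isolation:

    LIST_HEAD(old_bufs);
    LIST_HEAD(new_bufs);
    /* ... entries queued on new_bufs ... */
    list_splice_tail_init(&new_bufs, &old_bufs);
    /* new_bufs is now empty; old_bufs holds both runs in order */

This is also what makes the WARN_ON_ONCE(old->tr_attached != 1) meaningful: only the attached, cached transaction may absorb others.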
@@ -807,10 +816,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
807 down_write(&sdp->sd_log_flush_lock); 816 down_write(&sdp->sd_log_flush_lock);
808 817
809 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); 818 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
810 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
811 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 819 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
812 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
813 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
814 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); 820 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
815 821
816 sdp->sd_log_flush_head = sdp->sd_log_head; 822 sdp->sd_log_flush_head = sdp->sd_log_head;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 76693793cedd..a294d8d8bcd4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -146,8 +146,8 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
146 struct gfs2_journal_extent *je; 146 struct gfs2_journal_extent *je;
147 u64 block; 147 u64 block;
148 148
149 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { 149 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) {
150 if (lbn >= je->lblock && lbn < je->lblock + je->blocks) { 150 if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) {
151 block = je->dblock + lbn - je->lblock; 151 block = je->dblock + lbn - je->lblock;
152 gfs2_log_incr_head(sdp); 152 gfs2_log_incr_head(sdp);
153 return block; 153 return block;
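
Each gfs2_journal_extent maps a contiguous run of logical journal blocks to disk, so once the covering extent is found the translation is a constant offset within the run. A standalone model of the lookup:

    struct jext {
        unsigned long long dblock;   /* first disk block of the run */
        unsigned int lblock;         /* first logical block of the run */
        unsigned int blocks;         /* run length */
    };

    /* returns 0 when lbn is outside the extent (hypothetical helper) */
    static unsigned long long jext_map(const struct jext *je, unsigned int lbn)
    {
        if (lbn >= je->lblock && lbn < je->lblock + je->blocks)
            return je->dblock + (lbn - je->lblock);
        return 0;
    }

The member rename (extent_list to list) tracks the journal-extent rework visible in the ops_fstype.c hunks below.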
@@ -491,44 +491,40 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
491 gfs2_log_unlock(sdp); 491 gfs2_log_unlock(sdp);
492} 492}
493 493
494static void buf_lo_before_commit(struct gfs2_sbd *sdp) 494static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
495{ 495{
496 unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */ 496 unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */
497 497 unsigned int nbuf;
498 gfs2_before_commit(sdp, limit, sdp->sd_log_num_buf, 498 if (tr == NULL)
499 &sdp->sd_log_le_buf, 0); 499 return;
500 nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
501 gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0);
500} 502}
501 503
502static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 504static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
503{ 505{
504 struct list_head *head = &sdp->sd_log_le_buf; 506 struct list_head *head;
505 struct gfs2_bufdata *bd; 507 struct gfs2_bufdata *bd;
506 508
507 if (tr == NULL) { 509 if (tr == NULL)
508 gfs2_assert(sdp, list_empty(head));
509 return; 510 return;
510 }
511 511
512 head = &tr->tr_buf;
512 while (!list_empty(head)) { 513 while (!list_empty(head)) {
513 bd = list_entry(head->next, struct gfs2_bufdata, bd_list); 514 bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
514 list_del_init(&bd->bd_list); 515 list_del_init(&bd->bd_list);
515 sdp->sd_log_num_buf--;
516
517 gfs2_unpin(sdp, bd->bd_bh, tr); 516 gfs2_unpin(sdp, bd->bd_bh, tr);
518 } 517 }
519 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
520} 518}
521 519
522static void buf_lo_before_scan(struct gfs2_jdesc *jd, 520static void buf_lo_before_scan(struct gfs2_jdesc *jd,
523 struct gfs2_log_header_host *head, int pass) 521 struct gfs2_log_header_host *head, int pass)
524{ 522{
525 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
526
527 if (pass != 0) 523 if (pass != 0)
528 return; 524 return;
529 525
530 sdp->sd_found_blocks = 0; 526 jd->jd_found_blocks = 0;
531 sdp->sd_replayed_blocks = 0; 527 jd->jd_replayed_blocks = 0;
532} 528}
533 529
534static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 530static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
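
Two state migrations run through this file. Pinned buffers now live on per-transaction lists (tr->tr_buf, tr->tr_databuf) instead of superblock-wide sd_log_le_* lists with shadow counters, which is what allows the counter/list consistency asserts to go. And replay bookkeeping moves from gfs2_sbd into the journal descriptor, so each journal carries its own counters during recovery. A sketch of the per-journal fields this implies (shape assumed from the jd_* names used here and in recovery.c):

    /* hypothetical excerpt of struct gfs2_jdesc after this patch */
    struct list_head jd_revoke_list;      /* revokes collected during replay */
    unsigned int     jd_found_blocks;
    unsigned int     jd_replayed_blocks;
    unsigned int     jd_found_revokes;
    unsigned int     jd_replay_tail;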
@@ -551,9 +547,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
551 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { 547 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
552 blkno = be64_to_cpu(*ptr++); 548 blkno = be64_to_cpu(*ptr++);
553 549
554 sdp->sd_found_blocks++; 550 jd->jd_found_blocks++;
555 551
556 if (gfs2_revoke_check(sdp, blkno, start)) 552 if (gfs2_revoke_check(jd, blkno, start))
557 continue; 553 continue;
558 554
559 error = gfs2_replay_read_block(jd, start, &bh_log); 555 error = gfs2_replay_read_block(jd, start, &bh_log);
@@ -574,7 +570,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
574 if (error) 570 if (error)
575 break; 571 break;
576 572
577 sdp->sd_replayed_blocks++; 573 jd->jd_replayed_blocks++;
578 } 574 }
579 575
580 return error; 576 return error;
@@ -617,10 +613,10 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
617 gfs2_meta_sync(ip->i_gl); 613 gfs2_meta_sync(ip->i_gl);
618 614
619 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", 615 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
620 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); 616 jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
621} 617}
622 618
623static void revoke_lo_before_commit(struct gfs2_sbd *sdp) 619static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
624{ 620{
625 struct gfs2_meta_header *mh; 621 struct gfs2_meta_header *mh;
626 unsigned int offset; 622 unsigned int offset;
@@ -679,13 +675,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
679static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 675static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
680 struct gfs2_log_header_host *head, int pass) 676 struct gfs2_log_header_host *head, int pass)
681{ 677{
682 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
683
684 if (pass != 0) 678 if (pass != 0)
685 return; 679 return;
686 680
687 sdp->sd_found_revokes = 0; 681 jd->jd_found_revokes = 0;
688 sdp->sd_replay_tail = head->lh_tail; 682 jd->jd_replay_tail = head->lh_tail;
689} 683}
690 684
691static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 685static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -717,13 +711,13 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
717 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { 711 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
718 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); 712 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
719 713
720 error = gfs2_revoke_add(sdp, blkno, start); 714 error = gfs2_revoke_add(jd, blkno, start);
721 if (error < 0) { 715 if (error < 0) {
722 brelse(bh); 716 brelse(bh);
723 return error; 717 return error;
724 } 718 }
725 else if (error) 719 else if (error)
726 sdp->sd_found_revokes++; 720 jd->jd_found_revokes++;
727 721
728 if (!--revokes) 722 if (!--revokes)
729 break; 723 break;
@@ -743,16 +737,16 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
743 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 737 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
744 738
745 if (error) { 739 if (error) {
746 gfs2_revoke_clean(sdp); 740 gfs2_revoke_clean(jd);
747 return; 741 return;
748 } 742 }
749 if (pass != 1) 743 if (pass != 1)
750 return; 744 return;
751 745
752 fs_info(sdp, "jid=%u: Found %u revoke tags\n", 746 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
753 jd->jd_jid, sdp->sd_found_revokes); 747 jd->jd_jid, jd->jd_found_revokes);
754 748
755 gfs2_revoke_clean(sdp); 749 gfs2_revoke_clean(jd);
756} 750}
757 751
758/** 752/**
@@ -760,12 +754,14 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
760 * 754 *
761 */ 755 */
762 756
763static void databuf_lo_before_commit(struct gfs2_sbd *sdp) 757static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
764{ 758{
765 unsigned int limit = buf_limit(sdp) / 2; 759 unsigned int limit = databuf_limit(sdp);
766 760 unsigned int nbuf;
767 gfs2_before_commit(sdp, limit, sdp->sd_log_num_databuf, 761 if (tr == NULL)
768 &sdp->sd_log_le_databuf, 1); 762 return;
763 nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
764 gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1);
769} 765}
770 766
771static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 767static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -789,9 +785,9 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
789 blkno = be64_to_cpu(*ptr++); 785 blkno = be64_to_cpu(*ptr++);
790 esc = be64_to_cpu(*ptr++); 786 esc = be64_to_cpu(*ptr++);
791 787
792 sdp->sd_found_blocks++; 788 jd->jd_found_blocks++;
793 789
794 if (gfs2_revoke_check(sdp, blkno, start)) 790 if (gfs2_revoke_check(jd, blkno, start))
795 continue; 791 continue;
796 792
797 error = gfs2_replay_read_block(jd, start, &bh_log); 793 error = gfs2_replay_read_block(jd, start, &bh_log);
@@ -811,7 +807,7 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
811 brelse(bh_log); 807 brelse(bh_log);
812 brelse(bh_ip); 808 brelse(bh_ip);
813 809
814 sdp->sd_replayed_blocks++; 810 jd->jd_replayed_blocks++;
815 } 811 }
816 812
817 return error; 813 return error;
@@ -835,26 +831,23 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
835 gfs2_meta_sync(ip->i_gl); 831 gfs2_meta_sync(ip->i_gl);
836 832
837 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", 833 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
838 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); 834 jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
839} 835}
840 836
841static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 837static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
842{ 838{
843 struct list_head *head = &sdp->sd_log_le_databuf; 839 struct list_head *head;
844 struct gfs2_bufdata *bd; 840 struct gfs2_bufdata *bd;
845 841
846 if (tr == NULL) { 842 if (tr == NULL)
847 gfs2_assert(sdp, list_empty(head));
848 return; 843 return;
849 }
850 844
845 head = &tr->tr_databuf;
851 while (!list_empty(head)) { 846 while (!list_empty(head)) {
852 bd = list_entry(head->next, struct gfs2_bufdata, bd_list); 847 bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
853 list_del_init(&bd->bd_list); 848 list_del_init(&bd->bd_list);
854 sdp->sd_log_num_databuf--;
855 gfs2_unpin(sdp, bd->bd_bh, tr); 849 gfs2_unpin(sdp, bd->bd_bh, tr);
856 } 850 }
857 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
858} 851}
859 852
860 853
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 9ca2e6438419..a65a7ba32ffd 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -46,12 +46,13 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
46 return limit; 46 return limit;
47} 47}
48 48
49static inline void lops_before_commit(struct gfs2_sbd *sdp) 49static inline void lops_before_commit(struct gfs2_sbd *sdp,
50 struct gfs2_trans *tr)
50{ 51{
51 int x; 52 int x;
52 for (x = 0; gfs2_log_ops[x]; x++) 53 for (x = 0; gfs2_log_ops[x]; x++)
53 if (gfs2_log_ops[x]->lo_before_commit) 54 if (gfs2_log_ops[x]->lo_before_commit)
54 gfs2_log_ops[x]->lo_before_commit(sdp); 55 gfs2_log_ops[x]->lo_before_commit(sdp, tr);
55} 56}
56 57
57static inline void lops_after_commit(struct gfs2_sbd *sdp, 58static inline void lops_after_commit(struct gfs2_sbd *sdp,
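
lops_before_commit() dispatches through the NULL-terminated gfs2_log_ops vtable array; the new parameter simply threads the active transaction into each optional hook. The dispatch idiom it relies on:

    struct gfs2_log_operations {
        void (*lo_before_commit)(struct gfs2_sbd *sdp, struct gfs2_trans *tr);
        /* ... lo_after_commit, scan hooks, ... */
    };

    extern const struct gfs2_log_operations *gfs2_log_ops[];  /* NULL-terminated */

    for (x = 0; gfs2_log_ops[x]; x++)
        if (gfs2_log_ops[x]->lo_before_commit)
            gfs2_log_ops[x]->lo_before_commit(sdp, tr);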
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c272e73063de..82b6ac829656 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/slab.h> 12#include <linux/slab.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -165,7 +167,7 @@ static int __init init_gfs2_fs(void)
165 167
166 gfs2_register_debugfs(); 168 gfs2_register_debugfs();
167 169
168 printk("GFS2 installed\n"); 170 pr_info("GFS2 installed\n");
169 171
170 return 0; 172 return 0;
171 173
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index c7f24690ed05..2cf09b63a6b4 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -97,6 +97,11 @@ const struct address_space_operations gfs2_meta_aops = {
97 .releasepage = gfs2_releasepage, 97 .releasepage = gfs2_releasepage,
98}; 98};
99 99
100const struct address_space_operations gfs2_rgrp_aops = {
101 .writepage = gfs2_aspace_writepage,
102 .releasepage = gfs2_releasepage,
103};
104
100/** 105/**
101 * gfs2_getbuf - Get a buffer with a given address space 106 * gfs2_getbuf - Get a buffer with a given address space
102 * @gl: the glock 107 * @gl: the glock
@@ -267,15 +272,10 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
267 trace_gfs2_pin(bd, 0); 272 trace_gfs2_pin(bd, 0);
268 atomic_dec(&sdp->sd_log_pinned); 273 atomic_dec(&sdp->sd_log_pinned);
269 list_del_init(&bd->bd_list); 274 list_del_init(&bd->bd_list);
270 if (meta) { 275 if (meta)
271 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
272 sdp->sd_log_num_buf--;
273 tr->tr_num_buf_rm++; 276 tr->tr_num_buf_rm++;
274 } else { 277 else
275 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf);
276 sdp->sd_log_num_databuf--;
277 tr->tr_num_databuf_rm++; 278 tr->tr_num_databuf_rm++;
278 }
279 tr->tr_touched = 1; 279 tr->tr_touched = 1;
280 was_pinned = 1; 280 was_pinned = 1;
281 brelse(bh); 281 brelse(bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 4823b934208a..ac5d8027d335 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -38,12 +38,15 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
38} 38}
39 39
40extern const struct address_space_operations gfs2_meta_aops; 40extern const struct address_space_operations gfs2_meta_aops;
41extern const struct address_space_operations gfs2_rgrp_aops;
41 42
42static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) 43static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
43{ 44{
44 struct inode *inode = mapping->host; 45 struct inode *inode = mapping->host;
45 if (mapping->a_ops == &gfs2_meta_aops) 46 if (mapping->a_ops == &gfs2_meta_aops)
46 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; 47 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
48 else if (mapping->a_ops == &gfs2_rgrp_aops)
49 return container_of(mapping, struct gfs2_sbd, sd_aspace);
47 else 50 else
48 return inode->i_sb->s_fs_info; 51 return inode->i_sb->s_fs_info;
49} 52}
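
gfs2_mapping2sbd() now distinguishes three owners of an address_space: the per-glock metadata mapping (allocated directly after its glock, hence the pointer arithmetic), the superblock-embedded sd_aspace that the new gfs2_rgrp_aops serves, and ordinary inode mappings. The sd_aspace case is plain container_of(): from a pointer to a member, walk back to the enclosing structure. Minimal model of the mechanics:

    #include <stddef.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct sbd_model {
        int other_fields;
        struct { int placeholder; } sd_aspace;  /* stands in for the mapping */
    };

    /* given mapping == &sdp->sd_aspace:
     *   struct sbd_model *sdp = container_of(mapping, struct sbd_model, sd_aspace);
     */

Splitting gfs2_rgrp_aops off from gfs2_meta_aops is what makes the two cases distinguishable by a_ops pointer comparison.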
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c6872d09561a..22f954051bb8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -104,7 +106,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
104 mapping = &sdp->sd_aspace; 106 mapping = &sdp->sd_aspace;
105 107
106 address_space_init_once(mapping); 108 address_space_init_once(mapping);
107 mapping->a_ops = &gfs2_meta_aops; 109 mapping->a_ops = &gfs2_rgrp_aops;
108 mapping->host = sb->s_bdev->bd_inode; 110 mapping->host = sb->s_bdev->bd_inode;
109 mapping->flags = 0; 111 mapping->flags = 0;
110 mapping_set_gfp_mask(mapping, GFP_NOFS); 112 mapping_set_gfp_mask(mapping, GFP_NOFS);
@@ -114,9 +116,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
114 116
115 spin_lock_init(&sdp->sd_log_lock); 117 spin_lock_init(&sdp->sd_log_lock);
116 atomic_set(&sdp->sd_log_pinned, 0); 118 atomic_set(&sdp->sd_log_pinned, 0);
117 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
118 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 119 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
119 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
120 INIT_LIST_HEAD(&sdp->sd_log_le_ordered); 120 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
121 spin_lock_init(&sdp->sd_ordered_lock); 121 spin_lock_init(&sdp->sd_ordered_lock);
122 122
@@ -130,8 +130,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
130 atomic_set(&sdp->sd_log_in_flight, 0); 130 atomic_set(&sdp->sd_log_in_flight, 0);
131 init_waitqueue_head(&sdp->sd_log_flush_wait); 131 init_waitqueue_head(&sdp->sd_log_flush_wait);
132 132
133 INIT_LIST_HEAD(&sdp->sd_revoke_list);
134
135 return sdp; 133 return sdp;
136} 134}
137 135
@@ -154,7 +152,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
154 if (sb->sb_magic != GFS2_MAGIC || 152 if (sb->sb_magic != GFS2_MAGIC ||
155 sb->sb_type != GFS2_METATYPE_SB) { 153 sb->sb_type != GFS2_METATYPE_SB) {
156 if (!silent) 154 if (!silent)
157 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); 155 pr_warn("not a GFS2 filesystem\n");
158 return -EINVAL; 156 return -EINVAL;
159 } 157 }
160 158
@@ -176,7 +174,7 @@ static void end_bio_io_page(struct bio *bio, int error)
176 if (!error) 174 if (!error)
177 SetPageUptodate(page); 175 SetPageUptodate(page);
178 else 176 else
179 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); 177 pr_warn("error %d reading superblock\n", error);
180 unlock_page(page); 178 unlock_page(page);
181} 179}
182 180
@@ -519,67 +517,6 @@ out:
519 return ret; 517 return ret;
520} 518}
521 519
522/**
523 * map_journal_extents - create a reusable "extent" mapping from all logical
524 * blocks to all physical blocks for the given journal. This will save
525 * us time when writing journal blocks. Most journals will have only one
526 * extent that maps all their logical blocks. That's because gfs2.mkfs
527 * arranges the journal blocks sequentially to maximize performance.
528 * So the extent would map the first block for the entire file length.
529 * However, gfs2_jadd can happen while file activity is happening, so
530 * those journals may not be sequential. Less likely is the case where
531 * the users created their own journals by mounting the metafs and
532 * laying it out. But it's still possible. These journals might have
533 * several extents.
534 *
535 * TODO: This should be done in bigger chunks rather than one block at a time,
536 * but since it's only done at mount time, I'm not worried about the
537 * time it takes.
538 */
539static int map_journal_extents(struct gfs2_sbd *sdp)
540{
541 struct gfs2_jdesc *jd = sdp->sd_jdesc;
542 unsigned int lb;
543 u64 db, prev_db; /* logical block, disk block, prev disk block */
544 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
545 struct gfs2_journal_extent *jext = NULL;
546 struct buffer_head bh;
547 int rc = 0;
548
549 prev_db = 0;
550
551 for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
552 bh.b_state = 0;
553 bh.b_blocknr = 0;
554 bh.b_size = 1 << ip->i_inode.i_blkbits;
555 rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0);
556 db = bh.b_blocknr;
557 if (rc || !db) {
558 printk(KERN_INFO "GFS2 journal mapping error %d: lb="
559 "%u db=%llu\n", rc, lb, (unsigned long long)db);
560 break;
561 }
562 if (!prev_db || db != prev_db + 1) {
563 jext = kzalloc(sizeof(struct gfs2_journal_extent),
564 GFP_KERNEL);
565 if (!jext) {
566 printk(KERN_INFO "GFS2 error: out of memory "
567 "mapping journal extents.\n");
568 rc = -ENOMEM;
569 break;
570 }
571 jext->dblock = db;
572 jext->lblock = lb;
573 jext->blocks = 1;
574 list_add_tail(&jext->extent_list, &jd->extent_list);
575 } else {
576 jext->blocks++;
577 }
578 prev_db = db;
579 }
580 return rc;
581}
582
583static void gfs2_others_may_mount(struct gfs2_sbd *sdp) 520static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
584{ 521{
585 char *message = "FIRSTMOUNT=Done"; 522 char *message = "FIRSTMOUNT=Done";
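
map_journal_extents() is deleted here rather than dropped: the init_journal() hunk below swaps the call for gfs2_map_journal_extents(sdp, sdp->sd_jdesc), so the mapping logic moves out of the mount path (presumably to bmap.c, alongside the gfs2_free_journal_extents() helper that super.c starts using). The accumulation rule it implements stays the same: extend the current extent while disk blocks remain contiguous, otherwise start a new one:

    /* db = disk block returned by the block mapper for logical block lb */
    if (!prev_db || db != prev_db + 1)
        start_new_extent(lb, db);    /* hypothetical helper: begin a new run */
    else
        jext->blocks++;              /* still contiguous: grow the run */
    prev_db = db;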
@@ -638,6 +575,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
638 break; 575 break;
639 576
640 INIT_LIST_HEAD(&jd->extent_list); 577 INIT_LIST_HEAD(&jd->extent_list);
578 INIT_LIST_HEAD(&jd->jd_revoke_list);
579
641 INIT_WORK(&jd->jd_work, gfs2_recover_func); 580 INIT_WORK(&jd->jd_work, gfs2_recover_func);
642 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); 581 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
643 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 582 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
@@ -781,7 +720,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
781 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); 720 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
782 721
783 /* Map the extents for this journal's blocks */ 722 /* Map the extents for this journal's blocks */
784 map_journal_extents(sdp); 723 gfs2_map_journal_extents(sdp, sdp->sd_jdesc);
785 } 724 }
786 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); 725 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
787 726
@@ -1008,7 +947,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1008 lm = &gfs2_dlm_ops; 947 lm = &gfs2_dlm_ops;
1009#endif 948#endif
1010 } else { 949 } else {
1011 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto); 950 pr_info("can't find protocol %s\n", proto);
1012 return -ENOENT; 951 return -ENOENT;
1013 } 952 }
1014 953
@@ -1115,7 +1054,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1115 1054
1116 sdp = init_sbd(sb); 1055 sdp = init_sbd(sb);
1117 if (!sdp) { 1056 if (!sdp) {
1118 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); 1057 pr_warn("can't alloc struct gfs2_sbd\n");
1119 return -ENOMEM; 1058 return -ENOMEM;
1120 } 1059 }
1121 sdp->sd_args = *args; 1060 sdp->sd_args = *args;
@@ -1363,7 +1302,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1363 1302
1364 error = gfs2_mount_args(&args, data); 1303 error = gfs2_mount_args(&args, data);
1365 if (error) { 1304 if (error) {
1366 printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); 1305 pr_warn("can't parse mount arguments\n");
1367 goto error_super; 1306 goto error_super;
1368 } 1307 }
1369 1308
@@ -1413,15 +1352,15 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
1413 1352
1414 error = kern_path(dev_name, LOOKUP_FOLLOW, &path); 1353 error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
1415 if (error) { 1354 if (error) {
1416 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1355 pr_warn("path_lookup on %s returned error %d\n",
1417 dev_name, error); 1356 dev_name, error);
1418 return ERR_PTR(error); 1357 return ERR_PTR(error);
1419 } 1358 }
1420 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, 1359 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
1421 path.dentry->d_inode->i_sb->s_bdev); 1360 path.dentry->d_inode->i_sb->s_bdev);
1422 path_put(&path); 1361 path_put(&path);
1423 if (IS_ERR(s)) { 1362 if (IS_ERR(s)) {
1424 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1363 pr_warn("gfs2 mount does not exist\n");
1425 return ERR_CAST(s); 1364 return ERR_CAST(s);
1426 } 1365 }
1427 if ((flags ^ s->s_flags) & MS_RDONLY) { 1366 if ((flags ^ s->s_flags) & MS_RDONLY) {
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8bec0e3192dd..c4effff7cf55 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -36,6 +36,8 @@
36 * the quota file, so it is not being constantly read. 36 * the quota file, so it is not being constantly read.
37 */ 37 */
38 38
39#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
40
39#include <linux/sched.h> 41#include <linux/sched.h>
40#include <linux/slab.h> 42#include <linux/slab.h>
41#include <linux/mm.h> 43#include <linux/mm.h>
@@ -330,6 +332,7 @@ static int slot_get(struct gfs2_quota_data *qd)
330 if (bit < sdp->sd_quota_slots) { 332 if (bit < sdp->sd_quota_slots) {
331 set_bit(bit, sdp->sd_quota_bitmap); 333 set_bit(bit, sdp->sd_quota_bitmap);
332 qd->qd_slot = bit; 334 qd->qd_slot = bit;
335 error = 0;
333out: 336out:
334 qd->qd_slot_count++; 337 qd->qd_slot_count++;
335 } 338 }
@@ -1081,10 +1084,10 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
1081{ 1084{
1082 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 1085 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
1083 1086
1084 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n", 1087 fs_info(sdp, "quota %s for %s %u\n",
1085 sdp->sd_fsname, type, 1088 type,
1086 (qd->qd_id.type == USRQUOTA) ? "user" : "group", 1089 (qd->qd_id.type == USRQUOTA) ? "user" : "group",
1087 from_kqid(&init_user_ns, qd->qd_id)); 1090 from_kqid(&init_user_ns, qd->qd_id));
1088 1091
1089 return 0; 1092 return 0;
1090} 1093}
@@ -1242,14 +1245,13 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1242 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); 1245 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
1243 bm_size *= sizeof(unsigned long); 1246 bm_size *= sizeof(unsigned long);
1244 error = -ENOMEM; 1247 error = -ENOMEM;
1245 sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN); 1248 sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);
1246 if (sdp->sd_quota_bitmap == NULL) 1249 if (sdp->sd_quota_bitmap == NULL)
1247 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); 1250 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS |
1251 __GFP_ZERO, PAGE_KERNEL);
1248 if (!sdp->sd_quota_bitmap) 1252 if (!sdp->sd_quota_bitmap)
1249 return error; 1253 return error;
1250 1254
1251 memset(sdp->sd_quota_bitmap, 0, bm_size);
1252
1253 for (x = 0; x < blocks; x++) { 1255 for (x = 0; x < blocks; x++) {
1254 struct buffer_head *bh; 1256 struct buffer_head *bh;
1255 const struct gfs2_quota_change *qc; 1257 const struct gfs2_quota_change *qc;
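
The bitmap allocation switches to zeroed memory on both paths, which is what makes the memset() removal below safe: kzalloc() zeroes the slab case, and __GFP_ZERO zeroes the __vmalloc() fallback. The quiet-try-then-vmalloc idiom:

    bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);   /* no warning on failure */
    if (bitmap == NULL)
        bitmap = __vmalloc(bm_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL);
    if (bitmap == NULL)
        return -ENOMEM;
    /* the free side must mirror this, e.g. kfree() vs vfree()
       chosen by is_vmalloc_addr() */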
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 963b2d75200c..7ad4094d68c0 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -52,9 +52,9 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
52 return error; 52 return error;
53} 53}
54 54
55int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) 55int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
56{ 56{
57 struct list_head *head = &sdp->sd_revoke_list; 57 struct list_head *head = &jd->jd_revoke_list;
58 struct gfs2_revoke_replay *rr; 58 struct gfs2_revoke_replay *rr;
59 int found = 0; 59 int found = 0;
60 60
@@ -81,13 +81,13 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
81 return 1; 81 return 1;
82} 82}
83 83
84int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) 84int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
85{ 85{
86 struct gfs2_revoke_replay *rr; 86 struct gfs2_revoke_replay *rr;
87 int wrap, a, b, revoke; 87 int wrap, a, b, revoke;
88 int found = 0; 88 int found = 0;
89 89
90 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { 90 list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) {
91 if (rr->rr_blkno == blkno) { 91 if (rr->rr_blkno == blkno) {
92 found = 1; 92 found = 1;
93 break; 93 break;
@@ -97,17 +97,17 @@ int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
97 if (!found) 97 if (!found)
98 return 0; 98 return 0;
99 99
100 wrap = (rr->rr_where < sdp->sd_replay_tail); 100 wrap = (rr->rr_where < jd->jd_replay_tail);
101 a = (sdp->sd_replay_tail < where); 101 a = (jd->jd_replay_tail < where);
102 b = (where < rr->rr_where); 102 b = (where < rr->rr_where);
103 revoke = (wrap) ? (a || b) : (a && b); 103 revoke = (wrap) ? (a || b) : (a && b);
104 104
105 return revoke; 105 return revoke;
106} 106}
107 107
108void gfs2_revoke_clean(struct gfs2_sbd *sdp) 108void gfs2_revoke_clean(struct gfs2_jdesc *jd)
109{ 109{
110 struct list_head *head = &sdp->sd_revoke_list; 110 struct list_head *head = &jd->jd_revoke_list;
111 struct gfs2_revoke_replay *rr; 111 struct gfs2_revoke_replay *rr;
112 112
113 while (!list_empty(head)) { 113 while (!list_empty(head)) {
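
gfs2_revoke_check() treats the journal as a circular log: a block is still revoked at replay position 'where' when that position lies between the replay tail and the point where the revoke was logged, with the interval possibly wrapping past the end of the log. The interval-membership test reads:

    wrap   = (rr_where < replay_tail);     /* does the interval wrap? */
    a      = (replay_tail < where);
    b      = (where < rr_where);
    revoke = wrap ? (a || b) : (a && b);   /* membership, modulo the wrap */

The conversion to jd_* state means each journal being replayed evaluates this against its own tail.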
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 2226136c7647..6142836cce96 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -23,9 +23,9 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
23extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, 23extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
24 struct buffer_head **bh); 24 struct buffer_head **bh);
25 25
26extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 26extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
27extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 27extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
28extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); 28extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
29 29
30extern int gfs2_find_jhead(struct gfs2_jdesc *jd, 30extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
31 struct gfs2_log_header_host *head); 31 struct gfs2_log_header_host *head);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index a1da21349235..281a7716e3f3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/slab.h> 12#include <linux/slab.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -99,12 +101,12 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
99 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; 101 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
100 102
101 if (unlikely(!valid_change[new_state * 4 + cur_state])) { 103 if (unlikely(!valid_change[new_state * 4 + cur_state])) {
102 printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, " 104 pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
103 "new_state=%d\n", rbm->offset, cur_state, new_state); 105 rbm->offset, cur_state, new_state);
104 printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n", 106 pr_warn("rgrp=0x%llx bi_start=0x%x\n",
105 (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); 107 (unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
106 printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n", 108 pr_warn("bi_offset=0x%x bi_len=0x%x\n",
107 bi->bi_offset, bi->bi_len); 109 bi->bi_offset, bi->bi_len);
108 dump_stack(); 110 dump_stack();
109 gfs2_consist_rgrpd(rbm->rgd); 111 gfs2_consist_rgrpd(rbm->rgd);
110 return; 112 return;
@@ -736,11 +738,11 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
736 738
737static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) 739static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
738{ 740{
739 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); 741 pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
740 printk(KERN_INFO " ri_length = %u\n", rgd->rd_length); 742 pr_info("ri_length = %u\n", rgd->rd_length);
741 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); 743 pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
742 printk(KERN_INFO " ri_data = %u\n", rgd->rd_data); 744 pr_info("ri_data = %u\n", rgd->rd_data);
743 printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes); 745 pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
744} 746}
745 747
746/** 748/**
@@ -1102,7 +1104,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1102 * Returns: errno 1104 * Returns: errno
1103 */ 1105 */
1104 1106
1105int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) 1107static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1106{ 1108{
1107 struct gfs2_sbd *sdp = rgd->rd_sbd; 1109 struct gfs2_sbd *sdp = rgd->rd_sbd;
1108 struct gfs2_glock *gl = rgd->rd_gl; 1110 struct gfs2_glock *gl = rgd->rd_gl;
@@ -1169,7 +1171,7 @@ fail:
1169 return error; 1171 return error;
1170} 1172}
1171 1173
1172int update_rgrp_lvb(struct gfs2_rgrpd *rgd) 1174static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1173{ 1175{
1174 u32 rl_flags; 1176 u32 rl_flags;
1175 1177
@@ -2278,7 +2280,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2278 } 2280 }
2279 } 2281 }
2280 if (rbm.rgd->rd_free < *nblocks) { 2282 if (rbm.rgd->rd_free < *nblocks) {
2281 printk(KERN_WARNING "nblocks=%u\n", *nblocks); 2283 pr_warn("nblocks=%u\n", *nblocks);
2282 goto rgrp_error; 2284 goto rgrp_error;
2283 } 2285 }
2284 2286
@@ -2296,7 +2298,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2296 2298
2297 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); 2299 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2298 if (dinode) 2300 if (dinode)
2299 gfs2_trans_add_unrevoke(sdp, block, 1); 2301 gfs2_trans_add_unrevoke(sdp, block, *nblocks);
2300 2302
2301 gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); 2303 gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2302 2304
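
gfs2_trans_add_unrevoke() takes a block count as its third argument; with the multi-block dinode allocation introduced in the inode.c hunks above (dinode plus an adjacent xattr block), unrevoking only one block would leave a stale revoke covering the extra block. Hence:

    gfs2_trans_add_unrevoke(sdp, block, *nblocks);   /* was hard-coded to 1 */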
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 60f60f6181f3..de8afad89e51 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/bio.h> 12#include <linux/bio.h>
11#include <linux/sched.h> 13#include <linux/sched.h>
12#include <linux/slab.h> 14#include <linux/slab.h>
@@ -175,8 +177,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
175 break; 177 break;
176 case Opt_debug: 178 case Opt_debug:
177 if (args->ar_errors == GFS2_ERRORS_PANIC) { 179 if (args->ar_errors == GFS2_ERRORS_PANIC) {
178 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " 180 pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
179 "are mutually exclusive.\n");
180 return -EINVAL; 181 return -EINVAL;
181 } 182 }
182 args->ar_debug = 1; 183 args->ar_debug = 1;
@@ -228,21 +229,21 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
228 case Opt_commit: 229 case Opt_commit:
229 rv = match_int(&tmp[0], &args->ar_commit); 230 rv = match_int(&tmp[0], &args->ar_commit);
230 if (rv || args->ar_commit <= 0) { 231 if (rv || args->ar_commit <= 0) {
231 printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n"); 232 pr_warn("commit mount option requires a positive numeric argument\n");
232 return rv ? rv : -EINVAL; 233 return rv ? rv : -EINVAL;
233 } 234 }
234 break; 235 break;
235 case Opt_statfs_quantum: 236 case Opt_statfs_quantum:
236 rv = match_int(&tmp[0], &args->ar_statfs_quantum); 237 rv = match_int(&tmp[0], &args->ar_statfs_quantum);
237 if (rv || args->ar_statfs_quantum < 0) { 238 if (rv || args->ar_statfs_quantum < 0) {
238 printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n"); 239 pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n");
239 return rv ? rv : -EINVAL; 240 return rv ? rv : -EINVAL;
240 } 241 }
241 break; 242 break;
242 case Opt_quota_quantum: 243 case Opt_quota_quantum:
243 rv = match_int(&tmp[0], &args->ar_quota_quantum); 244 rv = match_int(&tmp[0], &args->ar_quota_quantum);
244 if (rv || args->ar_quota_quantum <= 0) { 245 if (rv || args->ar_quota_quantum <= 0) {
245 printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n"); 246 pr_warn("quota_quantum mount option requires a positive numeric argument\n");
246 return rv ? rv : -EINVAL; 247 return rv ? rv : -EINVAL;
247 } 248 }
248 break; 249 break;
@@ -250,7 +251,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
250 rv = match_int(&tmp[0], &args->ar_statfs_percent); 251 rv = match_int(&tmp[0], &args->ar_statfs_percent);
251 if (rv || args->ar_statfs_percent < 0 || 252 if (rv || args->ar_statfs_percent < 0 ||
252 args->ar_statfs_percent > 100) { 253 args->ar_statfs_percent > 100) {
253 printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n"); 254 pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n");
254 return rv ? rv : -EINVAL; 255 return rv ? rv : -EINVAL;
255 } 256 }
256 break; 257 break;
@@ -259,8 +260,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
259 break; 260 break;
260 case Opt_err_panic: 261 case Opt_err_panic:
261 if (args->ar_debug) { 262 if (args->ar_debug) {
262 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " 263 pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
263 "are mutually exclusive.\n");
264 return -EINVAL; 264 return -EINVAL;
265 } 265 }
266 args->ar_errors = GFS2_ERRORS_PANIC; 266 args->ar_errors = GFS2_ERRORS_PANIC;
@@ -279,7 +279,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
279 break; 279 break;
280 case Opt_error: 280 case Opt_error:
281 default: 281 default:
282 printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); 282 pr_warn("invalid mount option: %s\n", o);
283 return -EINVAL; 283 return -EINVAL;
284 } 284 }
285 } 285 }
@@ -295,9 +295,8 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
295 295
296void gfs2_jindex_free(struct gfs2_sbd *sdp) 296void gfs2_jindex_free(struct gfs2_sbd *sdp)
297{ 297{
298 struct list_head list, *head; 298 struct list_head list;
299 struct gfs2_jdesc *jd; 299 struct gfs2_jdesc *jd;
300 struct gfs2_journal_extent *jext;
301 300
302 spin_lock(&sdp->sd_jindex_spin); 301 spin_lock(&sdp->sd_jindex_spin);
303 list_add(&list, &sdp->sd_jindex_list); 302 list_add(&list, &sdp->sd_jindex_list);
@@ -307,14 +306,7 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
307 306
308 while (!list_empty(&list)) { 307 while (!list_empty(&list)) {
309 jd = list_entry(list.next, struct gfs2_jdesc, jd_list); 308 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
310 head = &jd->extent_list; 309 gfs2_free_journal_extents(jd);
311 while (!list_empty(head)) {
312 jext = list_entry(head->next,
313 struct gfs2_journal_extent,
314 extent_list);
315 list_del(&jext->extent_list);
316 kfree(jext);
317 }
318 list_del(&jd->jd_list); 310 list_del(&jd->jd_list);
319 iput(jd->jd_inode); 311 iput(jd->jd_inode);
320 kfree(jd); 312 kfree(jd);
@@ -1175,6 +1167,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1175 struct gfs2_tune *gt = &sdp->sd_tune; 1167 struct gfs2_tune *gt = &sdp->sd_tune;
1176 int error; 1168 int error;
1177 1169
1170 sync_filesystem(sb);
1171
1178 spin_lock(&gt->gt_spin); 1172 spin_lock(&gt->gt_spin);
1179 args.ar_commit = gt->gt_logd_secs; 1173 args.ar_commit = gt->gt_logd_secs;
1180 args.ar_quota_quantum = gt->gt_quota_quantum; 1174 args.ar_quota_quantum = gt->gt_quota_quantum;
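
The sync_filesystem(sb) call reflects a VFS-wide change in this series that pushed the pre-remount sync down from do_remount_sb() into each filesystem's ->remount_fs(); every handler now flushes its own dirty state before applying new options. The expected shape:

    static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
    {
        sync_filesystem(sb);    /* previously done by the VFS caller */
        /* ... parse and apply the new mount options ... */
        return 0;
    }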
@@ -1256,7 +1250,7 @@ static int gfs2_drop_inode(struct inode *inode)
1256{ 1250{
1257 struct gfs2_inode *ip = GFS2_I(inode); 1251 struct gfs2_inode *ip = GFS2_I(inode);
1258 1252
1259 if (inode->i_nlink) { 1253 if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) {
1260 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; 1254 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1261 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) 1255 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1262 clear_nlink(inode); 1256 clear_nlink(inode);
@@ -1471,6 +1465,11 @@ static void gfs2_evict_inode(struct inode *inode)
1471 struct gfs2_holder gh; 1465 struct gfs2_holder gh;
1472 int error; 1466 int error;
1473 1467
1468 if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
1469 clear_inode(inode);
1470 return;
1471 }
1472
1474 if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) 1473 if (inode->i_nlink || (sb->s_flags & MS_RDONLY))
1475 goto out; 1474 goto out;
1476 1475
@@ -1558,7 +1557,7 @@ out_unlock:
1558 fs_warn(sdp, "gfs2_evict_inode: %d\n", error); 1557 fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
1559out: 1558out:
1560 /* Case 3 starts here */ 1559 /* Case 3 starts here */
1561 truncate_inode_pages(&inode->i_data, 0); 1560 truncate_inode_pages_final(&inode->i_data);
1562 gfs2_rs_delete(ip, NULL); 1561 gfs2_rs_delete(ip, NULL);
1563 gfs2_ordered_del_inode(ip); 1562 gfs2_ordered_del_inode(ip);
1564 clear_inode(inode); 1563 clear_inode(inode);
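
GIF_FREE_VFS_INODE closes the loop opened by the gfs2_create_inode() error path: an inode flagged as never fully created bypasses the nlink handling in gfs2_drop_inode() and takes an early exit in gfs2_evict_inode(), since there is nothing on disk to truncate, deallocate, or journal. Sketch of the fast path:

    static void gfs2_evict_inode(struct inode *inode)
    {
        if (test_bit(GIF_FREE_VFS_INODE, &GFS2_I(inode)->i_flags)) {
            clear_inode(inode);    /* free only the VFS object */
            return;
        }
        /* ... normal eviction, ending in truncate_inode_pages_final() ... */
    }

truncate_inode_pages_final() is the new end-of-life API for evict paths, replacing truncate_inode_pages(mapping, 0).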
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d09f6edda0ff..de25d5577e5d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -138,9 +140,8 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
138 if (simple_strtol(buf, NULL, 0) != 1) 140 if (simple_strtol(buf, NULL, 0) != 1)
139 return -EINVAL; 141 return -EINVAL;
140 142
141 gfs2_lm_withdraw(sdp, 143 gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");
142 "GFS2: fsid=%s: withdrawing from cluster at user's request\n", 144
143 sdp->sd_fsname);
144 return len; 145 return len;
145} 146}
146 147
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 2b20d7046bf3..bead90d27bad 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -51,6 +53,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
51 if (revokes) 53 if (revokes)
52 tr->tr_reserved += gfs2_struct2blk(sdp, revokes, 54 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
53 sizeof(u64)); 55 sizeof(u64));
56 INIT_LIST_HEAD(&tr->tr_databuf);
57 INIT_LIST_HEAD(&tr->tr_buf);
58
54 sb_start_intwrite(sdp->sd_vfs); 59 sb_start_intwrite(sdp->sd_vfs);
55 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); 60 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
56 61
@@ -96,14 +101,13 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
96 101
97static void gfs2_print_trans(const struct gfs2_trans *tr) 102static void gfs2_print_trans(const struct gfs2_trans *tr)
98{ 103{
99 printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n", 104 pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip);
100 (void *)tr->tr_ip); 105 pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n",
101 printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n", 106 tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
102 tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); 107 pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
103 printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n", 108 tr->tr_num_buf_new, tr->tr_num_buf_rm,
104 tr->tr_num_buf_new, tr->tr_num_buf_rm, 109 tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
105 tr->tr_num_databuf_new, tr->tr_num_databuf_rm, 110 tr->tr_num_revoke, tr->tr_num_revoke_rm);
106 tr->tr_num_revoke, tr->tr_num_revoke_rm);
107} 111}
108 112
109void gfs2_trans_end(struct gfs2_sbd *sdp) 113void gfs2_trans_end(struct gfs2_sbd *sdp)
@@ -210,8 +214,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
210 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 214 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
211 gfs2_pin(sdp, bd->bd_bh); 215 gfs2_pin(sdp, bd->bd_bh);
212 tr->tr_num_databuf_new++; 216 tr->tr_num_databuf_new++;
213 sdp->sd_log_num_databuf++; 217 list_add_tail(&bd->bd_list, &tr->tr_databuf);
214 list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
215 } 218 }
216 gfs2_log_unlock(sdp); 219 gfs2_log_unlock(sdp);
217 unlock_buffer(bh); 220 unlock_buffer(bh);
@@ -230,16 +233,14 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
230 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 233 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
231 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; 234 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
232 if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { 235 if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
233 printk(KERN_ERR 236 pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n",
234 "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
235 (unsigned long long)bd->bd_bh->b_blocknr); 237 (unsigned long long)bd->bd_bh->b_blocknr);
236 BUG(); 238 BUG();
237 } 239 }
238 gfs2_pin(sdp, bd->bd_bh); 240 gfs2_pin(sdp, bd->bd_bh);
239 mh->__pad0 = cpu_to_be64(0); 241 mh->__pad0 = cpu_to_be64(0);
240 mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); 242 mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
241 sdp->sd_log_num_buf++; 243 list_add(&bd->bd_list, &tr->tr_buf);
242 list_add(&bd->bd_list, &sdp->sd_log_le_buf);
243 tr->tr_num_buf_new++; 244 tr->tr_num_buf_new++;
244} 245}
245 246
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f7109f689e61..86d2035ac669 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/spinlock.h> 12#include <linux/spinlock.h>
11#include <linux/completion.h> 13#include <linux/completion.h>
12#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -30,22 +32,27 @@ mempool_t *gfs2_page_pool __read_mostly;
30 32
31void gfs2_assert_i(struct gfs2_sbd *sdp) 33void gfs2_assert_i(struct gfs2_sbd *sdp)
32{ 34{
33 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", 35 fs_emerg(sdp, "fatal assertion failed\n");
34 sdp->sd_fsname);
35} 36}
36 37
37int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) 38int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
38{ 39{
39 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 40 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
40 const struct lm_lockops *lm = ls->ls_ops; 41 const struct lm_lockops *lm = ls->ls_ops;
41 va_list args; 42 va_list args;
43 struct va_format vaf;
42 44
43 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && 45 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
44 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 46 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
45 return 0; 47 return 0;
46 48
47 va_start(args, fmt); 49 va_start(args, fmt);
48 vprintk(fmt, args); 50
51 vaf.fmt = fmt;
52 vaf.va = &args;
53
54 fs_err(sdp, "%pV", &vaf);
55
49 va_end(args); 56 va_end(args);
50 57
51 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { 58 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
@@ -66,7 +73,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
66 } 73 }
67 74
68 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) 75 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
69 panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); 76 panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname);
70 77
71 return -1; 78 return -1;
72} 79}
@@ -82,10 +89,9 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
82{ 89{
83 int me; 90 int me;
84 me = gfs2_lm_withdraw(sdp, 91 me = gfs2_lm_withdraw(sdp,
85 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" 92 "fatal: assertion \"%s\" failed\n"
86 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 93 " function = %s, file = %s, line = %u\n",
87 sdp->sd_fsname, assertion, 94 assertion, function, file, line);
88 sdp->sd_fsname, function, file, line);
89 dump_stack(); 95 dump_stack();
90 return (me) ? -1 : -2; 96 return (me) ? -1 : -2;
91} 97}
@@ -105,11 +111,8 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
105 return -2; 111 return -2;
106 112
107 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) 113 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
108 printk(KERN_WARNING 114 fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n",
109 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" 115 assertion, function, file, line);
110 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
111 sdp->sd_fsname, assertion,
112 sdp->sd_fsname, function, file, line);
113 116
114 if (sdp->sd_args.ar_debug) 117 if (sdp->sd_args.ar_debug)
115 BUG(); 118 BUG();
@@ -138,10 +141,8 @@ int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
138{ 141{
139 int rv; 142 int rv;
140 rv = gfs2_lm_withdraw(sdp, 143 rv = gfs2_lm_withdraw(sdp,
141 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 144 "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
142 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 145 function, file, line);
143 sdp->sd_fsname,
144 sdp->sd_fsname, function, file, line);
145 return rv; 146 return rv;
146} 147}
147 148
@@ -157,13 +158,12 @@ int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
157 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 158 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
158 int rv; 159 int rv;
159 rv = gfs2_lm_withdraw(sdp, 160 rv = gfs2_lm_withdraw(sdp,
160 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 161 "fatal: filesystem consistency error\n"
161 "GFS2: fsid=%s: inode = %llu %llu\n" 162 " inode = %llu %llu\n"
162 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 163 " function = %s, file = %s, line = %u\n",
163 sdp->sd_fsname, 164 (unsigned long long)ip->i_no_formal_ino,
164 sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino, 165 (unsigned long long)ip->i_no_addr,
165 (unsigned long long)ip->i_no_addr, 166 function, file, line);
166 sdp->sd_fsname, function, file, line);
167 return rv; 167 return rv;
168} 168}
169 169
@@ -179,12 +179,11 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
179 struct gfs2_sbd *sdp = rgd->rd_sbd; 179 struct gfs2_sbd *sdp = rgd->rd_sbd;
180 int rv; 180 int rv;
181 rv = gfs2_lm_withdraw(sdp, 181 rv = gfs2_lm_withdraw(sdp,
182 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 182 "fatal: filesystem consistency error\n"
183 "GFS2: fsid=%s: RG = %llu\n" 183 " RG = %llu\n"
184 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 184 " function = %s, file = %s, line = %u\n",
185 sdp->sd_fsname, 185 (unsigned long long)rgd->rd_addr,
186 sdp->sd_fsname, (unsigned long long)rgd->rd_addr, 186 function, file, line);
187 sdp->sd_fsname, function, file, line);
188 return rv; 187 return rv;
189} 188}
190 189
@@ -200,12 +199,11 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
200{ 199{
201 int me; 200 int me;
202 me = gfs2_lm_withdraw(sdp, 201 me = gfs2_lm_withdraw(sdp,
203 "GFS2: fsid=%s: fatal: invalid metadata block\n" 202 "fatal: invalid metadata block\n"
204 "GFS2: fsid=%s: bh = %llu (%s)\n" 203 " bh = %llu (%s)\n"
205 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 204 " function = %s, file = %s, line = %u\n",
206 sdp->sd_fsname, 205 (unsigned long long)bh->b_blocknr, type,
207 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, 206 function, file, line);
208 sdp->sd_fsname, function, file, line);
209 return (me) ? -1 : -2; 207 return (me) ? -1 : -2;
210} 208}
211 209
@@ -221,12 +219,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
221{ 219{
222 int me; 220 int me;
223 me = gfs2_lm_withdraw(sdp, 221 me = gfs2_lm_withdraw(sdp,
224 "GFS2: fsid=%s: fatal: invalid metadata block\n" 222 "fatal: invalid metadata block\n"
225 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" 223 " bh = %llu (type: exp=%u, found=%u)\n"
226 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 224 " function = %s, file = %s, line = %u\n",
227 sdp->sd_fsname, 225 (unsigned long long)bh->b_blocknr, type, t,
228 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, 226 function, file, line);
229 sdp->sd_fsname, function, file, line);
230 return (me) ? -1 : -2; 227 return (me) ? -1 : -2;
231} 228}
232 229
@@ -241,10 +238,9 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
241{ 238{
242 int rv; 239 int rv;
243 rv = gfs2_lm_withdraw(sdp, 240 rv = gfs2_lm_withdraw(sdp,
244 "GFS2: fsid=%s: fatal: I/O error\n" 241 "fatal: I/O error\n"
245 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 242 " function = %s, file = %s, line = %u\n",
246 sdp->sd_fsname, 243 function, file, line);
247 sdp->sd_fsname, function, file, line);
248 return rv; 244 return rv;
249} 245}
250 246
@@ -259,12 +255,11 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
259{ 255{
260 int rv; 256 int rv;
261 rv = gfs2_lm_withdraw(sdp, 257 rv = gfs2_lm_withdraw(sdp,
262 "GFS2: fsid=%s: fatal: I/O error\n" 258 "fatal: I/O error\n"
263 "GFS2: fsid=%s: block = %llu\n" 259 " block = %llu\n"
264 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 260 " function = %s, file = %s, line = %u\n",
265 sdp->sd_fsname, 261 (unsigned long long)bh->b_blocknr,
266 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, 262 function, file, line);
267 sdp->sd_fsname, function, file, line);
268 return rv; 263 return rv;
269} 264}
270 265
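gfs2_lm_withdraw() now routes its caller's format string through struct va_format and the %pV printk extension instead of vprintk(), so the fs_err() wrapper can prepend the fsid prefix without formatting twice. The forwarding idiom, as a hedged sketch:

#include <linux/printk.h>

static void demo_err(const char *fsid, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* %pV re-expands fmt/args inside this single printk call. */
	pr_err("fsid=%s: %pV", fsid, &vaf);
	va_end(args);
}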
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index b7ffb09b99ea..cbdcbdf39614 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -10,22 +10,23 @@
10#ifndef __UTIL_DOT_H__ 10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__ 11#define __UTIL_DOT_H__
12 12
13#ifdef pr_fmt
14#undef pr_fmt
15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16#endif
17
13#include <linux/mempool.h> 18#include <linux/mempool.h>
14 19
15#include "incore.h" 20#include "incore.h"
16 21
17#define fs_printk(level, fs, fmt, arg...) \ 22#define fs_emerg(fs, fmt, ...) \
18 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) 23 pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
19 24#define fs_warn(fs, fmt, ...) \
20#define fs_info(fs, fmt, arg...) \ 25 pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
21 fs_printk(KERN_INFO , fs , fmt , ## arg) 26#define fs_err(fs, fmt, ...) \
22 27 pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
23#define fs_warn(fs, fmt, arg...) \ 28#define fs_info(fs, fmt, ...) \
24 fs_printk(KERN_WARNING , fs , fmt , ## arg) 29 pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
25
26#define fs_err(fs, fmt, arg...) \
27 fs_printk(KERN_ERR, fs , fmt , ## arg)
28
29 30
30void gfs2_assert_i(struct gfs2_sbd *sdp); 31void gfs2_assert_i(struct gfs2_sbd *sdp);
31 32
@@ -85,7 +86,7 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
85 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; 86 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
86 u32 magic = be32_to_cpu(mh->mh_magic); 87 u32 magic = be32_to_cpu(mh->mh_magic);
87 if (unlikely(magic != GFS2_MAGIC)) { 88 if (unlikely(magic != GFS2_MAGIC)) {
88 printk(KERN_ERR "GFS2: Magic number missing at %llu\n", 89 pr_err("Magic number missing at %llu\n",
89 (unsigned long long)bh->b_blocknr); 90 (unsigned long long)bh->b_blocknr);
90 return -EIO; 91 return -EIO;
91 } 92 }
@@ -164,7 +165,7 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
164#define gfs2_tune_get(sdp, field) \ 165#define gfs2_tune_get(sdp, field) \
165gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) 166gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
166 167
167int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); 168__printf(2, 3)
169int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...);
168 170
169#endif /* __UTIL_DOT_H__ */ 171#endif /* __UTIL_DOT_H__ */
170
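The header change is more than the const qualifier: __printf(2, 3) marks argument 2 as a printf-style format consumed by the varargs beginning at argument 3, so the compiler can now type-check every gfs2_lm_withdraw() call site. A sketch, with illustrative demo names:

#include <linux/compiler.h>	/* __printf() wraps format(printf, n, m) */

struct demo_sbd;		/* stands in for struct gfs2_sbd */

__printf(2, 3)
int demo_withdraw(struct demo_sbd *sdp, const char *fmt, ...);

/* demo_withdraw(sdp, "block = %llu\n", 42) now warns: int vs %llu. */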
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 380ab31b5e0f..9e2fecd62f62 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -547,7 +547,7 @@ out:
547 547
548void hfs_evict_inode(struct inode *inode) 548void hfs_evict_inode(struct inode *inode)
549{ 549{
550 truncate_inode_pages(&inode->i_data, 0); 550 truncate_inode_pages_final(&inode->i_data);
551 clear_inode(inode); 551 clear_inode(inode);
552 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { 552 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
553 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 553 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
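This is the first of many identical conversions in this series: evict_inode implementations switch from truncate_inode_pages(mapping, 0) to truncate_inode_pages_final(mapping), which also reclaims the shadow entries counted by the new nrshadows field (see the BUG_ON added to clear_inode() in fs/inode.c below). The resulting shape of an evict handler, sketched:

static void demo_evict_inode(struct inode *inode)
{
	/* Also clears shadow/workingset entries, so the later
	 * BUG_ON(inode->i_data.nrshadows) in clear_inode() holds. */
	truncate_inode_pages_final(&inode->i_data);
	clear_inode(inode);
	/* ... filesystem-specific teardown ... */
}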
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 2d2039e754cd..eee7206c38d1 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
112 112
113static int hfs_remount(struct super_block *sb, int *flags, char *data) 113static int hfs_remount(struct super_block *sb, int *flags, char *data)
114{ 114{
115 sync_filesystem(sb);
115 *flags |= MS_NODIRATIME; 116 *flags |= MS_NODIRATIME;
116 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 117 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
117 return 0; 118 return 0;
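The same one-liner recurs below (hfsplus, hpfs, isofs, jffs2, jfs): remount handlers gain a sync_filesystem(sb) call before any flag changes, so dirty data is written back while the filesystem is still guaranteed writable. The pattern, reduced:

static int demo_remount(struct super_block *sb, int *flags, char *data)
{
	sync_filesystem(sb);	/* flush while still read-write */

	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
		return 0;	/* ro/rw state unchanged */
	/* ... parse options, apply the ro<->rw transition ... */
	return 0;
}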
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 0f47890299c4..caf89a7be0a1 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -11,7 +11,7 @@
11 11
12static struct kmem_cache *hfsplus_attr_tree_cachep; 12static struct kmem_cache *hfsplus_attr_tree_cachep;
13 13
14int hfsplus_create_attr_tree_cache(void) 14int __init hfsplus_create_attr_tree_cache(void)
15{ 15{
16 if (hfsplus_attr_tree_cachep) 16 if (hfsplus_attr_tree_cachep)
17 return -EEXIST; 17 return -EEXIST;
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index fbb212fbb1ef..a7aafb35b624 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -227,10 +227,8 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
227 u32 ablock, dblock, mask; 227 u32 ablock, dblock, mask;
228 sector_t sector; 228 sector_t sector;
229 int was_dirty = 0; 229 int was_dirty = 0;
230 int shift;
231 230
232 /* Convert inode block to disk allocation block */ 231 /* Convert inode block to disk allocation block */
233 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
234 ablock = iblock >> sbi->fs_shift; 232 ablock = iblock >> sbi->fs_shift;
235 233
236 if (iblock >= hip->fs_blocks) { 234 if (iblock >= hip->fs_blocks) {
@@ -498,11 +496,13 @@ int hfsplus_file_extend(struct inode *inode)
498 goto insert_extent; 496 goto insert_extent;
499 } 497 }
500out: 498out:
501 mutex_unlock(&hip->extents_lock);
502 if (!res) { 499 if (!res) {
503 hip->alloc_blocks += len; 500 hip->alloc_blocks += len;
501 mutex_unlock(&hip->extents_lock);
504 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); 502 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
503 return 0;
505 } 504 }
505 mutex_unlock(&hip->extents_lock);
506 return res; 506 return res;
507 507
508insert_extent: 508insert_extent:
@@ -556,11 +556,13 @@ void hfsplus_file_truncate(struct inode *inode)
556 556
557 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> 557 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
558 HFSPLUS_SB(sb)->alloc_blksz_shift; 558 HFSPLUS_SB(sb)->alloc_blksz_shift;
559
560 mutex_lock(&hip->extents_lock);
561
559 alloc_cnt = hip->alloc_blocks; 562 alloc_cnt = hip->alloc_blocks;
560 if (blk_cnt == alloc_cnt) 563 if (blk_cnt == alloc_cnt)
561 goto out; 564 goto out_unlock;
562 565
563 mutex_lock(&hip->extents_lock);
564 res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); 566 res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
565 if (res) { 567 if (res) {
566 mutex_unlock(&hip->extents_lock); 568 mutex_unlock(&hip->extents_lock);
@@ -592,10 +594,10 @@ void hfsplus_file_truncate(struct inode *inode)
592 hfs_brec_remove(&fd); 594 hfs_brec_remove(&fd);
593 } 595 }
594 hfs_find_exit(&fd); 596 hfs_find_exit(&fd);
595 mutex_unlock(&hip->extents_lock);
596 597
597 hip->alloc_blocks = blk_cnt; 598 hip->alloc_blocks = blk_cnt;
598out: 599out_unlock:
600 mutex_unlock(&hip->extents_lock);
599 hip->phys_size = inode->i_size; 601 hip->phys_size = inode->i_size;
600 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> 602 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
601 sb->s_blocksize_bits; 603 sb->s_blocksize_bits;
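The extents.c changes are a lock-scope fix rather than a cleanup: hip->alloc_blocks was read (in truncate) and written (in extend) after extents_lock had been dropped, so two racing callers could work from a stale block count. After the patch the field is only touched under the mutex; the corrected extend path, reduced to essentials:

mutex_lock(&hip->extents_lock);
/* ... find or allocate extents, setting res and len ... */
if (!res) {
	hip->alloc_blocks += len;	/* updated while still locked */
	mutex_unlock(&hip->extents_lock);
	return 0;
}
mutex_unlock(&hip->extents_lock);
return res;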
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 62d571eb69ba..83dc29286b10 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -367,7 +367,7 @@ typedef int (*search_strategy_t)(struct hfs_bnode *,
367 */ 367 */
368 368
369/* attributes.c */ 369/* attributes.c */
370int hfsplus_create_attr_tree_cache(void); 370int __init hfsplus_create_attr_tree_cache(void);
371void hfsplus_destroy_attr_tree_cache(void); 371void hfsplus_destroy_attr_tree_cache(void);
372hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); 372hfsplus_attr_entry *hfsplus_alloc_attr_entry(void);
373void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); 373void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 80875aa640ef..a513d2d36be9 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -161,7 +161,7 @@ static int hfsplus_write_inode(struct inode *inode,
161static void hfsplus_evict_inode(struct inode *inode) 161static void hfsplus_evict_inode(struct inode *inode)
162{ 162{
163 hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); 163 hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
164 truncate_inode_pages(&inode->i_data, 0); 164 truncate_inode_pages_final(&inode->i_data);
165 clear_inode(inode); 165 clear_inode(inode);
166 if (HFSPLUS_IS_RSRC(inode)) { 166 if (HFSPLUS_IS_RSRC(inode)) {
167 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 167 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
@@ -323,6 +323,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
323 323
324static int hfsplus_remount(struct super_block *sb, int *flags, char *data) 324static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
325{ 325{
326 sync_filesystem(sb);
326 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 327 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
327 return 0; 328 return 0;
328 if (!(*flags & MS_RDONLY)) { 329 if (!(*flags & MS_RDONLY)) {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index fe649d325b1f..9c470fde9878 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -230,7 +230,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
230 230
231static void hostfs_evict_inode(struct inode *inode) 231static void hostfs_evict_inode(struct inode *inode)
232{ 232{
233 truncate_inode_pages(&inode->i_data, 0); 233 truncate_inode_pages_final(&inode->i_data);
234 clear_inode(inode); 234 clear_inode(inode);
235 if (HOSTFS_I(inode)->fd != -1) { 235 if (HOSTFS_I(inode)->fd != -1) {
236 close_file(&HOSTFS_I(inode)->fd); 236 close_file(&HOSTFS_I(inode)->fd);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 9edeeb0ea97e..50a427313835 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -304,7 +304,7 @@ void hpfs_write_if_changed(struct inode *inode)
304 304
305void hpfs_evict_inode(struct inode *inode) 305void hpfs_evict_inode(struct inode *inode)
306{ 306{
307 truncate_inode_pages(&inode->i_data, 0); 307 truncate_inode_pages_final(&inode->i_data);
308 clear_inode(inode); 308 clear_inode(inode);
309 if (!inode->i_nlink) { 309 if (!inode->i_nlink) {
310 hpfs_lock(inode->i_sb); 310 hpfs_lock(inode->i_sb);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 4534ff688b76..fe3463a43236 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -421,6 +421,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
421 struct hpfs_sb_info *sbi = hpfs_sb(s); 421 struct hpfs_sb_info *sbi = hpfs_sb(s);
422 char *new_opts = kstrdup(data, GFP_KERNEL); 422 char *new_opts = kstrdup(data, GFP_KERNEL);
423 423
424 sync_filesystem(s);
425
424 *flags |= MS_NOATIME; 426 *flags |= MS_NOATIME;
425 427
426 hpfs_lock(s); 428 hpfs_lock(s);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d19b30ababf1..204027520937 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -366,7 +366,13 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
366 366
367static void hugetlbfs_evict_inode(struct inode *inode) 367static void hugetlbfs_evict_inode(struct inode *inode)
368{ 368{
369 struct resv_map *resv_map;
370
369 truncate_hugepages(inode, 0); 371 truncate_hugepages(inode, 0);
372 resv_map = (struct resv_map *)inode->i_mapping->private_data;
373 /* root inode doesn't have the resv_map, so we should check it */
374 if (resv_map)
375 resv_map_release(&resv_map->refs);
370 clear_inode(inode); 376 clear_inode(inode);
371} 377}
372 378
@@ -476,6 +482,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
476 umode_t mode, dev_t dev) 482 umode_t mode, dev_t dev)
477{ 483{
478 struct inode *inode; 484 struct inode *inode;
485 struct resv_map *resv_map;
486
487 resv_map = resv_map_alloc();
488 if (!resv_map)
489 return NULL;
479 490
480 inode = new_inode(sb); 491 inode = new_inode(sb);
481 if (inode) { 492 if (inode) {
@@ -487,7 +498,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
487 inode->i_mapping->a_ops = &hugetlbfs_aops; 498 inode->i_mapping->a_ops = &hugetlbfs_aops;
488 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 499 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
489 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 500 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
490 INIT_LIST_HEAD(&inode->i_mapping->private_list); 501 inode->i_mapping->private_data = resv_map;
491 info = HUGETLBFS_I(inode); 502 info = HUGETLBFS_I(inode);
492 /* 503 /*
493 * The policy is initialized here even if we are creating a 504 * The policy is initialized here even if we are creating a
@@ -517,7 +528,9 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
517 break; 528 break;
518 } 529 }
519 lockdep_annotate_inode_mutex_key(inode); 530 lockdep_annotate_inode_mutex_key(inode);
520 } 531 } else
532 kref_put(&resv_map->refs, resv_map_release);
533
521 return inode; 534 return inode;
522} 535}
523 536
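The hugetlbfs hunks give every regular inode its own reservation map: allocated up front in hugetlbfs_get_inode(), handed to the address_space via i_mapping->private_data, and released through its kref either on the new_inode() failure path or later in hugetlbfs_evict_inode() (where the root inode, which never got one, is checked for NULL). The ownership hand-off, sketched:

struct resv_map *resv_map = resv_map_alloc();
if (!resv_map)
	return NULL;

inode = new_inode(sb);
if (inode)
	inode->i_mapping->private_data = resv_map; /* inode owns it now */
else
	kref_put(&resv_map->refs, resv_map_release); /* nobody took it */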
diff --git a/fs/inode.c b/fs/inode.c
index 4bcdad3c9361..f96d2a6f88cc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -503,6 +503,7 @@ void clear_inode(struct inode *inode)
503 */ 503 */
504 spin_lock_irq(&inode->i_data.tree_lock); 504 spin_lock_irq(&inode->i_data.tree_lock);
505 BUG_ON(inode->i_data.nrpages); 505 BUG_ON(inode->i_data.nrpages);
506 BUG_ON(inode->i_data.nrshadows);
506 spin_unlock_irq(&inode->i_data.tree_lock); 507 spin_unlock_irq(&inode->i_data.tree_lock);
507 BUG_ON(!list_empty(&inode->i_data.private_list)); 508 BUG_ON(!list_empty(&inode->i_data.private_list));
508 BUG_ON(!(inode->i_state & I_FREEING)); 509 BUG_ON(!(inode->i_state & I_FREEING));
@@ -548,8 +549,7 @@ static void evict(struct inode *inode)
548 if (op->evict_inode) { 549 if (op->evict_inode) {
549 op->evict_inode(inode); 550 op->evict_inode(inode);
550 } else { 551 } else {
551 if (inode->i_data.nrpages) 552 truncate_inode_pages_final(&inode->i_data);
552 truncate_inode_pages(&inode->i_data, 0);
553 clear_inode(inode); 553 clear_inode(inode);
554 } 554 }
555 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 555 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -944,24 +944,22 @@ EXPORT_SYMBOL(unlock_new_inode);
944 944
945/** 945/**
946 * lock_two_nondirectories - take two i_mutexes on non-directory objects 946 * lock_two_nondirectories - take two i_mutexes on non-directory objects
947 *
948 * Lock any non-NULL argument that is not a directory.
949 * Zero, one or two objects may be locked by this function.
950 *
947 * @inode1: first inode to lock 951 * @inode1: first inode to lock
948 * @inode2: second inode to lock 952 * @inode2: second inode to lock
949 */ 953 */
950void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) 954void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
951{ 955{
952 WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); 956 if (inode1 > inode2)
953 if (inode1 == inode2 || !inode2) { 957 swap(inode1, inode2);
954 mutex_lock(&inode1->i_mutex); 958
955 return; 959 if (inode1 && !S_ISDIR(inode1->i_mode))
956 }
957 WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
958 if (inode1 < inode2) {
959 mutex_lock(&inode1->i_mutex); 960 mutex_lock(&inode1->i_mutex);
961 if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
960 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); 962 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
961 } else {
962 mutex_lock(&inode2->i_mutex);
963 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_NONDIR2);
964 }
965} 963}
966EXPORT_SYMBOL(lock_two_nondirectories); 964EXPORT_SYMBOL(lock_two_nondirectories);
967 965
@@ -972,8 +970,9 @@ EXPORT_SYMBOL(lock_two_nondirectories);
972 */ 970 */
973void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) 971void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
974{ 972{
975 mutex_unlock(&inode1->i_mutex); 973 if (inode1 && !S_ISDIR(inode1->i_mode))
976 if (inode2 && inode2 != inode1) 974 mutex_unlock(&inode1->i_mutex);
975 if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
977 mutex_unlock(&inode2->i_mutex); 976 mutex_unlock(&inode2->i_mutex);
978} 977}
979EXPORT_SYMBOL(unlock_two_nondirectories); 978EXPORT_SYMBOL(unlock_two_nondirectories);
@@ -1899,3 +1898,34 @@ void inode_dio_done(struct inode *inode)
1899 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); 1898 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
1900} 1899}
1901EXPORT_SYMBOL(inode_dio_done); 1900EXPORT_SYMBOL(inode_dio_done);
1901
1902/*
1903 * inode_set_flags - atomically set some inode flags
1904 *
1905 * Note: the caller should be holding i_mutex, or else be sure that
1906 * they have exclusive access to the inode structure (i.e., while the
1907 * inode is being instantiated). The reason for the cmpxchg() loop
1908 * --- which wouldn't be necessary if all code paths which modify
1909 * i_flags actually followed this rule, is that there is at least one
1910 * code path which doesn't today --- for example,
1911 * __generic_file_aio_write() calls file_remove_suid() without holding
1912 * i_mutex --- so we use cmpxchg() out of an abundance of caution.
1913 *
1914 * In the long run, i_mutex is overkill, and we should probably look
1915 * at using the i_lock spinlock to protect i_flags, and then make sure
1916 * it is so documented in include/linux/fs.h and that all code follows
1917 * the locking convention!!
1918 */
1919void inode_set_flags(struct inode *inode, unsigned int flags,
1920 unsigned int mask)
1921{
1922 unsigned int old_flags, new_flags;
1923
1924 WARN_ON_ONCE(flags & ~mask);
1925 do {
1926 old_flags = ACCESS_ONCE(inode->i_flags);
1927 new_flags = (old_flags & ~mask) | flags;
1928 } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
1929 new_flags) != old_flags));
1930}
1931EXPORT_SYMBOL(inode_set_flags);
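inode_set_flags() lets a filesystem update a masked subset of i_flags without taking i_lock; the cmpxchg() loop simply retries if another writer slips in between the read and the swap. A hedged usage sketch, mirroring hypothetical on-disk flags (the DEMO_* names are illustrative, not from this patch):

unsigned int new_fl = 0;

if (disk_flags & DEMO_IMMUTABLE_FL)
	new_fl |= S_IMMUTABLE;
if (disk_flags & DEMO_NOATIME_FL)
	new_fl |= S_NOATIME;

/* Only bits in the mask change; all other i_flags bits survive. */
inode_set_flags(inode, new_fl, S_IMMUTABLE | S_NOATIME);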
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4a9e10ea13f2..6af66ee56390 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -117,6 +117,7 @@ static void destroy_inodecache(void)
117 117
118static int isofs_remount(struct super_block *sb, int *flags, char *data) 118static int isofs_remount(struct super_block *sb, int *flags, char *data)
119{ 119{
120 sync_filesystem(sb);
120 if (!(*flags & MS_RDONLY)) 121 if (!(*flags & MS_RDONLY))
121 return -EROFS; 122 return -EROFS;
122 return 0; 123 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index cf2fc0594063..5f26139a165a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
555 blk_start_plug(&plug); 555 blk_start_plug(&plug);
556 jbd2_journal_write_revoke_records(journal, commit_transaction, 556 jbd2_journal_write_revoke_records(journal, commit_transaction,
557 &log_bufs, WRITE_SYNC); 557 &log_bufs, WRITE_SYNC);
558 blk_finish_plug(&plug);
559 558
560 jbd_debug(3, "JBD2: commit phase 2b\n"); 559 jbd_debug(3, "JBD2: commit phase 2b\n");
561 560
@@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
582 err = 0; 581 err = 0;
583 bufs = 0; 582 bufs = 0;
584 descriptor = NULL; 583 descriptor = NULL;
585 blk_start_plug(&plug);
586 while (commit_transaction->t_buffers) { 584 while (commit_transaction->t_buffers) {
587 585
588 /* Find the next buffer to be journaled... */ 586 /* Find the next buffer to be journaled... */
@@ -1067,6 +1065,25 @@ restart_loop:
1067 goto restart_loop; 1065 goto restart_loop;
1068 } 1066 }
1069 1067
1068 /* Add the transaction to the checkpoint list
1069 * __journal_remove_checkpoint() can not destroy transaction
1070 * under us because it is not marked as T_FINISHED yet */
1071 if (journal->j_checkpoint_transactions == NULL) {
1072 journal->j_checkpoint_transactions = commit_transaction;
1073 commit_transaction->t_cpnext = commit_transaction;
1074 commit_transaction->t_cpprev = commit_transaction;
1075 } else {
1076 commit_transaction->t_cpnext =
1077 journal->j_checkpoint_transactions;
1078 commit_transaction->t_cpprev =
1079 commit_transaction->t_cpnext->t_cpprev;
1080 commit_transaction->t_cpnext->t_cpprev =
1081 commit_transaction;
1082 commit_transaction->t_cpprev->t_cpnext =
1083 commit_transaction;
1084 }
1085 spin_unlock(&journal->j_list_lock);
1086
1070 /* Done with this transaction! */ 1087 /* Done with this transaction! */
1071 1088
1072 jbd_debug(3, "JBD2: commit phase 7\n"); 1089 jbd_debug(3, "JBD2: commit phase 7\n");
@@ -1085,24 +1102,7 @@ restart_loop:
1085 atomic_read(&commit_transaction->t_handle_count); 1102 atomic_read(&commit_transaction->t_handle_count);
1086 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1103 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1087 commit_transaction->t_tid, &stats.run); 1104 commit_transaction->t_tid, &stats.run);
1088 1105 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1089 /*
1090 * Calculate overall stats
1091 */
1092 spin_lock(&journal->j_history_lock);
1093 journal->j_stats.ts_tid++;
1094 if (commit_transaction->t_requested)
1095 journal->j_stats.ts_requested++;
1096 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1097 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1098 journal->j_stats.run.rs_running += stats.run.rs_running;
1099 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1100 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1101 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1102 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1103 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1104 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1105 spin_unlock(&journal->j_history_lock);
1106 1106
1107 commit_transaction->t_state = T_COMMIT_CALLBACK; 1107 commit_transaction->t_state = T_COMMIT_CALLBACK;
1108 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1108 J_ASSERT(commit_transaction == journal->j_committing_transaction);
@@ -1122,24 +1122,6 @@ restart_loop:
1122 1122
1123 write_unlock(&journal->j_state_lock); 1123 write_unlock(&journal->j_state_lock);
1124 1124
1125 if (journal->j_checkpoint_transactions == NULL) {
1126 journal->j_checkpoint_transactions = commit_transaction;
1127 commit_transaction->t_cpnext = commit_transaction;
1128 commit_transaction->t_cpprev = commit_transaction;
1129 } else {
1130 commit_transaction->t_cpnext =
1131 journal->j_checkpoint_transactions;
1132 commit_transaction->t_cpprev =
1133 commit_transaction->t_cpnext->t_cpprev;
1134 commit_transaction->t_cpnext->t_cpprev =
1135 commit_transaction;
1136 commit_transaction->t_cpprev->t_cpnext =
1137 commit_transaction;
1138 }
1139 spin_unlock(&journal->j_list_lock);
1140 /* Drop all spin_locks because commit_callback may be block.
1141 * __journal_remove_checkpoint() can not destroy transaction
1142 * under us because it is not marked as T_FINISHED yet */
1143 if (journal->j_commit_callback) 1125 if (journal->j_commit_callback)
1144 journal->j_commit_callback(journal, commit_transaction); 1126 journal->j_commit_callback(journal, commit_transaction);
1145 1127
@@ -1150,7 +1132,7 @@ restart_loop:
1150 write_lock(&journal->j_state_lock); 1132 write_lock(&journal->j_state_lock);
1151 spin_lock(&journal->j_list_lock); 1133 spin_lock(&journal->j_list_lock);
1152 commit_transaction->t_state = T_FINISHED; 1134 commit_transaction->t_state = T_FINISHED;
1153 /* Recheck checkpoint lists after j_list_lock was dropped */ 1135 /* Check if the transaction can be dropped now that we are finished */
1154 if (commit_transaction->t_checkpoint_list == NULL && 1136 if (commit_transaction->t_checkpoint_list == NULL &&
1155 commit_transaction->t_checkpoint_io_list == NULL) { 1137 commit_transaction->t_checkpoint_io_list == NULL) {
1156 __jbd2_journal_drop_transaction(journal, commit_transaction); 1138 __jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -1159,4 +1141,21 @@ restart_loop:
1159 spin_unlock(&journal->j_list_lock); 1141 spin_unlock(&journal->j_list_lock);
1160 write_unlock(&journal->j_state_lock); 1142 write_unlock(&journal->j_state_lock);
1161 wake_up(&journal->j_wait_done_commit); 1143 wake_up(&journal->j_wait_done_commit);
1144
1145 /*
1146 * Calculate overall stats
1147 */
1148 spin_lock(&journal->j_history_lock);
1149 journal->j_stats.ts_tid++;
1150 journal->j_stats.ts_requested += stats.ts_requested;
1151 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1152 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1153 journal->j_stats.run.rs_running += stats.run.rs_running;
1154 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1155 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1156 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1157 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1158 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1159 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1160 spin_unlock(&journal->j_history_lock);
1162} 1161}
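Two movements in the commit path: the transaction is linked onto the checkpoint list earlier, while it is still not T_FINISHED (so __journal_remove_checkpoint() cannot free it underneath the committing thread), and the fold of per-commit statistics into journal->j_stats is deferred until every other lock has been dropped and j_wait_done_commit has been woken. The deferred accounting, sketched:

struct transaction_stats_s stats;	/* filled during the commit */

/* ... commit complete, j_state_lock/j_list_lock released ... */

spin_lock(&journal->j_history_lock);	/* guards only the totals */
journal->j_stats.ts_tid++;
journal->j_stats.ts_requested += stats.ts_requested;
journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
/* ... remaining run counters accumulated the same way ... */
spin_unlock(&journal->j_history_lock);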
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5fa344afb49a..67b8e303946c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug);
122#endif 122#endif
123 123
124/* Checksumming functions */ 124/* Checksumming functions */
125int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
126{ 126{
127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
128 return 1; 128 return 1;
@@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
143 return cpu_to_be32(csum); 143 return cpu_to_be32(csum);
144} 144}
145 145
146int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
147{ 147{
148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
149 return 1; 149 return 1;
@@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
151 return sb->s_checksum == jbd2_superblock_csum(j, sb); 151 return sb->s_checksum == jbd2_superblock_csum(j, sb);
152} 152}
153 153
154void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
155{ 155{
156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
157 return; 157 return;
@@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal)
302 journal->j_flags |= JBD2_UNMOUNT; 302 journal->j_flags |= JBD2_UNMOUNT;
303 303
304 while (journal->j_task) { 304 while (journal->j_task) {
305 wake_up(&journal->j_wait_commit);
306 write_unlock(&journal->j_state_lock); 305 write_unlock(&journal->j_state_lock);
306 wake_up(&journal->j_wait_commit);
307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
308 write_lock(&journal->j_state_lock); 308 write_lock(&journal->j_state_lock);
309 } 309 }
@@ -710,8 +710,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
710 while (tid_gt(tid, journal->j_commit_sequence)) { 710 while (tid_gt(tid, journal->j_commit_sequence)) {
711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", 711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
712 tid, journal->j_commit_sequence); 712 tid, journal->j_commit_sequence);
713 wake_up(&journal->j_wait_commit);
714 read_unlock(&journal->j_state_lock); 713 read_unlock(&journal->j_state_lock);
714 wake_up(&journal->j_wait_commit);
715 wait_event(journal->j_wait_done_commit, 715 wait_event(journal->j_wait_done_commit,
716 !tid_gt(tid, journal->j_commit_sequence)); 716 !tid_gt(tid, journal->j_commit_sequence));
717 read_lock(&journal->j_state_lock); 717 read_lock(&journal->j_state_lock);
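Both journal.c hunks make the same micro-optimisation: release j_state_lock before wake_up(), so a woken waiter does not immediately block on a lock the waker still holds (the waitqueue has its own internal lock, so no wakeup is lost). Sketched:

write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_commit);	/* safe after the unlock */
wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
write_lock(&journal->j_state_lock);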
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 60bb365f54a5..38cfcf5f6fce 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1073,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1073 * reused here. 1073 * reused here.
1074 */ 1074 */
1075 jbd_lock_bh_state(bh); 1075 jbd_lock_bh_state(bh);
1076 spin_lock(&journal->j_list_lock);
1077 J_ASSERT_JH(jh, (jh->b_transaction == transaction || 1076 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
1078 jh->b_transaction == NULL || 1077 jh->b_transaction == NULL ||
1079 (jh->b_transaction == journal->j_committing_transaction && 1078 (jh->b_transaction == journal->j_committing_transaction &&
@@ -1096,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1096 jh->b_modified = 0; 1095 jh->b_modified = 0;
1097 1096
1098 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 1097 JBUFFER_TRACE(jh, "file as BJ_Reserved");
1098 spin_lock(&journal->j_list_lock);
1099 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); 1099 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1100 } else if (jh->b_transaction == journal->j_committing_transaction) { 1100 } else if (jh->b_transaction == journal->j_committing_transaction) {
1101 /* first access by this transaction */ 1101 /* first access by this transaction */
1102 jh->b_modified = 0; 1102 jh->b_modified = 0;
1103 1103
1104 JBUFFER_TRACE(jh, "set next transaction"); 1104 JBUFFER_TRACE(jh, "set next transaction");
1105 spin_lock(&journal->j_list_lock);
1105 jh->b_next_transaction = transaction; 1106 jh->b_next_transaction = transaction;
1106 } 1107 }
1107 spin_unlock(&journal->j_list_lock); 1108 spin_unlock(&journal->j_list_lock);
@@ -1312,7 +1313,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1312 journal->j_running_transaction)) { 1313 journal->j_running_transaction)) {
1313 printk(KERN_ERR "JBD2: %s: " 1314 printk(KERN_ERR "JBD2: %s: "
1314 "jh->b_transaction (%llu, %p, %u) != " 1315 "jh->b_transaction (%llu, %p, %u) != "
1315 "journal->j_running_transaction (%p, %u)", 1316 "journal->j_running_transaction (%p, %u)\n",
1316 journal->j_devname, 1317 journal->j_devname,
1317 (unsigned long long) bh->b_blocknr, 1318 (unsigned long long) bh->b_blocknr,
1318 jh->b_transaction, 1319 jh->b_transaction,
@@ -1335,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1335 */ 1336 */
1336 if (jh->b_transaction != transaction) { 1337 if (jh->b_transaction != transaction) {
1337 JBUFFER_TRACE(jh, "already on other transaction"); 1338 JBUFFER_TRACE(jh, "already on other transaction");
1338 if (unlikely(jh->b_transaction != 1339 if (unlikely(((jh->b_transaction !=
1339 journal->j_committing_transaction)) { 1340 journal->j_committing_transaction)) ||
1340 printk(KERN_ERR "JBD2: %s: " 1341 (jh->b_next_transaction != transaction))) {
1341 "jh->b_transaction (%llu, %p, %u) != " 1342 printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
1342 "journal->j_committing_transaction (%p, %u)", 1343 "bad jh for block %llu: "
1344 "transaction (%p, %u), "
1345 "jh->b_transaction (%p, %u), "
1346 "jh->b_next_transaction (%p, %u), jlist %u\n",
1343 journal->j_devname, 1347 journal->j_devname,
1344 (unsigned long long) bh->b_blocknr, 1348 (unsigned long long) bh->b_blocknr,
1349 transaction, transaction->t_tid,
1345 jh->b_transaction, 1350 jh->b_transaction,
1346 jh->b_transaction ? jh->b_transaction->t_tid : 0, 1351 jh->b_transaction ?
1347 journal->j_committing_transaction, 1352 jh->b_transaction->t_tid : 0,
1348 journal->j_committing_transaction ?
1349 journal->j_committing_transaction->t_tid : 0);
1350 ret = -EINVAL;
1351 }
1352 if (unlikely(jh->b_next_transaction != transaction)) {
1353 printk(KERN_ERR "JBD2: %s: "
1354 "jh->b_next_transaction (%llu, %p, %u) != "
1355 "transaction (%p, %u)",
1356 journal->j_devname,
1357 (unsigned long long) bh->b_blocknr,
1358 jh->b_next_transaction, 1353 jh->b_next_transaction,
1359 jh->b_next_transaction ? 1354 jh->b_next_transaction ?
1360 jh->b_next_transaction->t_tid : 0, 1355 jh->b_next_transaction->t_tid : 0,
1361 transaction, transaction->t_tid); 1356 jh->b_jlist);
1357 WARN_ON(1);
1362 ret = -EINVAL; 1358 ret = -EINVAL;
1363 } 1359 }
1364 /* And this case is illegal: we can't reuse another 1360 /* And this case is illegal: we can't reuse another
@@ -1415,7 +1411,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1415 BUFFER_TRACE(bh, "entry"); 1411 BUFFER_TRACE(bh, "entry");
1416 1412
1417 jbd_lock_bh_state(bh); 1413 jbd_lock_bh_state(bh);
1418 spin_lock(&journal->j_list_lock);
1419 1414
1420 if (!buffer_jbd(bh)) 1415 if (!buffer_jbd(bh))
1421 goto not_jbd; 1416 goto not_jbd;
@@ -1468,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1468 * we know to remove the checkpoint after we commit. 1463 * we know to remove the checkpoint after we commit.
1469 */ 1464 */
1470 1465
1466 spin_lock(&journal->j_list_lock);
1471 if (jh->b_cp_transaction) { 1467 if (jh->b_cp_transaction) {
1472 __jbd2_journal_temp_unlink_buffer(jh); 1468 __jbd2_journal_temp_unlink_buffer(jh);
1473 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1469 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
@@ -1480,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1480 goto drop; 1476 goto drop;
1481 } 1477 }
1482 } 1478 }
1479 spin_unlock(&journal->j_list_lock);
1483 } else if (jh->b_transaction) { 1480 } else if (jh->b_transaction) {
1484 J_ASSERT_JH(jh, (jh->b_transaction == 1481 J_ASSERT_JH(jh, (jh->b_transaction ==
1485 journal->j_committing_transaction)); 1482 journal->j_committing_transaction));
@@ -1491,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1491 1488
1492 if (jh->b_next_transaction) { 1489 if (jh->b_next_transaction) {
1493 J_ASSERT(jh->b_next_transaction == transaction); 1490 J_ASSERT(jh->b_next_transaction == transaction);
1491 spin_lock(&journal->j_list_lock);
1494 jh->b_next_transaction = NULL; 1492 jh->b_next_transaction = NULL;
1493 spin_unlock(&journal->j_list_lock);
1495 1494
1496 /* 1495 /*
1497 * only drop a reference if this transaction modified 1496 * only drop a reference if this transaction modified
@@ -1503,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1503 } 1502 }
1504 1503
1505not_jbd: 1504not_jbd:
1506 spin_unlock(&journal->j_list_lock);
1507 jbd_unlock_bh_state(bh); 1505 jbd_unlock_bh_state(bh);
1508 __brelse(bh); 1506 __brelse(bh);
1509drop: 1507drop:
@@ -1821,11 +1819,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1821 if (buffer_locked(bh) || buffer_dirty(bh)) 1819 if (buffer_locked(bh) || buffer_dirty(bh))
1822 goto out; 1820 goto out;
1823 1821
1824 if (jh->b_next_transaction != NULL) 1822 if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
1825 goto out; 1823 goto out;
1826 1824
1827 spin_lock(&journal->j_list_lock); 1825 spin_lock(&journal->j_list_lock);
1828 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1826 if (jh->b_cp_transaction != NULL) {
1829 /* written-back checkpointed metadata buffer */ 1827 /* written-back checkpointed metadata buffer */
1830 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1828 JBUFFER_TRACE(jh, "remove from checkpoint list");
1831 __jbd2_journal_remove_checkpoint(jh); 1829 __jbd2_journal_remove_checkpoint(jh);
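The transaction.c hunks narrow j_list_lock from function-wide to just the list surgery: the checks on jh->b_transaction and jh->b_next_transaction run under the bh state lock alone, and the spinlock is taken only around __jbd2_journal_file_buffer() and friends. The resulting shape, reduced:

jbd_lock_bh_state(bh);
/* ... validate jh->b_transaction without j_list_lock ... */
spin_lock(&journal->j_list_lock);	/* only for the list update */
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);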
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index a69e426435dd..f73991522672 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -242,7 +242,7 @@ void jffs2_evict_inode (struct inode *inode)
242 242
243 jffs2_dbg(1, "%s(): ino #%lu mode %o\n", 243 jffs2_dbg(1, "%s(): ino #%lu mode %o\n",
244 __func__, inode->i_ino, inode->i_mode); 244 __func__, inode->i_ino, inode->i_mode);
245 truncate_inode_pages(&inode->i_data, 0); 245 truncate_inode_pages_final(&inode->i_data);
246 clear_inode(inode); 246 clear_inode(inode);
247 jffs2_do_clear_inode(c, f); 247 jffs2_do_clear_inode(c, f);
248} 248}
@@ -687,7 +687,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
687 struct inode *inode = OFNI_EDONI_2SFFJ(f); 687 struct inode *inode = OFNI_EDONI_2SFFJ(f);
688 struct page *pg; 688 struct page *pg;
689 689
690 pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, 690 pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
691 (void *)jffs2_do_readpage_unlock, inode); 691 (void *)jffs2_do_readpage_unlock, inode);
692 if (IS_ERR(pg)) 692 if (IS_ERR(pg))
693 return (void *)pg; 693 return (void *)pg;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0defb1cc2a35..0918f0e2e266 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
244 int err; 244 int err;
245 245
246 sync_filesystem(sb);
246 err = jffs2_parse_options(c, data); 247 err = jffs2_parse_options(c, data);
247 if (err) 248 if (err)
248 return -EINVAL; 249 return -EINVAL;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index f4aab719add5..6f8fe72c2a7a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -154,7 +154,7 @@ void jfs_evict_inode(struct inode *inode)
154 dquot_initialize(inode); 154 dquot_initialize(inode);
155 155
156 if (JFS_IP(inode)->fileset == FILESYSTEM_I) { 156 if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
157 truncate_inode_pages(&inode->i_data, 0); 157 truncate_inode_pages_final(&inode->i_data);
158 158
159 if (test_cflag(COMMIT_Freewmap, inode)) 159 if (test_cflag(COMMIT_Freewmap, inode))
160 jfs_free_zero_link(inode); 160 jfs_free_zero_link(inode);
@@ -168,7 +168,7 @@ void jfs_evict_inode(struct inode *inode)
168 dquot_free_inode(inode); 168 dquot_free_inode(inode);
169 } 169 }
170 } else { 170 } else {
171 truncate_inode_pages(&inode->i_data, 0); 171 truncate_inode_pages_final(&inode->i_data);
172 } 172 }
173 clear_inode(inode); 173 clear_inode(inode);
174 dquot_drop(inode); 174 dquot_drop(inode);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index e2b7483444fd..97f7fda51890 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -418,6 +418,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
418 int flag = JFS_SBI(sb)->flag; 418 int flag = JFS_SBI(sb)->flag;
419 int ret; 419 int ret;
420 420
421 sync_filesystem(sb);
421 if (!parse_options(data, sb, &newLVSize, &flag)) { 422 if (!parse_options(data, sb, &newLVSize, &flag)) {
422 return -EINVAL; 423 return -EINVAL;
423 } 424 }
diff --git a/fs/kernfs/Kconfig b/fs/kernfs/Kconfig
new file mode 100644
index 000000000000..397b5f7a7a16
--- /dev/null
+++ b/fs/kernfs/Kconfig
@@ -0,0 +1,7 @@
1#
2# KERNFS should be selected by its users
3#
4
5config KERNFS
6 bool
7 default n
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index bd6e18be6e1a..78f3403300af 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -8,6 +8,7 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/sched.h>
11#include <linux/fs.h> 12#include <linux/fs.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/idr.h> 14#include <linux/idr.h>
@@ -18,9 +19,162 @@
18#include "kernfs-internal.h" 19#include "kernfs-internal.h"
19 20
20DEFINE_MUTEX(kernfs_mutex); 21DEFINE_MUTEX(kernfs_mutex);
22static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
23static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
21 24
22#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) 25#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
23 26
27static bool kernfs_active(struct kernfs_node *kn)
28{
29 lockdep_assert_held(&kernfs_mutex);
30 return atomic_read(&kn->active) >= 0;
31}
32
33static bool kernfs_lockdep(struct kernfs_node *kn)
34{
35#ifdef CONFIG_DEBUG_LOCK_ALLOC
36 return kn->flags & KERNFS_LOCKDEP;
37#else
38 return false;
39#endif
40}
41
42static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
43{
44 return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
45}
46
47static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
48 size_t buflen)
49{
50 char *p = buf + buflen;
51 int len;
52
53 *--p = '\0';
54
55 do {
56 len = strlen(kn->name);
57 if (p - buf < len + 1) {
58 buf[0] = '\0';
59 p = NULL;
60 break;
61 }
62 p -= len;
63 memcpy(p, kn->name, len);
64 *--p = '/';
65 kn = kn->parent;
66 } while (kn && kn->parent);
67
68 return p;
69}
70
71/**
72 * kernfs_name - obtain the name of a given node
73 * @kn: kernfs_node of interest
74 * @buf: buffer to copy @kn's name into
75 * @buflen: size of @buf
76 *
77 * Copies the name of @kn into @buf of @buflen bytes. The behavior is
78 * similar to strlcpy(). It returns the length of @kn's name and if @buf
79 * isn't long enough, it's filled upto @buflen-1 and nul terminated.
80 *
81 * This function can be called from any context.
82 */
83int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
84{
85 unsigned long flags;
86 int ret;
87
88 spin_lock_irqsave(&kernfs_rename_lock, flags);
89 ret = kernfs_name_locked(kn, buf, buflen);
90 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
91 return ret;
92}
93
94/**
95 * kernfs_path - build full path of a given node
96 * @kn: kernfs_node of interest
97 * @buf: buffer to copy @kn's name into
98 * @buflen: size of @buf
99 *
100 * Builds and returns the full path of @kn in @buf of @buflen bytes. The
101 * path is built from the end of @buf so the returned pointer usually
102 * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated
103 * and %NULL is returned.
104 */
105char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
106{
107 unsigned long flags;
108 char *p;
109
110 spin_lock_irqsave(&kernfs_rename_lock, flags);
111 p = kernfs_path_locked(kn, buf, buflen);
112 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
113 return p;
114}
115EXPORT_SYMBOL_GPL(kernfs_path);
116
117/**
118 * pr_cont_kernfs_name - pr_cont name of a kernfs_node
119 * @kn: kernfs_node of interest
120 *
121 * This function can be called from any context.
122 */
123void pr_cont_kernfs_name(struct kernfs_node *kn)
124{
125 unsigned long flags;
126
127 spin_lock_irqsave(&kernfs_rename_lock, flags);
128
129 kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
130 pr_cont("%s", kernfs_pr_cont_buf);
131
132 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
133}
134
135/**
136 * pr_cont_kernfs_path - pr_cont path of a kernfs_node
137 * @kn: kernfs_node of interest
138 *
139 * This function can be called from any context.
140 */
141void pr_cont_kernfs_path(struct kernfs_node *kn)
142{
143 unsigned long flags;
144 char *p;
145
146 spin_lock_irqsave(&kernfs_rename_lock, flags);
147
148 p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
149 sizeof(kernfs_pr_cont_buf));
150 if (p)
151 pr_cont("%s", p);
152 else
153 pr_cont("<name too long>");
154
155 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
156}
157
158/**
159 * kernfs_get_parent - determine the parent node and pin it
160 * @kn: kernfs_node of interest
161 *
162 * Determines @kn's parent, pins and returns it. This function can be
163 * called from any context.
164 */
165struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
166{
167 struct kernfs_node *parent;
168 unsigned long flags;
169
170 spin_lock_irqsave(&kernfs_rename_lock, flags);
171 parent = kn->parent;
172 kernfs_get(parent);
173 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
174
175 return parent;
176}
177
24/** 178/**
25 * kernfs_name_hash 179 * kernfs_name_hash
26 * @name: Null terminated string to hash 180 * @name: Null terminated string to hash
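kernfs_path_locked() in the hunk above builds the path backwards from the end of the caller's buffer, walking kn->parent links, which avoids both recursion and a separate length-measuring pass; the price is that the result pointer is usually not the buffer start. A hedged usage sketch:

char buf[PATH_MAX];
char *p = kernfs_path(kn, buf, sizeof(buf));

if (p)
	pr_info("node at %s\n", p);	/* p points into buf */
else
	pr_info("path longer than %zu bytes\n", sizeof(buf));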
@@ -37,7 +191,7 @@ static unsigned int kernfs_name_hash(const char *name, const void *ns)
37 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); 191 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
38 hash &= 0x7fffffffU; 192 hash &= 0x7fffffffU;
39 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ 193 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
40 if (hash < 1) 194 if (hash < 2)
41 hash += 2; 195 hash += 2;
42 if (hash >= INT_MAX) 196 if (hash >= INT_MAX)
43 hash = INT_MAX - 1; 197 hash = INT_MAX - 1;
@@ -105,18 +259,24 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
105 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree 259 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
106 * @kn: kernfs_node of interest 260 * @kn: kernfs_node of interest
107 * 261 *
108 * Unlink @kn from its sibling rbtree which starts from 262 * Try to unlink @kn from its sibling rbtree which starts from
109 * kn->parent->dir.children. 263 * kn->parent->dir.children. Returns %true if @kn was actually
264 * removed, %false if @kn wasn't on the rbtree.
110 * 265 *
111 * Locking: 266 * Locking:
112 * mutex_lock(kernfs_mutex) 267 * mutex_lock(kernfs_mutex)
113 */ 268 */
114static void kernfs_unlink_sibling(struct kernfs_node *kn) 269static bool kernfs_unlink_sibling(struct kernfs_node *kn)
115{ 270{
271 if (RB_EMPTY_NODE(&kn->rb))
272 return false;
273
116 if (kernfs_type(kn) == KERNFS_DIR) 274 if (kernfs_type(kn) == KERNFS_DIR)
117 kn->parent->dir.subdirs--; 275 kn->parent->dir.subdirs--;
118 276
119 rb_erase(&kn->rb, &kn->parent->dir.children); 277 rb_erase(&kn->rb, &kn->parent->dir.children);
278 RB_CLEAR_NODE(&kn->rb);
279 return true;
120} 280}
121 281
122/** 282/**
@@ -137,7 +297,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
137 if (!atomic_inc_unless_negative(&kn->active)) 297 if (!atomic_inc_unless_negative(&kn->active))
138 return NULL; 298 return NULL;
139 299
140 if (kn->flags & KERNFS_LOCKDEP) 300 if (kernfs_lockdep(kn))
141 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); 301 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
142 return kn; 302 return kn;
143} 303}
@@ -151,59 +311,57 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
151 */ 311 */
152void kernfs_put_active(struct kernfs_node *kn) 312void kernfs_put_active(struct kernfs_node *kn)
153{ 313{
314 struct kernfs_root *root = kernfs_root(kn);
154 int v; 315 int v;
155 316
156 if (unlikely(!kn)) 317 if (unlikely(!kn))
157 return; 318 return;
158 319
159 if (kn->flags & KERNFS_LOCKDEP) 320 if (kernfs_lockdep(kn))
160 rwsem_release(&kn->dep_map, 1, _RET_IP_); 321 rwsem_release(&kn->dep_map, 1, _RET_IP_);
161 v = atomic_dec_return(&kn->active); 322 v = atomic_dec_return(&kn->active);
162 if (likely(v != KN_DEACTIVATED_BIAS)) 323 if (likely(v != KN_DEACTIVATED_BIAS))
163 return; 324 return;
164 325
165 /* 326 wake_up_all(&root->deactivate_waitq);
166 * atomic_dec_return() is a mb(), we'll always see the updated
167 * kn->u.completion.
168 */
169 complete(kn->u.completion);
170} 327}
171 328
172/** 329/**
173 * kernfs_deactivate - deactivate kernfs_node 330 * kernfs_drain - drain kernfs_node
174 * @kn: kernfs_node to deactivate 331 * @kn: kernfs_node to drain
175 * 332 *
 176 * Deny new active references and drain existing ones. 333 * Drain existing usages and nuke all existing mmaps of @kn. Multiple
334 * removers may invoke this function concurrently on @kn and all will
335 * return after draining is complete.
177 */ 336 */
178static void kernfs_deactivate(struct kernfs_node *kn) 337static void kernfs_drain(struct kernfs_node *kn)
338 __releases(&kernfs_mutex) __acquires(&kernfs_mutex)
179{ 339{
180 DECLARE_COMPLETION_ONSTACK(wait); 340 struct kernfs_root *root = kernfs_root(kn);
181 int v;
182 341
183 BUG_ON(!(kn->flags & KERNFS_REMOVED)); 342 lockdep_assert_held(&kernfs_mutex);
184 343 WARN_ON_ONCE(kernfs_active(kn));
185 if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF))
186 return;
187 344
188 kn->u.completion = (void *)&wait; 345 mutex_unlock(&kernfs_mutex);
189 346
190 if (kn->flags & KERNFS_LOCKDEP) 347 if (kernfs_lockdep(kn)) {
191 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); 348 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
192 /* atomic_add_return() is a mb(), put_active() will always see 349 if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
193 * the updated kn->u.completion.
194 */
195 v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active);
196
197 if (v != KN_DEACTIVATED_BIAS) {
198 if (kn->flags & KERNFS_LOCKDEP)
199 lock_contended(&kn->dep_map, _RET_IP_); 350 lock_contended(&kn->dep_map, _RET_IP_);
200 wait_for_completion(&wait);
201 } 351 }
202 352
203 if (kn->flags & KERNFS_LOCKDEP) { 353 /* but everyone should wait for draining */
354 wait_event(root->deactivate_waitq,
355 atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
356
357 if (kernfs_lockdep(kn)) {
204 lock_acquired(&kn->dep_map, _RET_IP_); 358 lock_acquired(&kn->dep_map, _RET_IP_);
205 rwsem_release(&kn->dep_map, 1, _RET_IP_); 359 rwsem_release(&kn->dep_map, 1, _RET_IP_);
206 } 360 }
361
362 kernfs_unmap_bin_file(kn);
363
364 mutex_lock(&kernfs_mutex);
207} 365}
208 366
209/** 367/**
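kernfs_drain() replaces the completion-based kernfs_deactivate(): a node is now created with its active count pre-biased to KN_DEACTIVATED_BIAS, kernfs_put_active() wakes root->deactivate_waitq when the count falls back to the bias, and any number of removers can sleep on that queue concurrently. The handshake, reduced:

/* remover side (kernfs_drain) */
wait_event(root->deactivate_waitq,
	   atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);

/* last-user side (kernfs_put_active) */
if (atomic_dec_return(&kn->active) == KN_DEACTIVATED_BIAS)
	wake_up_all(&root->deactivate_waitq);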
@@ -234,13 +392,15 @@ void kernfs_put(struct kernfs_node *kn)
234 return; 392 return;
235 root = kernfs_root(kn); 393 root = kernfs_root(kn);
236 repeat: 394 repeat:
237 /* Moving/renaming is always done while holding reference. 395 /*
396 * Moving/renaming is always done while holding reference.
238 * kn->parent won't change beneath us. 397 * kn->parent won't change beneath us.
239 */ 398 */
240 parent = kn->parent; 399 parent = kn->parent;
241 400
242 WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", 401 WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
243 parent ? parent->name : "", kn->name); 402 "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
403 parent ? parent->name : "", kn->name, atomic_read(&kn->active));
244 404
245 if (kernfs_type(kn) == KERNFS_LINK) 405 if (kernfs_type(kn) == KERNFS_LINK)
246 kernfs_put(kn->symlink.target_kn); 406 kernfs_put(kn->symlink.target_kn);
@@ -282,8 +442,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
282 kn = dentry->d_fsdata; 442 kn = dentry->d_fsdata;
283 mutex_lock(&kernfs_mutex); 443 mutex_lock(&kernfs_mutex);
284 444
285 /* The kernfs node has been deleted */ 445 /* The kernfs node has been deactivated */
286 if (kn->flags & KERNFS_REMOVED) 446 if (!kernfs_active(kn))
287 goto out_bad; 447 goto out_bad;
288 448
289 /* The kernfs node has been moved? */ 449 /* The kernfs node has been moved? */
@@ -328,6 +488,24 @@ const struct dentry_operations kernfs_dops = {
328 .d_release = kernfs_dop_release, 488 .d_release = kernfs_dop_release,
329}; 489};
330 490
491/**
492 * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
493 * @dentry: the dentry in question
494 *
495 * Return the kernfs_node associated with @dentry. If @dentry is not a
496 * kernfs one, %NULL is returned.
497 *
498 * While the returned kernfs_node will stay accessible as long as @dentry
499 * is accessible, the returned node can be in any state and the caller is
500 * fully responsible for determining what's accessible.
501 */
502struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
503{
504 if (dentry->d_sb->s_op == &kernfs_sops)
505 return dentry->d_fsdata;
506 return NULL;
507}
508
331static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, 509static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
332 const char *name, umode_t mode, 510 const char *name, umode_t mode,
333 unsigned flags) 511 unsigned flags)
@@ -352,11 +530,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
352 kn->ino = ret; 530 kn->ino = ret;
353 531
354 atomic_set(&kn->count, 1); 532 atomic_set(&kn->count, 1);
355 atomic_set(&kn->active, 0); 533 atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
534 RB_CLEAR_NODE(&kn->rb);
356 535
357 kn->name = name; 536 kn->name = name;
358 kn->mode = mode; 537 kn->mode = mode;
359 kn->flags = flags | KERNFS_REMOVED; 538 kn->flags = flags;
360 539
361 return kn; 540 return kn;
362 541
@@ -382,69 +561,44 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
382} 561}
383 562
384/** 563/**
385 * kernfs_addrm_start - prepare for kernfs_node add/remove
386 * @acxt: pointer to kernfs_addrm_cxt to be used
387 *
388 * This function is called when the caller is about to add or remove
389 * kernfs_node. This function acquires kernfs_mutex. @acxt is used
390 * to keep and pass context to other addrm functions.
391 *
392 * LOCKING:
393 * Kernel thread context (may sleep). kernfs_mutex is locked on
394 * return.
395 */
396void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt)
397 __acquires(kernfs_mutex)
398{
399 memset(acxt, 0, sizeof(*acxt));
400
401 mutex_lock(&kernfs_mutex);
402}
403
404/**
405 * kernfs_add_one - add kernfs_node to parent without warning 564 * kernfs_add_one - add kernfs_node to parent without warning
406 * @acxt: addrm context to use
407 * @kn: kernfs_node to be added 565 * @kn: kernfs_node to be added
408 * 566 *
409 * The caller must already have initialized @kn->parent. This 567 * The caller must already have initialized @kn->parent. This
410 * function increments nlink of the parent's inode if @kn is a 568 * function increments nlink of the parent's inode if @kn is a
411 * directory and link into the children list of the parent. 569 * directory and link into the children list of the parent.
412 * 570 *
413 * This function should be called between calls to
414 * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed
415 * the same @acxt as passed to kernfs_addrm_start().
416 *
417 * LOCKING:
418 * Determined by kernfs_addrm_start().
419 *
420 * RETURNS: 571 * RETURNS:
421 * 0 on success, -EEXIST if entry with the given name already 572 * 0 on success, -EEXIST if entry with the given name already
422 * exists. 573 * exists.
423 */ 574 */
424int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) 575int kernfs_add_one(struct kernfs_node *kn)
425{ 576{
426 struct kernfs_node *parent = kn->parent; 577 struct kernfs_node *parent = kn->parent;
427 bool has_ns = kernfs_ns_enabled(parent);
428 struct kernfs_iattrs *ps_iattr; 578 struct kernfs_iattrs *ps_iattr;
579 bool has_ns;
429 int ret; 580 int ret;
430 581
431 if (has_ns != (bool)kn->ns) { 582 mutex_lock(&kernfs_mutex);
432 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", 583
433 has_ns ? "required" : "invalid", parent->name, kn->name); 584 ret = -EINVAL;
434 return -EINVAL; 585 has_ns = kernfs_ns_enabled(parent);
435 } 586 if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
587 has_ns ? "required" : "invalid", parent->name, kn->name))
588 goto out_unlock;
436 589
437 if (kernfs_type(parent) != KERNFS_DIR) 590 if (kernfs_type(parent) != KERNFS_DIR)
438 return -EINVAL; 591 goto out_unlock;
439 592
440 if (parent->flags & KERNFS_REMOVED) 593 ret = -ENOENT;
441 return -ENOENT; 594 if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
595 goto out_unlock;
442 596
443 kn->hash = kernfs_name_hash(kn->name, kn->ns); 597 kn->hash = kernfs_name_hash(kn->name, kn->ns);
444 598
445 ret = kernfs_link_sibling(kn); 599 ret = kernfs_link_sibling(kn);
446 if (ret) 600 if (ret)
447 return ret; 601 goto out_unlock;
448 602
449 /* Update timestamps on the parent */ 603 /* Update timestamps on the parent */
450 ps_iattr = parent->iattr; 604 ps_iattr = parent->iattr;
@@ -453,82 +607,22 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn)
453 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; 607 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
454 } 608 }
455 609
456 /* Mark the entry added into directory tree */ 610 mutex_unlock(&kernfs_mutex);
457 kn->flags &= ~KERNFS_REMOVED;
458
459 return 0;
460}
461
462/**
463 * kernfs_remove_one - remove kernfs_node from parent
464 * @acxt: addrm context to use
465 * @kn: kernfs_node to be removed
466 *
467 * Mark @kn removed and drop nlink of parent inode if @kn is a
468 * directory. @kn is unlinked from the children list.
469 *
470 * This function should be called between calls to
471 * kernfs_addrm_start() and kernfs_addrm_finish() and should be
472 * passed the same @acxt as passed to kernfs_addrm_start().
473 *
474 * LOCKING:
475 * Determined by kernfs_addrm_start().
476 */
477static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
478 struct kernfs_node *kn)
479{
480 struct kernfs_iattrs *ps_iattr;
481 611
482 /* 612 /*
483 * Removal can be called multiple times on the same node. Only the 613 * Activate the new node unless CREATE_DEACTIVATED is requested.
484 * first invocation is effective and puts the base ref. 614 * If not activated here, the kernfs user is responsible for
615 * activating the node with kernfs_activate(). A node which hasn't
616 * been activated is not visible to userland and its removal won't
617 * trigger deactivation.
485 */ 618 */
486 if (kn->flags & KERNFS_REMOVED) 619 if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
487 return; 620 kernfs_activate(kn);
488 621 return 0;
489 if (kn->parent) {
490 kernfs_unlink_sibling(kn);
491
492 /* Update timestamps on the parent */
493 ps_iattr = kn->parent->iattr;
494 if (ps_iattr) {
495 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
496 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
497 }
498 }
499
500 kn->flags |= KERNFS_REMOVED;
501 kn->u.removed_list = acxt->removed;
502 acxt->removed = kn;
503}
504 622
505/** 623out_unlock:
506 * kernfs_addrm_finish - finish up kernfs_node add/remove
507 * @acxt: addrm context to finish up
508 *
509 * Finish up kernfs_node add/remove. Resources acquired by
510 * kernfs_addrm_start() are released and removed kernfs_nodes are
511 * cleaned up.
512 *
513 * LOCKING:
514 * kernfs_mutex is released.
515 */
516void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
517 __releases(kernfs_mutex)
518{
519 /* release resources acquired by kernfs_addrm_start() */
520 mutex_unlock(&kernfs_mutex); 624 mutex_unlock(&kernfs_mutex);
521 625 return ret;
522 /* kill removed kernfs_nodes */
523 while (acxt->removed) {
524 struct kernfs_node *kn = acxt->removed;
525
526 acxt->removed = kn->u.removed_list;
527
528 kernfs_deactivate(kn);
529 kernfs_unmap_bin_file(kn);
530 kernfs_put(kn);
531 }
532} 626}
533 627
534/** 628/**
@@ -599,13 +693,15 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
599 693
600/** 694/**
601 * kernfs_create_root - create a new kernfs hierarchy 695 * kernfs_create_root - create a new kernfs hierarchy
602 * @kdops: optional directory syscall operations for the hierarchy 696 * @scops: optional syscall operations for the hierarchy
697 * @flags: KERNFS_ROOT_* flags
603 * @priv: opaque data associated with the new directory 698 * @priv: opaque data associated with the new directory
604 * 699 *
605 * Returns the root of the new hierarchy on success, ERR_PTR() value on 700 * Returns the root of the new hierarchy on success, ERR_PTR() value on
606 * failure. 701 * failure.
607 */ 702 */
608struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) 703struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
704 unsigned int flags, void *priv)
609{ 705{
610 struct kernfs_root *root; 706 struct kernfs_root *root;
611 struct kernfs_node *kn; 707 struct kernfs_node *kn;
@@ -624,12 +720,16 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv)
624 return ERR_PTR(-ENOMEM); 720 return ERR_PTR(-ENOMEM);
625 } 721 }
626 722
627 kn->flags &= ~KERNFS_REMOVED;
628 kn->priv = priv; 723 kn->priv = priv;
629 kn->dir.root = root; 724 kn->dir.root = root;
630 725
631 root->dir_ops = kdops; 726 root->syscall_ops = scops;
727 root->flags = flags;
632 root->kn = kn; 728 root->kn = kn;
729 init_waitqueue_head(&root->deactivate_waitq);
730
731 if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
732 kernfs_activate(kn);
633 733
634 return root; 734 return root;
635} 735}
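
A minimal sketch of how a caller might use the new KERNFS_ROOT_CREATE_DEACTIVATED flag to build a subtree atomically; the directory name and error handling below are illustrative assumptions, not part of this patch:

	struct kernfs_root *root;
	struct kernfs_node *dir;

	root = kernfs_create_root(NULL, KERNFS_ROOT_CREATE_DEACTIVATED, NULL);
	if (IS_ERR(root))
		return PTR_ERR(root);

	dir = kernfs_create_dir_ns(root->kn, "example", 0755, NULL, NULL);
	if (IS_ERR(dir)) {
		kernfs_destroy_root(root);
		return PTR_ERR(dir);
	}

	/* nothing under @root is visible to userland until this point */
	kernfs_activate(root->kn);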
@@ -660,7 +760,6 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
660 const char *name, umode_t mode, 760 const char *name, umode_t mode,
661 void *priv, const void *ns) 761 void *priv, const void *ns)
662{ 762{
663 struct kernfs_addrm_cxt acxt;
664 struct kernfs_node *kn; 763 struct kernfs_node *kn;
665 int rc; 764 int rc;
666 765
@@ -674,10 +773,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
674 kn->priv = priv; 773 kn->priv = priv;
675 774
676 /* link in */ 775 /* link in */
677 kernfs_addrm_start(&acxt); 776 rc = kernfs_add_one(kn);
678 rc = kernfs_add_one(&acxt, kn);
679 kernfs_addrm_finish(&acxt);
680
681 if (!rc) 777 if (!rc)
682 return kn; 778 return kn;
683 779
@@ -703,7 +799,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
703 kn = kernfs_find_ns(parent, dentry->d_name.name, ns); 799 kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
704 800
705 /* no such entry */ 801 /* no such entry */
706 if (!kn) { 802 if (!kn || !kernfs_active(kn)) {
707 ret = NULL; 803 ret = NULL;
708 goto out_unlock; 804 goto out_unlock;
709 } 805 }
@@ -728,23 +824,37 @@ static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
728 umode_t mode) 824 umode_t mode)
729{ 825{
730 struct kernfs_node *parent = dir->i_private; 826 struct kernfs_node *parent = dir->i_private;
731 struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops; 827 struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
828 int ret;
732 829
733 if (!kdops || !kdops->mkdir) 830 if (!scops || !scops->mkdir)
734 return -EPERM; 831 return -EPERM;
735 832
736 return kdops->mkdir(parent, dentry->d_name.name, mode); 833 if (!kernfs_get_active(parent))
834 return -ENODEV;
835
836 ret = scops->mkdir(parent, dentry->d_name.name, mode);
837
838 kernfs_put_active(parent);
839 return ret;
737} 840}
738 841
739static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) 842static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
740{ 843{
741 struct kernfs_node *kn = dentry->d_fsdata; 844 struct kernfs_node *kn = dentry->d_fsdata;
742 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; 845 struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
846 int ret;
743 847
744 if (!kdops || !kdops->rmdir) 848 if (!scops || !scops->rmdir)
745 return -EPERM; 849 return -EPERM;
746 850
747 return kdops->rmdir(kn); 851 if (!kernfs_get_active(kn))
852 return -ENODEV;
853
854 ret = scops->rmdir(kn);
855
856 kernfs_put_active(kn);
857 return ret;
748} 858}
749 859
750static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, 860static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -752,12 +862,25 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
752{ 862{
753 struct kernfs_node *kn = old_dentry->d_fsdata; 863 struct kernfs_node *kn = old_dentry->d_fsdata;
754 struct kernfs_node *new_parent = new_dir->i_private; 864 struct kernfs_node *new_parent = new_dir->i_private;
755 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; 865 struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
866 int ret;
756 867
757 if (!kdops || !kdops->rename) 868 if (!scops || !scops->rename)
758 return -EPERM; 869 return -EPERM;
759 870
760 return kdops->rename(kn, new_parent, new_dentry->d_name.name); 871 if (!kernfs_get_active(kn))
872 return -ENODEV;
873
874 if (!kernfs_get_active(new_parent)) {
875 kernfs_put_active(kn);
876 return -ENODEV;
877 }
878
879 ret = scops->rename(kn, new_parent, new_dentry->d_name.name);
880
881 kernfs_put_active(new_parent);
882 kernfs_put_active(kn);
883 return ret;
761} 884}
762 885
763const struct inode_operations kernfs_dir_iops = { 886const struct inode_operations kernfs_dir_iops = {
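
A minimal kernfs_syscall_ops sketch showing the handler signature the mkdir wrapper above invokes under an active reference; the handler body is a hypothetical example, not taken from this patch:

	static int demo_mkdir(struct kernfs_node *parent, const char *name,
			      umode_t mode)
	{
		struct kernfs_node *kn;

		kn = kernfs_create_dir_ns(parent, name, mode, NULL, NULL);
		return IS_ERR(kn) ? PTR_ERR(kn) : 0;
	}

	static struct kernfs_syscall_ops demo_scops = {
		.mkdir	= demo_mkdir,
	};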
@@ -830,23 +953,104 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
830 return pos->parent; 953 return pos->parent;
831} 954}
832 955
833static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, 956/**
834 struct kernfs_node *kn) 957 * kernfs_activate - activate a node which started deactivated
958 * @kn: kernfs_node whose subtree is to be activated
959 *
960 * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
961 * needs to be explicitly activated. A node which hasn't been activated
962 * isn't visible to userland and deactivation is skipped during its
963 * removal. This is useful to construct atomic init sequences where
964 * creation of multiple nodes should either succeed or fail atomically.
965 *
966 * The caller is responsible for ensuring that this function is not called
967 * after kernfs_remove*() is invoked on @kn.
968 */
969void kernfs_activate(struct kernfs_node *kn)
835{ 970{
836 struct kernfs_node *pos, *next; 971 struct kernfs_node *pos;
837 972
838 if (!kn) 973 mutex_lock(&kernfs_mutex);
974
975 pos = NULL;
976 while ((pos = kernfs_next_descendant_post(pos, kn))) {
977 if (!pos || (pos->flags & KERNFS_ACTIVATED))
978 continue;
979
980 WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
981 WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
982
983 atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
984 pos->flags |= KERNFS_ACTIVATED;
985 }
986
987 mutex_unlock(&kernfs_mutex);
988}
989
990static void __kernfs_remove(struct kernfs_node *kn)
991{
992 struct kernfs_node *pos;
993
994 lockdep_assert_held(&kernfs_mutex);
995
996 /*
997 * Short-circuit if non-root @kn has already finished removal.
998 * This is for kernfs_remove_self() which plays with active ref
999 * after removal.
1000 */
1001 if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
839 return; 1002 return;
840 1003
841 pr_debug("kernfs %s: removing\n", kn->name); 1004 pr_debug("kernfs %s: removing\n", kn->name);
842 1005
843 next = NULL; 1006 /* prevent any new usage under @kn by deactivating all nodes */
1007 pos = NULL;
1008 while ((pos = kernfs_next_descendant_post(pos, kn)))
1009 if (kernfs_active(pos))
1010 atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
1011
1012 /* deactivate and unlink the subtree node-by-node */
844 do { 1013 do {
845 pos = next; 1014 pos = kernfs_leftmost_descendant(kn);
846 next = kernfs_next_descendant_post(pos, kn); 1015
847 if (pos) 1016 /*
848 kernfs_remove_one(acxt, pos); 1017 * kernfs_drain() drops kernfs_mutex temporarily and @pos's
849 } while (next); 1018 * base ref could have been put by someone else by the time
1019 * the function returns. Make sure it doesn't go away
1020 * underneath us.
1021 */
1022 kernfs_get(pos);
1023
1024 /*
1025 * Drain iff @kn was activated. This avoids draining and
1026 * its lockdep annotations for nodes which have never been
1027 * activated and allows embedding kernfs_remove() in create
1028 * error paths without worrying about draining.
1029 */
1030 if (kn->flags & KERNFS_ACTIVATED)
1031 kernfs_drain(pos);
1032 else
1033 WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
1034
1035 /*
1036 * kernfs_unlink_sibling() succeeds once per node. Use it
1037 * to decide who's responsible for cleanups.
1038 */
1039 if (!pos->parent || kernfs_unlink_sibling(pos)) {
1040 struct kernfs_iattrs *ps_iattr =
1041 pos->parent ? pos->parent->iattr : NULL;
1042
1043 /* update timestamps on the parent */
1044 if (ps_iattr) {
1045 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
1046 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
1047 }
1048
1049 kernfs_put(pos);
1050 }
1051
1052 kernfs_put(pos);
1053 } while (pos != kn);
850} 1054}
851 1055
852/** 1056/**
@@ -857,11 +1061,140 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
857 */ 1061 */
858void kernfs_remove(struct kernfs_node *kn) 1062void kernfs_remove(struct kernfs_node *kn)
859{ 1063{
860 struct kernfs_addrm_cxt acxt; 1064 mutex_lock(&kernfs_mutex);
1065 __kernfs_remove(kn);
1066 mutex_unlock(&kernfs_mutex);
1067}
861 1068
862 kernfs_addrm_start(&acxt); 1069/**
863 __kernfs_remove(&acxt, kn); 1070 * kernfs_break_active_protection - break out of active protection
864 kernfs_addrm_finish(&acxt); 1071 * @kn: the self kernfs_node
1072 *
1073 * The caller must be running off of a kernfs operation which is invoked
1074 * with an active reference - e.g. one of kernfs_ops. Each invocation of
1075 * this function must also be matched with an invocation of
1076 * kernfs_unbreak_active_protection().
1077 *
1078 * This function releases the active reference of @kn the caller is
1079 * holding. Once this function is called, @kn may be removed at any point
1080 * and the caller is solely responsible for ensuring that the objects it
1081 * dereferences are accessible.
1082 */
1083void kernfs_break_active_protection(struct kernfs_node *kn)
1084{
1085 /*
1086 * Take ourself out of the active ref dependency chain. If
1087 * we're called without an active ref, lockdep will complain.
1088 */
1089 kernfs_put_active(kn);
1090}
1091
1092/**
1093 * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
1094 * @kn: the self kernfs_node
1095 *
1096 * If kernfs_break_active_protection() was called, this function must be
1097 * invoked before finishing the kernfs operation. Note that while this
1098 * function restores the active reference, it doesn't and can't actually
1099 * restore the active protection - @kn may already be removed or in the process of
1100 * being removed. Once kernfs_break_active_protection() is invoked, that
1101 * protection is irreversibly gone for the kernfs operation instance.
1102 *
1103 * While this function may be called at any point after
1104 * kernfs_break_active_protection() is invoked, its most useful location
1105 * would be right before the enclosing kernfs operation returns.
1106 */
1107void kernfs_unbreak_active_protection(struct kernfs_node *kn)
1108{
1109 /*
1110 * @kn->active could be in any state; however, the increment we do
1111 * here will be undone as soon as the enclosing kernfs operation
1112 * finishes and this temporary bump can't break anything. If @kn
1113 * is alive, nothing changes. If @kn is being deactivated, the
1114 * soon-to-follow put will either finish deactivation or restore
1115 * deactivated state. If @kn is already removed, the temporary
1116 * bump is guaranteed to be gone before @kn is released.
1117 */
1118 atomic_inc(&kn->active);
1119 if (kernfs_lockdep(kn))
1120 rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
1121}
1122
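
An illustrative use of the pair above (the mutex and handler here are hypothetical): drop active protection before taking a lock that may also be held around removal of this node, so the two cannot deadlock:

	static DEFINE_MUTEX(demo_mutex);

	static ssize_t demo_write(struct kernfs_open_file *of, char *buf,
				  size_t bytes, loff_t off)
	{
		kernfs_break_active_protection(of->kn);
		mutex_lock(&demo_mutex);  /* may be held across kernfs_remove() */
		/* of->kn may be removed at any point from here on */
		mutex_unlock(&demo_mutex);
		kernfs_unbreak_active_protection(of->kn);
		return bytes;
	}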
1123/**
1124 * kernfs_remove_self - remove a kernfs_node from its own method
1125 * @kn: the self kernfs_node to remove
1126 *
1127 * The caller must be running off of a kernfs operation which is invoked
1128 * with an active reference - e.g. one of kernfs_ops. This can be used to
1129 * implement a file operation which deletes itself.
1130 *
1131 * For example, the "delete" file for a sysfs device directory can be
1132 * implemented by invoking kernfs_remove_self() on the "delete" file
1133 * itself. This function breaks the circular dependency of trying to
1134 * deactivate self while holding an active ref itself. It isn't necessary
1135 * to modify the usual removal path to use kernfs_remove_self(). The
1136 * "delete" implementation can simply invoke kernfs_remove_self() on self
1137 * before proceeding with the usual removal path. kernfs will ignore later
1138 * kernfs_remove() on self.
1139 *
1140 * kernfs_remove_self() can be called multiple times concurrently on the
1141 * same kernfs_node. Only the first one actually performs removal and
1142 * returns %true. All others will wait until the kernfs operation which
1143 * won self-removal finishes and return %false. Note that the losers wait
1144 * for the completion of not only the winning kernfs_remove_self() but also
1145 * the whole kernfs_ops which won the arbitration. This can be used to
1146 * guarantee, for example, that all concurrent writes to a "delete" file
1147 * finish only after the whole operation is complete.
1148 */
1149bool kernfs_remove_self(struct kernfs_node *kn)
1150{
1151 bool ret;
1152
1153 mutex_lock(&kernfs_mutex);
1154 kernfs_break_active_protection(kn);
1155
1156 /*
1157 * SUICIDAL is used to arbitrate among competing invocations. Only
1158 * the first one will actually perform removal. When the removal
1159 * is complete, SUICIDED is set and the active ref is restored
1160 * while holding kernfs_mutex. The ones which lost arbitration
1161 * wait for SUICIDED && drained, which can happen only after the
1162 * enclosing kernfs operation which executed the winning instance
1163 * of kernfs_remove_self() finished.
1164 */
1165 if (!(kn->flags & KERNFS_SUICIDAL)) {
1166 kn->flags |= KERNFS_SUICIDAL;
1167 __kernfs_remove(kn);
1168 kn->flags |= KERNFS_SUICIDED;
1169 ret = true;
1170 } else {
1171 wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
1172 DEFINE_WAIT(wait);
1173
1174 while (true) {
1175 prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);
1176
1177 if ((kn->flags & KERNFS_SUICIDED) &&
1178 atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
1179 break;
1180
1181 mutex_unlock(&kernfs_mutex);
1182 schedule();
1183 mutex_lock(&kernfs_mutex);
1184 }
1185 finish_wait(waitq, &wait);
1186 WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
1187 ret = false;
1188 }
1189
1190 /*
1191 * This must be done while holding kernfs_mutex; otherwise, waiting
1192 * for SUICIDED && deactivated could finish prematurely.
1193 */
1194 kernfs_unbreak_active_protection(kn);
1195
1196 mutex_unlock(&kernfs_mutex);
1197 return ret;
865} 1198}
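
A sketch of the "delete" file pattern described above; the handler name and teardown are hypothetical placeholders:

	static ssize_t delete_write(struct kernfs_open_file *of, char *buf,
				    size_t bytes, loff_t off)
	{
		/* only the invocation that wins the arbitration proceeds */
		if (!kernfs_remove_self(of->kn))
			return -ENODEV;

		/* tear down the object backing this node here */
		return bytes;
	}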
866 1199
867/** 1200/**
@@ -876,7 +1209,6 @@ void kernfs_remove(struct kernfs_node *kn)
876int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, 1209int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
877 const void *ns) 1210 const void *ns)
878{ 1211{
879 struct kernfs_addrm_cxt acxt;
880 struct kernfs_node *kn; 1212 struct kernfs_node *kn;
881 1213
882 if (!parent) { 1214 if (!parent) {
@@ -885,13 +1217,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
885 return -ENOENT; 1217 return -ENOENT;
886 } 1218 }
887 1219
888 kernfs_addrm_start(&acxt); 1220 mutex_lock(&kernfs_mutex);
889 1221
890 kn = kernfs_find_ns(parent, name, ns); 1222 kn = kernfs_find_ns(parent, name, ns);
891 if (kn) 1223 if (kn)
892 __kernfs_remove(&acxt, kn); 1224 __kernfs_remove(kn);
893 1225
894 kernfs_addrm_finish(&acxt); 1226 mutex_unlock(&kernfs_mutex);
895 1227
896 if (kn) 1228 if (kn)
897 return 0; 1229 return 0;
@@ -909,12 +1241,18 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
909int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, 1241int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
910 const char *new_name, const void *new_ns) 1242 const char *new_name, const void *new_ns)
911{ 1243{
1244 struct kernfs_node *old_parent;
1245 const char *old_name = NULL;
912 int error; 1246 int error;
913 1247
1248 /* can't move or rename root */
1249 if (!kn->parent)
1250 return -EINVAL;
1251
914 mutex_lock(&kernfs_mutex); 1252 mutex_lock(&kernfs_mutex);
915 1253
916 error = -ENOENT; 1254 error = -ENOENT;
917 if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) 1255 if (!kernfs_active(kn) || !kernfs_active(new_parent))
918 goto out; 1256 goto out;
919 1257
920 error = 0; 1258 error = 0;
@@ -932,13 +1270,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
932 new_name = kstrdup(new_name, GFP_KERNEL); 1270 new_name = kstrdup(new_name, GFP_KERNEL);
933 if (!new_name) 1271 if (!new_name)
934 goto out; 1272 goto out;
935 1273 } else {
936 if (kn->flags & KERNFS_STATIC_NAME) 1274 new_name = NULL;
937 kn->flags &= ~KERNFS_STATIC_NAME;
938 else
939 kfree(kn->name);
940
941 kn->name = new_name;
942 } 1275 }
943 1276
944 /* 1277 /*
@@ -946,12 +1279,29 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
946 */ 1279 */
947 kernfs_unlink_sibling(kn); 1280 kernfs_unlink_sibling(kn);
948 kernfs_get(new_parent); 1281 kernfs_get(new_parent);
949 kernfs_put(kn->parent); 1282
1283 /* rename_lock protects ->parent and ->name accessors */
1284 spin_lock_irq(&kernfs_rename_lock);
1285
1286 old_parent = kn->parent;
1287 kn->parent = new_parent;
1288
950 kn->ns = new_ns; 1289 kn->ns = new_ns;
1290 if (new_name) {
1291 if (!(kn->flags & KERNFS_STATIC_NAME))
1292 old_name = kn->name;
1293 kn->flags &= ~KERNFS_STATIC_NAME;
1294 kn->name = new_name;
1295 }
1296
1297 spin_unlock_irq(&kernfs_rename_lock);
1298
951 kn->hash = kernfs_name_hash(kn->name, kn->ns); 1299 kn->hash = kernfs_name_hash(kn->name, kn->ns);
952 kn->parent = new_parent;
953 kernfs_link_sibling(kn); 1300 kernfs_link_sibling(kn);
954 1301
1302 kernfs_put(old_parent);
1303 kfree(old_name);
1304
955 error = 0; 1305 error = 0;
956 out: 1306 out:
957 mutex_unlock(&kernfs_mutex); 1307 mutex_unlock(&kernfs_mutex);
@@ -974,7 +1324,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
974 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) 1324 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
975{ 1325{
976 if (pos) { 1326 if (pos) {
977 int valid = !(pos->flags & KERNFS_REMOVED) && 1327 int valid = kernfs_active(pos) &&
978 pos->parent == parent && hash == pos->hash; 1328 pos->parent == parent && hash == pos->hash;
979 kernfs_put(pos); 1329 kernfs_put(pos);
980 if (!valid) 1330 if (!valid)
@@ -993,8 +1343,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
993 break; 1343 break;
994 } 1344 }
995 } 1345 }
996 /* Skip over entries in the wrong namespace */ 1346 /* Skip over entries which are dying/dead or in the wrong namespace */
997 while (pos && pos->ns != ns) { 1347 while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
998 struct rb_node *node = rb_next(&pos->rb); 1348 struct rb_node *node = rb_next(&pos->rb);
999 if (!node) 1349 if (!node)
1000 pos = NULL; 1350 pos = NULL;
@@ -1008,14 +1358,15 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
1008 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) 1358 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
1009{ 1359{
1010 pos = kernfs_dir_pos(ns, parent, ino, pos); 1360 pos = kernfs_dir_pos(ns, parent, ino, pos);
1011 if (pos) 1361 if (pos) {
1012 do { 1362 do {
1013 struct rb_node *node = rb_next(&pos->rb); 1363 struct rb_node *node = rb_next(&pos->rb);
1014 if (!node) 1364 if (!node)
1015 pos = NULL; 1365 pos = NULL;
1016 else 1366 else
1017 pos = rb_to_kn(node); 1367 pos = rb_to_kn(node);
1018 } while (pos && pos->ns != ns); 1368 } while (pos && (!kernfs_active(pos) || pos->ns != ns));
1369 }
1019 return pos; 1370 return pos;
1020} 1371}
1021 1372
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index dbf397bfdff2..8034706a7af8 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -252,10 +252,18 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
252 size_t count, loff_t *ppos) 252 size_t count, loff_t *ppos)
253{ 253{
254 struct kernfs_open_file *of = kernfs_of(file); 254 struct kernfs_open_file *of = kernfs_of(file);
255 ssize_t len = min_t(size_t, count, PAGE_SIZE);
256 const struct kernfs_ops *ops; 255 const struct kernfs_ops *ops;
256 size_t len;
257 char *buf; 257 char *buf;
258 258
259 if (of->atomic_write_len) {
260 len = count;
261 if (len > of->atomic_write_len)
262 return -E2BIG;
263 } else {
264 len = min_t(size_t, count, PAGE_SIZE);
265 }
266
259 buf = kmalloc(len + 1, GFP_KERNEL); 267 buf = kmalloc(len + 1, GFP_KERNEL);
260 if (!buf) 268 if (!buf)
261 return -ENOMEM; 269 return -ENOMEM;
@@ -653,6 +661,12 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
653 of->file = file; 661 of->file = file;
654 662
655 /* 663 /*
664 * Write path needs to access atomic_write_len outside active reference.
665 * Cache it in open_file. See kernfs_fop_write() for details.
666 */
667 of->atomic_write_len = ops->atomic_write_len;
668
669 /*
656 * Always instantiate seq_file even if read access doesn't use 670 * Always instantiate seq_file even if read access doesn't use
657 * seq_file or is not requested. This unifies private data access 671 * seq_file or is not requested. This unifies private data access
658 * and readable regular files are the vast majority anyway. 672 * and readable regular files are the vast majority anyway.
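
A sketch of a kernfs_ops instance using the new ->atomic_write_len; with it set, kernfs_fop_write() rejects larger writes with -E2BIG, so the handler always sees a complete buffer (the names here are assumptions):

	static ssize_t cmd_write(struct kernfs_open_file *of, char *buf,
				 size_t bytes, loff_t off)
	{
		/* @buf holds the entire write, never a partial chunk */
		return bytes;
	}

	static const struct kernfs_ops cmd_ops = {
		.atomic_write_len	= PAGE_SIZE,
		.write			= cmd_write,
	};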
@@ -820,7 +834,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
820 bool name_is_static, 834 bool name_is_static,
821 struct lock_class_key *key) 835 struct lock_class_key *key)
822{ 836{
823 struct kernfs_addrm_cxt acxt;
824 struct kernfs_node *kn; 837 struct kernfs_node *kn;
825 unsigned flags; 838 unsigned flags;
826 int rc; 839 int rc;
@@ -855,10 +868,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
855 if (ops->mmap) 868 if (ops->mmap)
856 kn->flags |= KERNFS_HAS_MMAP; 869 kn->flags |= KERNFS_HAS_MMAP;
857 870
858 kernfs_addrm_start(&acxt); 871 rc = kernfs_add_one(kn);
859 rc = kernfs_add_one(&acxt, kn);
860 kernfs_addrm_finish(&acxt);
861
862 if (rc) { 872 if (rc) {
863 kernfs_put(kn); 873 kernfs_put(kn);
864 return ERR_PTR(rc); 874 return ERR_PTR(rc);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index e55126f85bd2..abb0f1f53d93 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -355,7 +355,7 @@ void kernfs_evict_inode(struct inode *inode)
355{ 355{
356 struct kernfs_node *kn = inode->i_private; 356 struct kernfs_node *kn = inode->i_private;
357 357
358 truncate_inode_pages(&inode->i_data, 0); 358 truncate_inode_pages_final(&inode->i_data);
359 clear_inode(inode); 359 clear_inode(inode);
360 kernfs_put(kn); 360 kernfs_put(kn);
361} 361}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index eb536b76374a..8be13b2a079b 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -26,7 +26,8 @@ struct kernfs_iattrs {
26 struct simple_xattrs xattrs; 26 struct simple_xattrs xattrs;
27}; 27};
28 28
29#define KN_DEACTIVATED_BIAS INT_MIN 29/* +1 to avoid triggering overflow warning when negating it */
30#define KN_DEACTIVATED_BIAS (INT_MIN + 1)
30 31
31/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ 32/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
32 33
@@ -45,13 +46,6 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
45} 46}
46 47
47/* 48/*
48 * Context structure to be used while adding/removing nodes.
49 */
50struct kernfs_addrm_cxt {
51 struct kernfs_node *removed;
52};
53
54/*
55 * mount.c 49 * mount.c
56 */ 50 */
57struct kernfs_super_info { 51struct kernfs_super_info {
@@ -71,6 +65,7 @@ struct kernfs_super_info {
71}; 65};
72#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) 66#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
73 67
68extern const struct super_operations kernfs_sops;
74extern struct kmem_cache *kernfs_node_cache; 69extern struct kmem_cache *kernfs_node_cache;
75 70
76/* 71/*
@@ -100,9 +95,7 @@ extern const struct inode_operations kernfs_dir_iops;
100 95
101struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); 96struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
102void kernfs_put_active(struct kernfs_node *kn); 97void kernfs_put_active(struct kernfs_node *kn);
103void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); 98int kernfs_add_one(struct kernfs_node *kn);
104int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn);
105void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt);
106struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, 99struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
107 const char *name, umode_t mode, 100 const char *name, umode_t mode,
108 unsigned flags); 101 unsigned flags);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 0f4152defe7b..6a5f04ac8704 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -19,12 +19,49 @@
19 19
20struct kmem_cache *kernfs_node_cache; 20struct kmem_cache *kernfs_node_cache;
21 21
22static const struct super_operations kernfs_sops = { 22static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
23{
24 struct kernfs_root *root = kernfs_info(sb)->root;
25 struct kernfs_syscall_ops *scops = root->syscall_ops;
26
27 if (scops && scops->remount_fs)
28 return scops->remount_fs(root, flags, data);
29 return 0;
30}
31
32static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
33{
34 struct kernfs_root *root = kernfs_root(dentry->d_fsdata);
35 struct kernfs_syscall_ops *scops = root->syscall_ops;
36
37 if (scops && scops->show_options)
38 return scops->show_options(sf, root);
39 return 0;
40}
41
42const struct super_operations kernfs_sops = {
23 .statfs = simple_statfs, 43 .statfs = simple_statfs,
24 .drop_inode = generic_delete_inode, 44 .drop_inode = generic_delete_inode,
25 .evict_inode = kernfs_evict_inode, 45 .evict_inode = kernfs_evict_inode,
46
47 .remount_fs = kernfs_sop_remount_fs,
48 .show_options = kernfs_sop_show_options,
26}; 49};
27 50
51/**
52 * kernfs_root_from_sb - determine kernfs_root associated with a super_block
53 * @sb: the super_block in question
54 *
55 * Return the kernfs_root associated with @sb. If @sb is not a kernfs one,
56 * %NULL is returned.
57 */
58struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
59{
60 if (sb->s_op == &kernfs_sops)
61 return kernfs_info(sb)->root;
62 return NULL;
63}
64
28static int kernfs_fill_super(struct super_block *sb) 65static int kernfs_fill_super(struct super_block *sb)
29{ 66{
30 struct kernfs_super_info *info = kernfs_info(sb); 67 struct kernfs_super_info *info = kernfs_info(sb);
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 4d457055acb9..8a198898e39a 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -27,7 +27,6 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
27 struct kernfs_node *target) 27 struct kernfs_node *target)
28{ 28{
29 struct kernfs_node *kn; 29 struct kernfs_node *kn;
30 struct kernfs_addrm_cxt acxt;
31 int error; 30 int error;
32 31
33 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); 32 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
@@ -39,10 +38,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
39 kn->symlink.target_kn = target; 38 kn->symlink.target_kn = target;
40 kernfs_get(target); /* ref owned by symlink */ 39 kernfs_get(target); /* ref owned by symlink */
41 40
42 kernfs_addrm_start(&acxt); 41 error = kernfs_add_one(kn);
43 error = kernfs_add_one(&acxt, kn);
44 kernfs_addrm_finish(&acxt);
45
46 if (!error) 42 if (!error)
47 return kn; 43 return kn;
48 44
diff --git a/fs/locks.c b/fs/locks.c
index 92a0f0a52b06..13fc7a6d380a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -135,6 +135,7 @@
135#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 135#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
136#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 136#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
137#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) 137#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG))
138#define IS_FILE_PVT(fl) (fl->fl_flags & FL_FILE_PVT)
138 139
139static bool lease_breaking(struct file_lock *fl) 140static bool lease_breaking(struct file_lock *fl)
140{ 141{
@@ -344,48 +345,43 @@ static int assign_type(struct file_lock *fl, long type)
344 return 0; 345 return 0;
345} 346}
346 347
347/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX 348static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
348 * style lock. 349 struct flock64 *l)
349 */
350static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
351 struct flock *l)
352{ 350{
353 off_t start, end;
354
355 switch (l->l_whence) { 351 switch (l->l_whence) {
356 case SEEK_SET: 352 case SEEK_SET:
357 start = 0; 353 fl->fl_start = 0;
358 break; 354 break;
359 case SEEK_CUR: 355 case SEEK_CUR:
360 start = filp->f_pos; 356 fl->fl_start = filp->f_pos;
361 break; 357 break;
362 case SEEK_END: 358 case SEEK_END:
363 start = i_size_read(file_inode(filp)); 359 fl->fl_start = i_size_read(file_inode(filp));
364 break; 360 break;
365 default: 361 default:
366 return -EINVAL; 362 return -EINVAL;
367 } 363 }
364 if (l->l_start > OFFSET_MAX - fl->fl_start)
365 return -EOVERFLOW;
366 fl->fl_start += l->l_start;
367 if (fl->fl_start < 0)
368 return -EINVAL;
368 369
369 /* POSIX-1996 leaves the case l->l_len < 0 undefined; 370 /* POSIX-1996 leaves the case l->l_len < 0 undefined;
370 POSIX-2001 defines it. */ 371 POSIX-2001 defines it. */
371 start += l->l_start;
372 if (start < 0)
373 return -EINVAL;
374 fl->fl_end = OFFSET_MAX;
375 if (l->l_len > 0) { 372 if (l->l_len > 0) {
376 end = start + l->l_len - 1; 373 if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
377 fl->fl_end = end; 374 return -EOVERFLOW;
375 fl->fl_end = fl->fl_start + l->l_len - 1;
376
378 } else if (l->l_len < 0) { 377 } else if (l->l_len < 0) {
379 end = start - 1; 378 if (fl->fl_start + l->l_len < 0)
380 fl->fl_end = end;
381 start += l->l_len;
382 if (start < 0)
383 return -EINVAL; 379 return -EINVAL;
384 } 380 fl->fl_end = fl->fl_start - 1;
385 fl->fl_start = start; /* we record the absolute position */ 381 fl->fl_start += l->l_len;
386 if (fl->fl_end < fl->fl_start) 382 } else
387 return -EOVERFLOW; 383 fl->fl_end = OFFSET_MAX;
388 384
389 fl->fl_owner = current->files; 385 fl->fl_owner = current->files;
390 fl->fl_pid = current->tgid; 386 fl->fl_pid = current->tgid;
391 fl->fl_file = filp; 387 fl->fl_file = filp;
@@ -393,55 +389,36 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
393 fl->fl_ops = NULL; 389 fl->fl_ops = NULL;
394 fl->fl_lmops = NULL; 390 fl->fl_lmops = NULL;
395 391
396 return assign_type(fl, l->l_type); 392 /* Ensure that fl->fl_file has compatible f_mode */
397} 393 switch (l->l_type) {
398 394 case F_RDLCK:
399#if BITS_PER_LONG == 32 395 if (!(filp->f_mode & FMODE_READ))
400static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, 396 return -EBADF;
401 struct flock64 *l)
402{
403 loff_t start;
404
405 switch (l->l_whence) {
406 case SEEK_SET:
407 start = 0;
408 break;
409 case SEEK_CUR:
410 start = filp->f_pos;
411 break; 397 break;
412 case SEEK_END: 398 case F_WRLCK:
413 start = i_size_read(file_inode(filp)); 399 if (!(filp->f_mode & FMODE_WRITE))
400 return -EBADF;
414 break; 401 break;
415 default:
416 return -EINVAL;
417 } 402 }
418 403
419 start += l->l_start;
420 if (start < 0)
421 return -EINVAL;
422 fl->fl_end = OFFSET_MAX;
423 if (l->l_len > 0) {
424 fl->fl_end = start + l->l_len - 1;
425 } else if (l->l_len < 0) {
426 fl->fl_end = start - 1;
427 start += l->l_len;
428 if (start < 0)
429 return -EINVAL;
430 }
431 fl->fl_start = start; /* we record the absolute position */
432 if (fl->fl_end < fl->fl_start)
433 return -EOVERFLOW;
434
435 fl->fl_owner = current->files;
436 fl->fl_pid = current->tgid;
437 fl->fl_file = filp;
438 fl->fl_flags = FL_POSIX;
439 fl->fl_ops = NULL;
440 fl->fl_lmops = NULL;
441
442 return assign_type(fl, l->l_type); 404 return assign_type(fl, l->l_type);
443} 405}
444#endif 406
407/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
408 * style lock.
409 */
410static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
411 struct flock *l)
412{
413 struct flock64 ll = {
414 .l_type = l->l_type,
415 .l_whence = l->l_whence,
416 .l_start = l->l_start,
417 .l_len = l->l_len,
418 };
419
420 return flock64_to_posix_lock(filp, fl, &ll);
421}
445 422
446/* default lease lock manager operations */ 423/* default lease lock manager operations */
447static void lease_break_callback(struct file_lock *fl) 424static void lease_break_callback(struct file_lock *fl)
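
For reference, a userspace illustration of the l_len semantics the conversion above implements: a negative l_len locks the bytes preceding the computed start (a sketch, assuming an already-open fd):

	struct flock fl = {
		.l_type   = F_RDLCK,
		.l_whence = SEEK_END,
		.l_start  = 0,
		.l_len    = -128,	/* the last 128 bytes of the file */
	};

	/* fcntl(fd, F_SETLK, &fl) then covers [size - 128, size - 1] */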
@@ -511,8 +488,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
511} 488}
512 489
513/* Must be called with the i_lock held! */ 490/* Must be called with the i_lock held! */
514static inline void 491static void locks_insert_global_locks(struct file_lock *fl)
515locks_insert_global_locks(struct file_lock *fl)
516{ 492{
517 lg_local_lock(&file_lock_lglock); 493 lg_local_lock(&file_lock_lglock);
518 fl->fl_link_cpu = smp_processor_id(); 494 fl->fl_link_cpu = smp_processor_id();
@@ -521,8 +497,7 @@ locks_insert_global_locks(struct file_lock *fl)
521} 497}
522 498
523/* Must be called with the i_lock held! */ 499/* Must be called with the i_lock held! */
524static inline void 500static void locks_delete_global_locks(struct file_lock *fl)
525locks_delete_global_locks(struct file_lock *fl)
526{ 501{
527 /* 502 /*
528 * Avoid taking lock if already unhashed. This is safe since this check 503 * Avoid taking lock if already unhashed. This is safe since this check
@@ -544,14 +519,12 @@ posix_owner_key(struct file_lock *fl)
544 return (unsigned long)fl->fl_owner; 519 return (unsigned long)fl->fl_owner;
545} 520}
546 521
547static inline void 522static void locks_insert_global_blocked(struct file_lock *waiter)
548locks_insert_global_blocked(struct file_lock *waiter)
549{ 523{
550 hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); 524 hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
551} 525}
552 526
553static inline void 527static void locks_delete_global_blocked(struct file_lock *waiter)
554locks_delete_global_blocked(struct file_lock *waiter)
555{ 528{
556 hash_del(&waiter->fl_link); 529 hash_del(&waiter->fl_link);
557} 530}
@@ -581,7 +554,7 @@ static void locks_delete_block(struct file_lock *waiter)
581 * it seems like the reasonable thing to do. 554 * it seems like the reasonable thing to do.
582 * 555 *
583 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block 556 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
584 * list itself is protected by the file_lock_list, but by ensuring that the 557 * list itself is protected by the blocked_lock_lock, but by ensuring that the
585 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock 558 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
586 * in some cases when we see that the fl_block list is empty. 559 * in some cases when we see that the fl_block list is empty.
587 */ 560 */
@@ -591,7 +564,7 @@ static void __locks_insert_block(struct file_lock *blocker,
591 BUG_ON(!list_empty(&waiter->fl_block)); 564 BUG_ON(!list_empty(&waiter->fl_block));
592 waiter->fl_next = blocker; 565 waiter->fl_next = blocker;
593 list_add_tail(&waiter->fl_block, &blocker->fl_block); 566 list_add_tail(&waiter->fl_block, &blocker->fl_block);
594 if (IS_POSIX(blocker)) 567 if (IS_POSIX(blocker) && !IS_FILE_PVT(blocker))
595 locks_insert_global_blocked(waiter); 568 locks_insert_global_blocked(waiter);
596} 569}
597 570
@@ -652,15 +625,18 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
652 locks_insert_global_locks(fl); 625 locks_insert_global_locks(fl);
653} 626}
654 627
655/* 628/**
656 * Delete a lock and then free it. 629 * locks_delete_lock - Delete a lock and then free it.
657 * Wake up processes that are blocked waiting for this lock, 630 * @thisfl_p: pointer that points to the fl_next field of the previous
658 * notify the FS that the lock has been cleared and 631 * inode->i_flock list entry
659 * finally free the lock. 632 *
633 * Unlink a lock from all lists and free the namespace reference, but don't
634 * free it yet. Wake up processes that are blocked waiting for this lock and
635 * notify the FS that the lock has been cleared.
660 * 636 *
661 * Must be called with the i_lock held! 637 * Must be called with the i_lock held!
662 */ 638 */
663static void locks_delete_lock(struct file_lock **thisfl_p) 639static void locks_unlink_lock(struct file_lock **thisfl_p)
664{ 640{
665 struct file_lock *fl = *thisfl_p; 641 struct file_lock *fl = *thisfl_p;
666 642
@@ -675,6 +651,18 @@ static void locks_delete_lock(struct file_lock **thisfl_p)
675 } 651 }
676 652
677 locks_wake_up_blocks(fl); 653 locks_wake_up_blocks(fl);
654}
655
656/*
657 * Unlink a lock from all lists and free it.
658 *
659 * Must be called with i_lock held!
660 */
661static void locks_delete_lock(struct file_lock **thisfl_p)
662{
663 struct file_lock *fl = *thisfl_p;
664
665 locks_unlink_lock(thisfl_p);
678 locks_free_lock(fl); 666 locks_free_lock(fl);
679} 667}
680 668
@@ -769,8 +757,16 @@ EXPORT_SYMBOL(posix_test_lock);
769 * Note: the above assumption may not be true when handling lock 757 * Note: the above assumption may not be true when handling lock
770 * requests from a broken NFS client. It may also fail in the presence 758 * requests from a broken NFS client. It may also fail in the presence
771 * of tasks (such as posix threads) sharing the same open file table. 759 * of tasks (such as posix threads) sharing the same open file table.
772 *
773 * To handle those cases, we just bail out after a few iterations. 760 * To handle those cases, we just bail out after a few iterations.
761 *
762 * For FL_FILE_PVT locks, the owner is the filp, not the files_struct.
763 * Because the owner is not even nominally tied to a thread of
764 * execution, the deadlock detection below can't reasonably work well. Just
765 * skip it for those.
766 *
767 * In principle, we could do a more limited deadlock detection on FL_FILE_PVT
768 * locks that just checks for the case where two tasks are attempting to
769 * upgrade from read to write locks on the same inode.
774 */ 770 */
775 771
776#define MAX_DEADLK_ITERATIONS 10 772#define MAX_DEADLK_ITERATIONS 10
@@ -793,6 +789,13 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
793{ 789{
794 int i = 0; 790 int i = 0;
795 791
792 /*
793 * This deadlock detector can't reasonably detect deadlocks with
794 * FL_FILE_PVT locks, since they aren't owned by a process, per se.
795 */
796 if (IS_FILE_PVT(caller_fl))
797 return 0;
798
796 while ((block_fl = what_owner_is_waiting_for(block_fl))) { 799 while ((block_fl = what_owner_is_waiting_for(block_fl))) {
797 if (i++ > MAX_DEADLK_ITERATIONS) 800 if (i++ > MAX_DEADLK_ITERATIONS)
798 return 0; 801 return 0;
@@ -1152,13 +1155,14 @@ EXPORT_SYMBOL(posix_lock_file_wait);
1152 1155
1153/** 1156/**
1154 * locks_mandatory_locked - Check for an active lock 1157 * locks_mandatory_locked - Check for an active lock
1155 * @inode: the file to check 1158 * @file: the file to check
1156 * 1159 *
1157 * Searches the inode's list of locks to find any POSIX locks which conflict. 1160 * Searches the inode's list of locks to find any POSIX locks which conflict.
1158 * This function is called from locks_verify_locked() only. 1161 * This function is called from locks_verify_locked() only.
1159 */ 1162 */
1160int locks_mandatory_locked(struct inode *inode) 1163int locks_mandatory_locked(struct file *file)
1161{ 1164{
1165 struct inode *inode = file_inode(file);
1162 fl_owner_t owner = current->files; 1166 fl_owner_t owner = current->files;
1163 struct file_lock *fl; 1167 struct file_lock *fl;
1164 1168
@@ -1169,7 +1173,7 @@ int locks_mandatory_locked(struct inode *inode)
1169 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1173 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1170 if (!IS_POSIX(fl)) 1174 if (!IS_POSIX(fl))
1171 continue; 1175 continue;
1172 if (fl->fl_owner != owner) 1176 if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file)
1173 break; 1177 break;
1174 } 1178 }
1175 spin_unlock(&inode->i_lock); 1179 spin_unlock(&inode->i_lock);
@@ -1195,19 +1199,30 @@ int locks_mandatory_area(int read_write, struct inode *inode,
1195{ 1199{
1196 struct file_lock fl; 1200 struct file_lock fl;
1197 int error; 1201 int error;
1202 bool sleep = false;
1198 1203
1199 locks_init_lock(&fl); 1204 locks_init_lock(&fl);
1200 fl.fl_owner = current->files;
1201 fl.fl_pid = current->tgid; 1205 fl.fl_pid = current->tgid;
1202 fl.fl_file = filp; 1206 fl.fl_file = filp;
1203 fl.fl_flags = FL_POSIX | FL_ACCESS; 1207 fl.fl_flags = FL_POSIX | FL_ACCESS;
1204 if (filp && !(filp->f_flags & O_NONBLOCK)) 1208 if (filp && !(filp->f_flags & O_NONBLOCK))
1205 fl.fl_flags |= FL_SLEEP; 1209 sleep = true;
1206 fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; 1210 fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
1207 fl.fl_start = offset; 1211 fl.fl_start = offset;
1208 fl.fl_end = offset + count - 1; 1212 fl.fl_end = offset + count - 1;
1209 1213
1210 for (;;) { 1214 for (;;) {
1215 if (filp) {
1216 fl.fl_owner = (fl_owner_t)filp;
1217 fl.fl_flags &= ~FL_SLEEP;
1218 error = __posix_lock_file(inode, &fl, NULL);
1219 if (!error)
1220 break;
1221 }
1222
1223 if (sleep)
1224 fl.fl_flags |= FL_SLEEP;
1225 fl.fl_owner = current->files;
1211 error = __posix_lock_file(inode, &fl, NULL); 1226 error = __posix_lock_file(inode, &fl, NULL);
1212 if (error != FILE_LOCK_DEFERRED) 1227 if (error != FILE_LOCK_DEFERRED)
1213 break; 1228 break;
@@ -1472,6 +1487,32 @@ int fcntl_getlease(struct file *filp)
1472 return type; 1487 return type;
1473} 1488}
1474 1489
1490/**
1491 * check_conflicting_open - see if the given dentry points to a file that has
1492 * an existing open that would conflict with the
1493 * desired lease.
1494 * @dentry: dentry to check
1495 * @arg: type of lease that we're trying to acquire
1496 *
1497 * Check to see if there's an existing open fd on this file that would
1498 * conflict with the lease we're trying to set.
1499 */
1500static int
1501check_conflicting_open(const struct dentry *dentry, const long arg)
1502{
1503 int ret = 0;
1504 struct inode *inode = dentry->d_inode;
1505
1506 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1507 return -EAGAIN;
1508
1509 if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
1510 (atomic_read(&inode->i_count) > 1)))
1511 ret = -EAGAIN;
1512
1513 return ret;
1514}
1515
1475static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) 1516static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
1476{ 1517{
1477 struct file_lock *fl, **before, **my_before = NULL, *lease; 1518 struct file_lock *fl, **before, **my_before = NULL, *lease;
@@ -1499,12 +1540,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
1499 return -EINVAL; 1540 return -EINVAL;
1500 } 1541 }
1501 1542
1502 error = -EAGAIN; 1543 error = check_conflicting_open(dentry, arg);
1503 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1544 if (error)
1504 goto out;
1505 if ((arg == F_WRLCK)
1506 && ((d_count(dentry) > 1)
1507 || (atomic_read(&inode->i_count) > 1)))
1508 goto out; 1545 goto out;
1509 1546
1510 /* 1547 /*
@@ -1549,7 +1586,19 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
1549 goto out; 1586 goto out;
1550 1587
1551 locks_insert_lock(before, lease); 1588 locks_insert_lock(before, lease);
1552 error = 0; 1589 /*
1590 * The check in break_lease() is lockless. It's possible for another
1591 * open to race in after we did the earlier check for a conflicting
1592 * open but before the lease was inserted. Check again for a
1593 * conflicting open and cancel the lease if there is one.
1594 *
1595 * We also add a barrier here to ensure that the insertion of the lock
1596 * precedes these checks.
1597 */
1598 smp_mb();
1599 error = check_conflicting_open(dentry, arg);
1600 if (error)
1601 locks_unlink_lock(flp);
1553out: 1602out:
1554 if (is_deleg) 1603 if (is_deleg)
1555 mutex_unlock(&inode->i_mutex); 1604 mutex_unlock(&inode->i_mutex);
@@ -1842,7 +1891,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
1842 1891
1843static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) 1892static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
1844{ 1893{
1845 flock->l_pid = fl->fl_pid; 1894 flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid;
1846#if BITS_PER_LONG == 32 1895#if BITS_PER_LONG == 32
1847 /* 1896 /*
1848 * Make sure we can represent the posix lock via 1897 * Make sure we can represent the posix lock via
@@ -1864,7 +1913,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
1864#if BITS_PER_LONG == 32 1913#if BITS_PER_LONG == 32
1865static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) 1914static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
1866{ 1915{
1867 flock->l_pid = fl->fl_pid; 1916 flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid;
1868 flock->l_start = fl->fl_start; 1917 flock->l_start = fl->fl_start;
1869 flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : 1918 flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
1870 fl->fl_end - fl->fl_start + 1; 1919 fl->fl_end - fl->fl_start + 1;
@@ -1876,7 +1925,7 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
1876/* Report the first existing lock that would conflict with l. 1925/* Report the first existing lock that would conflict with l.
1877 * This implements the F_GETLK command of fcntl(). 1926 * This implements the F_GETLK command of fcntl().
1878 */ 1927 */
1879int fcntl_getlk(struct file *filp, struct flock __user *l) 1928int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
1880{ 1929{
1881 struct file_lock file_lock; 1930 struct file_lock file_lock;
1882 struct flock flock; 1931 struct flock flock;
@@ -1893,6 +1942,16 @@ int fcntl_getlk(struct file *filp, struct flock __user *l)
1893 if (error) 1942 if (error)
1894 goto out; 1943 goto out;
1895 1944
1945 if (cmd == F_GETLKP) {
1946 error = -EINVAL;
1947 if (flock.l_pid != 0)
1948 goto out;
1949
1950 cmd = F_GETLK;
1951 file_lock.fl_flags |= FL_FILE_PVT;
1952 file_lock.fl_owner = (fl_owner_t)filp;
1953 }
1954
1896 error = vfs_test_lock(filp, &file_lock); 1955 error = vfs_test_lock(filp, &file_lock);
1897 if (error) 1956 if (error)
1898 goto out; 1957 goto out;
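
A userspace sketch of querying with the new F_GETLKP command (availability of the constant in <fcntl.h> is assumed); l_pid must be zero on input, and per the conversion earlier in this patch it comes back as -1 when the conflicting lock is itself file-private:

	struct flock fl = {
		.l_type   = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 0,
		.l_pid    = 0,	/* required for F_GETLKP */
	};

	if (fcntl(fd, F_GETLKP, &fl) == 0 && fl.l_type != F_UNLCK) {
		/* conflicting lock found; fl.l_pid == -1 if it is
		 * itself a file-private lock */
	}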
@@ -2012,25 +2071,32 @@ again:
2012 error = flock_to_posix_lock(filp, file_lock, &flock); 2071 error = flock_to_posix_lock(filp, file_lock, &flock);
2013 if (error) 2072 if (error)
2014 goto out; 2073 goto out;
2015 if (cmd == F_SETLKW) { 2074
2016 file_lock->fl_flags |= FL_SLEEP; 2075 /*
2017 } 2076 * If the cmd is requesting file-private locks, then set the
2018 2077 * FL_FILE_PVT flag and override the owner.
2019 error = -EBADF; 2078 */
2020 switch (flock.l_type) { 2079 switch (cmd) {
2021 case F_RDLCK: 2080 case F_SETLKP:
2022 if (!(filp->f_mode & FMODE_READ)) 2081 error = -EINVAL;
2023 goto out; 2082 if (flock.l_pid != 0)
2024 break;
2025 case F_WRLCK:
2026 if (!(filp->f_mode & FMODE_WRITE))
2027 goto out; 2083 goto out;
2084
2085 cmd = F_SETLK;
2086 file_lock->fl_flags |= FL_FILE_PVT;
2087 file_lock->fl_owner = (fl_owner_t)filp;
2028 break; 2088 break;
2029 case F_UNLCK: 2089 case F_SETLKPW:
2030 break;
2031 default:
2032 error = -EINVAL; 2090 error = -EINVAL;
2033 goto out; 2091 if (flock.l_pid != 0)
2092 goto out;
2093
2094 cmd = F_SETLKW;
2095 file_lock->fl_flags |= FL_FILE_PVT;
2096 file_lock->fl_owner = (fl_owner_t)filp;
2097 /* Fallthrough */
2098 case F_SETLKW:
2099 file_lock->fl_flags |= FL_SLEEP;
2034 } 2100 }
2035 2101
2036 error = do_lock_file_wait(filp, cmd, file_lock); 2102 error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2061,7 +2127,7 @@ out:
2061/* Report the first existing lock that would conflict with l. 2127/* Report the first existing lock that would conflict with l.
2062 * This implements the F_GETLK command of fcntl(). 2128 * This implements the F_GETLK command of fcntl().
2063 */ 2129 */
2064int fcntl_getlk64(struct file *filp, struct flock64 __user *l) 2130int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
2065{ 2131{
2066 struct file_lock file_lock; 2132 struct file_lock file_lock;
2067 struct flock64 flock; 2133 struct flock64 flock;
@@ -2078,6 +2144,16 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
2078 if (error) 2144 if (error)
2079 goto out; 2145 goto out;
2080 2146
2147 if (cmd == F_GETLKP) {
2148 error = -EINVAL;
2149 if (flock.l_pid != 0)
2150 goto out;
2151
2152 cmd = F_GETLK64;
2153 file_lock.fl_flags |= FL_FILE_PVT;
2154 file_lock.fl_owner = (fl_owner_t)filp;
2155 }
2156
2081 error = vfs_test_lock(filp, &file_lock); 2157 error = vfs_test_lock(filp, &file_lock);
2082 if (error) 2158 if (error)
2083 goto out; 2159 goto out;
@@ -2130,25 +2206,32 @@ again:
2130 error = flock64_to_posix_lock(filp, file_lock, &flock); 2206 error = flock64_to_posix_lock(filp, file_lock, &flock);
2131 if (error) 2207 if (error)
2132 goto out; 2208 goto out;
2133 if (cmd == F_SETLKW64) { 2209
2134 file_lock->fl_flags |= FL_SLEEP; 2210 /*
2135 } 2211 * If the cmd is requesting file-private locks, then set the
2136 2212 * FL_FILE_PVT flag and override the owner.
2137 error = -EBADF; 2213 */
2138 switch (flock.l_type) { 2214 switch (cmd) {
2139 case F_RDLCK: 2215 case F_SETLKP:
2140 if (!(filp->f_mode & FMODE_READ)) 2216 error = -EINVAL;
2141 goto out; 2217 if (flock.l_pid != 0)
2142 break;
2143 case F_WRLCK:
2144 if (!(filp->f_mode & FMODE_WRITE))
2145 goto out; 2218 goto out;
2219
2220 cmd = F_SETLK64;
2221 file_lock->fl_flags |= FL_FILE_PVT;
2222 file_lock->fl_owner = (fl_owner_t)filp;
2146 break; 2223 break;
2147 case F_UNLCK: 2224 case F_SETLKPW:
2148 break;
2149 default:
2150 error = -EINVAL; 2225 error = -EINVAL;
2151 goto out; 2226 if (flock.l_pid != 0)
2227 goto out;
2228
2229 cmd = F_SETLKW64;
2230 file_lock->fl_flags |= FL_FILE_PVT;
2231 file_lock->fl_owner = (fl_owner_t)filp;
2232 /* Fallthrough */
2233 case F_SETLKW64:
2234 file_lock->fl_flags |= FL_SLEEP;
2152 } 2235 }
2153 2236
2154 error = do_lock_file_wait(filp, cmd, file_lock); 2237 error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2209,7 +2292,7 @@ EXPORT_SYMBOL(locks_remove_posix);
2209/* 2292/*
2210 * This function is called on the last close of an open file. 2293 * This function is called on the last close of an open file.
2211 */ 2294 */
2212void locks_remove_flock(struct file *filp) 2295void locks_remove_file(struct file *filp)
2213{ 2296{
2214 struct inode * inode = file_inode(filp); 2297 struct inode * inode = file_inode(filp);
2215 struct file_lock *fl; 2298 struct file_lock *fl;
@@ -2218,6 +2301,8 @@ void locks_remove_flock(struct file *filp)
2218 if (!inode->i_flock) 2301 if (!inode->i_flock)
2219 return; 2302 return;
2220 2303
2304 locks_remove_posix(filp, (fl_owner_t)filp);
2305
2221 if (filp->f_op->flock) { 2306 if (filp->f_op->flock) {
2222 struct file_lock fl = { 2307 struct file_lock fl = {
2223 .fl_pid = current->tgid, 2308 .fl_pid = current->tgid,
@@ -2236,16 +2321,28 @@ void locks_remove_flock(struct file *filp)
2236 2321
2237 while ((fl = *before) != NULL) { 2322 while ((fl = *before) != NULL) {
2238 if (fl->fl_file == filp) { 2323 if (fl->fl_file == filp) {
2239 if (IS_FLOCK(fl)) {
2240 locks_delete_lock(before);
2241 continue;
2242 }
2243 if (IS_LEASE(fl)) { 2324 if (IS_LEASE(fl)) {
2244 lease_modify(before, F_UNLCK); 2325 lease_modify(before, F_UNLCK);
2245 continue; 2326 continue;
2246 } 2327 }
2247 /* What? */ 2328
2248 BUG(); 2329 /*
2330 * There's a leftover lock on the list of a type that
2331 * we didn't expect to see. Most likely a classic
2332 * POSIX lock that ended up not getting released
2333 * properly, or that raced onto the list somehow. Log
2334 * some info about it and then just remove it from
2335 * the list.
2336 */
2337 WARN(!IS_FLOCK(fl),
2338 "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
2339 MAJOR(inode->i_sb->s_dev),
2340 MINOR(inode->i_sb->s_dev), inode->i_ino,
2341 fl->fl_type, fl->fl_flags,
2342 fl->fl_start, fl->fl_end);
2343
2344 locks_delete_lock(before);
2345 continue;
2249 } 2346 }
2250 before = &fl->fl_next; 2347 before = &fl->fl_next;
2251 } 2348 }
@@ -2314,8 +2411,14 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2314 2411
2315 seq_printf(f, "%lld:%s ", id, pfx); 2412 seq_printf(f, "%lld:%s ", id, pfx);
2316 if (IS_POSIX(fl)) { 2413 if (IS_POSIX(fl)) {
2317 seq_printf(f, "%6s %s ", 2414 if (fl->fl_flags & FL_ACCESS)
2318 (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", 2415 seq_printf(f, "ACCESS");
2416 else if (IS_FILE_PVT(fl))
2417 seq_printf(f, "FLPVT ");
2418 else
2419 seq_printf(f, "POSIX ");
2420
2421 seq_printf(f, " %s ",
2319 (inode == NULL) ? "*NOINODE*" : 2422 (inode == NULL) ? "*NOINODE*" :
2320 mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); 2423 mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
2321 } else if (IS_FLOCK(fl)) { 2424 } else if (IS_FLOCK(fl)) {
@@ -2385,6 +2488,7 @@ static int locks_show(struct seq_file *f, void *v)
2385} 2488}
2386 2489
2387static void *locks_start(struct seq_file *f, loff_t *pos) 2490static void *locks_start(struct seq_file *f, loff_t *pos)
2491 __acquires(&blocked_lock_lock)
2388{ 2492{
2389 struct locks_iterator *iter = f->private; 2493 struct locks_iterator *iter = f->private;
2390 2494
@@ -2403,6 +2507,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
2403} 2507}
2404 2508
2405static void locks_stop(struct seq_file *f, void *v) 2509static void locks_stop(struct seq_file *f, void *v)
2510 __releases(&blocked_lock_lock)
2406{ 2511{
2407 spin_unlock(&blocked_lock_lock); 2512 spin_unlock(&blocked_lock_lock);
2408 lg_global_unlock(&file_lock_lglock); 2513 lg_global_unlock(&file_lock_lglock);
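The fs/locks.c hunks above add file-private POSIX locks: F_GETLKP, F_SETLKP and F_SETLKPW take their ownership from the open file description (fl_owner becomes the struct file) rather than the process, require l_pid == 0 on input, and are reported with l_pid = -1 by the F_GETLK conversion helpers; locks_remove_file() now drops them on the last close. A minimal userspace sketch of the intended usage, assuming the command value from this patch series (later kernels renamed these commands F_OFD_*):

/*
 * Hypothetical userspace sketch of the file-private lock commands
 * added above. F_SETLKP is the name used by this patch series; the
 * numeric value below is an assumption for illustration, not an
 * ABI guarantee.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef F_SETLKP
#define F_SETLKP 37			/* assumed value from this series */
#endif

int main(void)
{
	struct flock fl;
	int fd = open("/tmp/demo.lock", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;

	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;			/* whole file */
	fl.l_pid = 0;			/* the *LKP commands demand 0 */

	/*
	 * Ownership follows the open file description, not the process:
	 * the lock is shared with a child after fork(), conflicts with a
	 * second open() of the same file even within this process, and
	 * is released on the last close of this description.
	 */
	if (fcntl(fd, F_SETLKP, &fl) == -1)
		perror("F_SETLKP");

	close(fd);			/* drops the file-private lock */
	return 0;
}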
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 9a59cbade2fb..48140315f627 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2180,7 +2180,7 @@ void logfs_evict_inode(struct inode *inode)
2180 do_delete_inode(inode); 2180 do_delete_inode(inode);
2181 } 2181 }
2182 } 2182 }
2183 truncate_inode_pages(&inode->i_data, 0); 2183 truncate_inode_pages_final(&inode->i_data);
2184 clear_inode(inode); 2184 clear_inode(inode);
2185 2185
2186 /* Cheaper version of write_inode. All changes are concealed in 2186 /* Cheaper version of write_inode. All changes are concealed in
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e519e45bf673..bf166e388f0d 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -26,6 +26,41 @@
26 * back on the lru list. 26 * back on the lru list.
27 */ 27 */
28 28
29/*
30 * Lock descriptions and usage:
31 *
32 * Each hash chain of both the block and index hash tables now contains
33 * a built-in lock used to serialize accesses to the hash chain.
34 *
35 * Accesses to global data structures mb_cache_list and mb_cache_lru_list
36 * are serialized via the global spinlock mb_cache_spinlock.
37 *
38 * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
39 * accesses to its local data, such as e_used and e_queued.
40 *
41 * Lock ordering:
42 *
43 * Each block hash chain's lock has the highest lock order, followed by an
44 * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
 45 * lock), and mb_cache_spinlock, with the lowest order. While holding
 46 * either a block or index hash chain lock, a thread can acquire an
 47 * mb_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
48 *
49 * Synchronization:
50 *
51 * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
52 * index hash chian, it needs to lock the corresponding hash chain. For each
53 * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
54 * prevent either any simultaneous release or free on the entry and also
55 * to serialize accesses to either the e_used or e_queued member of the entry.
56 *
 57 * To avoid having a dangling reference to an already freed
 58 * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
 59 * block hash chain and is no longer being referenced, i.e. both e_used
 60 * and e_queued are 0. When an mb_cache_entry is explicitly freed, it is
 61 * first removed from its block hash chain.
62 */
63
29#include <linux/kernel.h> 64#include <linux/kernel.h>
30#include <linux/module.h> 65#include <linux/module.h>
31 66
@@ -34,9 +69,10 @@
34#include <linux/mm.h> 69#include <linux/mm.h>
35#include <linux/slab.h> 70#include <linux/slab.h>
36#include <linux/sched.h> 71#include <linux/sched.h>
37#include <linux/init.h> 72#include <linux/list_bl.h>
38#include <linux/mbcache.h> 73#include <linux/mbcache.h>
39 74#include <linux/init.h>
75#include <linux/blockgroup_lock.h>
40 76
41#ifdef MB_CACHE_DEBUG 77#ifdef MB_CACHE_DEBUG
42# define mb_debug(f...) do { \ 78# define mb_debug(f...) do { \
@@ -57,8 +93,14 @@
57 93
58#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) 94#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
59 95
96#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS)
97#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
98 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
99
60static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); 100static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
61 101static struct blockgroup_lock *mb_cache_bg_lock;
102static struct kmem_cache *mb_cache_kmem_cache;
103
62MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); 104MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
63MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); 105MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
64MODULE_LICENSE("GPL"); 106MODULE_LICENSE("GPL");
@@ -86,58 +128,110 @@ static LIST_HEAD(mb_cache_list);
86static LIST_HEAD(mb_cache_lru_list); 128static LIST_HEAD(mb_cache_lru_list);
87static DEFINE_SPINLOCK(mb_cache_spinlock); 129static DEFINE_SPINLOCK(mb_cache_spinlock);
88 130
131static inline void
132__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
133{
134 spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
135 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
136}
137
138static inline void
139__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
140{
141 spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
142 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
143}
144
89static inline int 145static inline int
90__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) 146__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
91{ 147{
92 return !list_empty(&ce->e_block_list); 148 return !hlist_bl_unhashed(&ce->e_block_list);
93} 149}
94 150
95 151
96static void 152static inline void
97__mb_cache_entry_unhash(struct mb_cache_entry *ce) 153__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
98{ 154{
99 if (__mb_cache_entry_is_hashed(ce)) { 155 if (__mb_cache_entry_is_block_hashed(ce))
100 list_del_init(&ce->e_block_list); 156 hlist_bl_del_init(&ce->e_block_list);
101 list_del(&ce->e_index.o_list);
102 }
103} 157}
104 158
159static inline int
160__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
161{
162 return !hlist_bl_unhashed(&ce->e_index.o_list);
163}
164
165static inline void
166__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
167{
168 if (__mb_cache_entry_is_index_hashed(ce))
169 hlist_bl_del_init(&ce->e_index.o_list);
170}
171
172/*
173 * __mb_cache_entry_unhash_unlock()
174 *
 175 * This function is called to unhash the entry from both the block
 176 * and index hash chains.
 177 * It assumes both hash chains are locked upon entry.
 178 * It also unlocks both hash chains upon exit.
179 */
180static inline void
181__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
182{
183 __mb_cache_entry_unhash_index(ce);
184 hlist_bl_unlock(ce->e_index_hash_p);
185 __mb_cache_entry_unhash_block(ce);
186 hlist_bl_unlock(ce->e_block_hash_p);
187}
105 188
106static void 189static void
107__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) 190__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
108{ 191{
109 struct mb_cache *cache = ce->e_cache; 192 struct mb_cache *cache = ce->e_cache;
110 193
111 mb_assert(!(ce->e_used || ce->e_queued)); 194 mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
112 kmem_cache_free(cache->c_entry_cache, ce); 195 kmem_cache_free(cache->c_entry_cache, ce);
113 atomic_dec(&cache->c_entry_count); 196 atomic_dec(&cache->c_entry_count);
114} 197}
115 198
116
117static void 199static void
118__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) 200__mb_cache_entry_release(struct mb_cache_entry *ce)
119 __releases(mb_cache_spinlock)
120{ 201{
202 /* First lock the entry to serialize access to its local data. */
203 __spin_lock_mb_cache_entry(ce);
121 /* Wake up all processes queuing for this cache entry. */ 204 /* Wake up all processes queuing for this cache entry. */
122 if (ce->e_queued) 205 if (ce->e_queued)
123 wake_up_all(&mb_cache_queue); 206 wake_up_all(&mb_cache_queue);
124 if (ce->e_used >= MB_CACHE_WRITER) 207 if (ce->e_used >= MB_CACHE_WRITER)
125 ce->e_used -= MB_CACHE_WRITER; 208 ce->e_used -= MB_CACHE_WRITER;
209 /*
 210 * Make sure that all cache entries on the lru list have
 211 * both e_used and e_queued equal to 0.
212 */
126 ce->e_used--; 213 ce->e_used--;
127 if (!(ce->e_used || ce->e_queued)) { 214 if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
128 if (!__mb_cache_entry_is_hashed(ce)) 215 if (!__mb_cache_entry_is_block_hashed(ce)) {
216 __spin_unlock_mb_cache_entry(ce);
129 goto forget; 217 goto forget;
130 mb_assert(list_empty(&ce->e_lru_list)); 218 }
131 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); 219 /*
220 * Need access to lru list, first drop entry lock,
221 * then reacquire the lock in the proper order.
222 */
223 spin_lock(&mb_cache_spinlock);
224 if (list_empty(&ce->e_lru_list))
225 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
226 spin_unlock(&mb_cache_spinlock);
132 } 227 }
133 spin_unlock(&mb_cache_spinlock); 228 __spin_unlock_mb_cache_entry(ce);
134 return; 229 return;
135forget: 230forget:
136 spin_unlock(&mb_cache_spinlock); 231 mb_assert(list_empty(&ce->e_lru_list));
137 __mb_cache_entry_forget(ce, GFP_KERNEL); 232 __mb_cache_entry_forget(ce, GFP_KERNEL);
138} 233}
139 234
140
141/* 235/*
142 * mb_cache_shrink_scan() memory pressure callback 236 * mb_cache_shrink_scan() memory pressure callback
143 * 237 *
@@ -160,17 +254,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
160 254
161 mb_debug("trying to free %d entries", nr_to_scan); 255 mb_debug("trying to free %d entries", nr_to_scan);
162 spin_lock(&mb_cache_spinlock); 256 spin_lock(&mb_cache_spinlock);
163 while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { 257 while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
164 struct mb_cache_entry *ce = 258 struct mb_cache_entry *ce =
165 list_entry(mb_cache_lru_list.next, 259 list_entry(mb_cache_lru_list.next,
166 struct mb_cache_entry, e_lru_list); 260 struct mb_cache_entry, e_lru_list);
167 list_move_tail(&ce->e_lru_list, &free_list); 261 list_del_init(&ce->e_lru_list);
168 __mb_cache_entry_unhash(ce); 262 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
169 freed++; 263 continue;
264 spin_unlock(&mb_cache_spinlock);
265 /* Prevent any find or get operation on the entry */
266 hlist_bl_lock(ce->e_block_hash_p);
267 hlist_bl_lock(ce->e_index_hash_p);
268 /* Ignore if it is touched by a find/get */
269 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
270 !list_empty(&ce->e_lru_list)) {
271 hlist_bl_unlock(ce->e_index_hash_p);
272 hlist_bl_unlock(ce->e_block_hash_p);
273 spin_lock(&mb_cache_spinlock);
274 continue;
275 }
276 __mb_cache_entry_unhash_unlock(ce);
277 list_add_tail(&ce->e_lru_list, &free_list);
278 spin_lock(&mb_cache_spinlock);
170 } 279 }
171 spin_unlock(&mb_cache_spinlock); 280 spin_unlock(&mb_cache_spinlock);
281
172 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { 282 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
173 __mb_cache_entry_forget(entry, gfp_mask); 283 __mb_cache_entry_forget(entry, gfp_mask);
284 freed++;
174 } 285 }
175 return freed; 286 return freed;
176} 287}
@@ -215,29 +326,40 @@ mb_cache_create(const char *name, int bucket_bits)
215 int n, bucket_count = 1 << bucket_bits; 326 int n, bucket_count = 1 << bucket_bits;
216 struct mb_cache *cache = NULL; 327 struct mb_cache *cache = NULL;
217 328
329 if (!mb_cache_bg_lock) {
330 mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
331 GFP_KERNEL);
332 if (!mb_cache_bg_lock)
333 return NULL;
334 bgl_lock_init(mb_cache_bg_lock);
335 }
336
218 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); 337 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
219 if (!cache) 338 if (!cache)
220 return NULL; 339 return NULL;
221 cache->c_name = name; 340 cache->c_name = name;
222 atomic_set(&cache->c_entry_count, 0); 341 atomic_set(&cache->c_entry_count, 0);
223 cache->c_bucket_bits = bucket_bits; 342 cache->c_bucket_bits = bucket_bits;
224 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), 343 cache->c_block_hash = kmalloc(bucket_count *
225 GFP_KERNEL); 344 sizeof(struct hlist_bl_head), GFP_KERNEL);
226 if (!cache->c_block_hash) 345 if (!cache->c_block_hash)
227 goto fail; 346 goto fail;
228 for (n=0; n<bucket_count; n++) 347 for (n=0; n<bucket_count; n++)
229 INIT_LIST_HEAD(&cache->c_block_hash[n]); 348 INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
230 cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), 349 cache->c_index_hash = kmalloc(bucket_count *
231 GFP_KERNEL); 350 sizeof(struct hlist_bl_head), GFP_KERNEL);
232 if (!cache->c_index_hash) 351 if (!cache->c_index_hash)
233 goto fail; 352 goto fail;
234 for (n=0; n<bucket_count; n++) 353 for (n=0; n<bucket_count; n++)
235 INIT_LIST_HEAD(&cache->c_index_hash[n]); 354 INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
236 cache->c_entry_cache = kmem_cache_create(name, 355 if (!mb_cache_kmem_cache) {
237 sizeof(struct mb_cache_entry), 0, 356 mb_cache_kmem_cache = kmem_cache_create(name,
238 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 357 sizeof(struct mb_cache_entry), 0,
239 if (!cache->c_entry_cache) 358 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
240 goto fail2; 359 if (!mb_cache_kmem_cache)
360 goto fail2;
361 }
362 cache->c_entry_cache = mb_cache_kmem_cache;
241 363
242 /* 364 /*
243 * Set an upper limit on the number of cache entries so that the hash 365 * Set an upper limit on the number of cache entries so that the hash
@@ -273,21 +395,47 @@ void
273mb_cache_shrink(struct block_device *bdev) 395mb_cache_shrink(struct block_device *bdev)
274{ 396{
275 LIST_HEAD(free_list); 397 LIST_HEAD(free_list);
276 struct list_head *l, *ltmp; 398 struct list_head *l;
399 struct mb_cache_entry *ce, *tmp;
277 400
401 l = &mb_cache_lru_list;
278 spin_lock(&mb_cache_spinlock); 402 spin_lock(&mb_cache_spinlock);
279 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 403 while (!list_is_last(l, &mb_cache_lru_list)) {
280 struct mb_cache_entry *ce = 404 l = l->next;
281 list_entry(l, struct mb_cache_entry, e_lru_list); 405 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
282 if (ce->e_bdev == bdev) { 406 if (ce->e_bdev == bdev) {
283 list_move_tail(&ce->e_lru_list, &free_list); 407 list_del_init(&ce->e_lru_list);
284 __mb_cache_entry_unhash(ce); 408 if (ce->e_used || ce->e_queued ||
409 atomic_read(&ce->e_refcnt))
410 continue;
411 spin_unlock(&mb_cache_spinlock);
412 /*
413 * Prevent any find or get operation on the entry.
414 */
415 hlist_bl_lock(ce->e_block_hash_p);
416 hlist_bl_lock(ce->e_index_hash_p);
417 /* Ignore if it is touched by a find/get */
418 if (ce->e_used || ce->e_queued ||
419 atomic_read(&ce->e_refcnt) ||
420 !list_empty(&ce->e_lru_list)) {
421 hlist_bl_unlock(ce->e_index_hash_p);
422 hlist_bl_unlock(ce->e_block_hash_p);
423 l = &mb_cache_lru_list;
424 spin_lock(&mb_cache_spinlock);
425 continue;
426 }
427 __mb_cache_entry_unhash_unlock(ce);
428 mb_assert(!(ce->e_used || ce->e_queued ||
429 atomic_read(&ce->e_refcnt)));
430 list_add_tail(&ce->e_lru_list, &free_list);
431 l = &mb_cache_lru_list;
432 spin_lock(&mb_cache_spinlock);
285 } 433 }
286 } 434 }
287 spin_unlock(&mb_cache_spinlock); 435 spin_unlock(&mb_cache_spinlock);
288 list_for_each_safe(l, ltmp, &free_list) { 436
289 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 437 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
290 e_lru_list), GFP_KERNEL); 438 __mb_cache_entry_forget(ce, GFP_KERNEL);
291 } 439 }
292} 440}
293 441
@@ -303,23 +451,27 @@ void
303mb_cache_destroy(struct mb_cache *cache) 451mb_cache_destroy(struct mb_cache *cache)
304{ 452{
305 LIST_HEAD(free_list); 453 LIST_HEAD(free_list);
306 struct list_head *l, *ltmp; 454 struct mb_cache_entry *ce, *tmp;
307 455
308 spin_lock(&mb_cache_spinlock); 456 spin_lock(&mb_cache_spinlock);
309 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 457 list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
310 struct mb_cache_entry *ce = 458 if (ce->e_cache == cache)
311 list_entry(l, struct mb_cache_entry, e_lru_list);
312 if (ce->e_cache == cache) {
313 list_move_tail(&ce->e_lru_list, &free_list); 459 list_move_tail(&ce->e_lru_list, &free_list);
314 __mb_cache_entry_unhash(ce);
315 }
316 } 460 }
317 list_del(&cache->c_cache_list); 461 list_del(&cache->c_cache_list);
318 spin_unlock(&mb_cache_spinlock); 462 spin_unlock(&mb_cache_spinlock);
319 463
320 list_for_each_safe(l, ltmp, &free_list) { 464 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
321 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 465 list_del_init(&ce->e_lru_list);
322 e_lru_list), GFP_KERNEL); 466 /*
467 * Prevent any find or get operation on the entry.
468 */
469 hlist_bl_lock(ce->e_block_hash_p);
470 hlist_bl_lock(ce->e_index_hash_p);
471 mb_assert(!(ce->e_used || ce->e_queued ||
472 atomic_read(&ce->e_refcnt)));
473 __mb_cache_entry_unhash_unlock(ce);
474 __mb_cache_entry_forget(ce, GFP_KERNEL);
323 } 475 }
324 476
325 if (atomic_read(&cache->c_entry_count) > 0) { 477 if (atomic_read(&cache->c_entry_count) > 0) {
@@ -328,8 +480,10 @@ mb_cache_destroy(struct mb_cache *cache)
328 atomic_read(&cache->c_entry_count)); 480 atomic_read(&cache->c_entry_count));
329 } 481 }
330 482
331 kmem_cache_destroy(cache->c_entry_cache); 483 if (list_empty(&mb_cache_list)) {
332 484 kmem_cache_destroy(mb_cache_kmem_cache);
485 mb_cache_kmem_cache = NULL;
486 }
333 kfree(cache->c_index_hash); 487 kfree(cache->c_index_hash);
334 kfree(cache->c_block_hash); 488 kfree(cache->c_block_hash);
335 kfree(cache); 489 kfree(cache);
@@ -346,28 +500,61 @@ mb_cache_destroy(struct mb_cache *cache)
346struct mb_cache_entry * 500struct mb_cache_entry *
347mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) 501mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
348{ 502{
349 struct mb_cache_entry *ce = NULL; 503 struct mb_cache_entry *ce;
350 504
351 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { 505 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
506 struct list_head *l;
507
508 l = &mb_cache_lru_list;
352 spin_lock(&mb_cache_spinlock); 509 spin_lock(&mb_cache_spinlock);
353 if (!list_empty(&mb_cache_lru_list)) { 510 while (!list_is_last(l, &mb_cache_lru_list)) {
354 ce = list_entry(mb_cache_lru_list.next, 511 l = l->next;
355 struct mb_cache_entry, e_lru_list); 512 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
356 list_del_init(&ce->e_lru_list); 513 if (ce->e_cache == cache) {
357 __mb_cache_entry_unhash(ce); 514 list_del_init(&ce->e_lru_list);
515 if (ce->e_used || ce->e_queued ||
516 atomic_read(&ce->e_refcnt))
517 continue;
518 spin_unlock(&mb_cache_spinlock);
519 /*
520 * Prevent any find or get operation on the
521 * entry.
522 */
523 hlist_bl_lock(ce->e_block_hash_p);
524 hlist_bl_lock(ce->e_index_hash_p);
525 /* Ignore if it is touched by a find/get */
526 if (ce->e_used || ce->e_queued ||
527 atomic_read(&ce->e_refcnt) ||
528 !list_empty(&ce->e_lru_list)) {
529 hlist_bl_unlock(ce->e_index_hash_p);
530 hlist_bl_unlock(ce->e_block_hash_p);
531 l = &mb_cache_lru_list;
532 spin_lock(&mb_cache_spinlock);
533 continue;
534 }
535 mb_assert(list_empty(&ce->e_lru_list));
536 mb_assert(!(ce->e_used || ce->e_queued ||
537 atomic_read(&ce->e_refcnt)));
538 __mb_cache_entry_unhash_unlock(ce);
539 goto found;
540 }
358 } 541 }
359 spin_unlock(&mb_cache_spinlock); 542 spin_unlock(&mb_cache_spinlock);
360 } 543 }
361 if (!ce) { 544
362 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 545 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
363 if (!ce) 546 if (!ce)
364 return NULL; 547 return NULL;
365 atomic_inc(&cache->c_entry_count); 548 atomic_inc(&cache->c_entry_count);
366 INIT_LIST_HEAD(&ce->e_lru_list); 549 INIT_LIST_HEAD(&ce->e_lru_list);
367 INIT_LIST_HEAD(&ce->e_block_list); 550 INIT_HLIST_BL_NODE(&ce->e_block_list);
368 ce->e_cache = cache; 551 INIT_HLIST_BL_NODE(&ce->e_index.o_list);
369 ce->e_queued = 0; 552 ce->e_cache = cache;
370 } 553 ce->e_queued = 0;
554 atomic_set(&ce->e_refcnt, 0);
555found:
556 ce->e_block_hash_p = &cache->c_block_hash[0];
557 ce->e_index_hash_p = &cache->c_index_hash[0];
371 ce->e_used = 1 + MB_CACHE_WRITER; 558 ce->e_used = 1 + MB_CACHE_WRITER;
372 return ce; 559 return ce;
373} 560}
@@ -393,29 +580,38 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
393{ 580{
394 struct mb_cache *cache = ce->e_cache; 581 struct mb_cache *cache = ce->e_cache;
395 unsigned int bucket; 582 unsigned int bucket;
396 struct list_head *l; 583 struct hlist_bl_node *l;
397 int error = -EBUSY; 584 struct hlist_bl_head *block_hash_p;
585 struct hlist_bl_head *index_hash_p;
586 struct mb_cache_entry *lce;
398 587
588 mb_assert(ce);
399 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 589 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
400 cache->c_bucket_bits); 590 cache->c_bucket_bits);
401 spin_lock(&mb_cache_spinlock); 591 block_hash_p = &cache->c_block_hash[bucket];
402 list_for_each_prev(l, &cache->c_block_hash[bucket]) { 592 hlist_bl_lock(block_hash_p);
403 struct mb_cache_entry *ce = 593 hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
404 list_entry(l, struct mb_cache_entry, e_block_list); 594 if (lce->e_bdev == bdev && lce->e_block == block) {
405 if (ce->e_bdev == bdev && ce->e_block == block) 595 hlist_bl_unlock(block_hash_p);
406 goto out; 596 return -EBUSY;
597 }
407 } 598 }
408 __mb_cache_entry_unhash(ce); 599 mb_assert(!__mb_cache_entry_is_block_hashed(ce));
600 __mb_cache_entry_unhash_block(ce);
601 __mb_cache_entry_unhash_index(ce);
409 ce->e_bdev = bdev; 602 ce->e_bdev = bdev;
410 ce->e_block = block; 603 ce->e_block = block;
411 list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); 604 ce->e_block_hash_p = block_hash_p;
412 ce->e_index.o_key = key; 605 ce->e_index.o_key = key;
606 hlist_bl_add_head(&ce->e_block_list, block_hash_p);
607 hlist_bl_unlock(block_hash_p);
413 bucket = hash_long(key, cache->c_bucket_bits); 608 bucket = hash_long(key, cache->c_bucket_bits);
414 list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); 609 index_hash_p = &cache->c_index_hash[bucket];
415 error = 0; 610 hlist_bl_lock(index_hash_p);
416out: 611 ce->e_index_hash_p = index_hash_p;
417 spin_unlock(&mb_cache_spinlock); 612 hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
418 return error; 613 hlist_bl_unlock(index_hash_p);
614 return 0;
419} 615}
420 616
421 617
@@ -429,24 +625,26 @@ out:
429void 625void
430mb_cache_entry_release(struct mb_cache_entry *ce) 626mb_cache_entry_release(struct mb_cache_entry *ce)
431{ 627{
432 spin_lock(&mb_cache_spinlock); 628 __mb_cache_entry_release(ce);
433 __mb_cache_entry_release_unlock(ce);
434} 629}
435 630
436 631
437/* 632/*
438 * mb_cache_entry_free() 633 * mb_cache_entry_free()
439 * 634 *
440 * This is equivalent to the sequence mb_cache_entry_takeout() --
441 * mb_cache_entry_release().
442 */ 635 */
443void 636void
444mb_cache_entry_free(struct mb_cache_entry *ce) 637mb_cache_entry_free(struct mb_cache_entry *ce)
445{ 638{
446 spin_lock(&mb_cache_spinlock); 639 mb_assert(ce);
447 mb_assert(list_empty(&ce->e_lru_list)); 640 mb_assert(list_empty(&ce->e_lru_list));
448 __mb_cache_entry_unhash(ce); 641 hlist_bl_lock(ce->e_index_hash_p);
449 __mb_cache_entry_release_unlock(ce); 642 __mb_cache_entry_unhash_index(ce);
643 hlist_bl_unlock(ce->e_index_hash_p);
644 hlist_bl_lock(ce->e_block_hash_p);
645 __mb_cache_entry_unhash_block(ce);
646 hlist_bl_unlock(ce->e_block_hash_p);
647 __mb_cache_entry_release(ce);
450} 648}
451 649
452 650
@@ -463,84 +661,110 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
463 sector_t block) 661 sector_t block)
464{ 662{
465 unsigned int bucket; 663 unsigned int bucket;
466 struct list_head *l; 664 struct hlist_bl_node *l;
467 struct mb_cache_entry *ce; 665 struct mb_cache_entry *ce;
666 struct hlist_bl_head *block_hash_p;
468 667
469 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 668 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
470 cache->c_bucket_bits); 669 cache->c_bucket_bits);
471 spin_lock(&mb_cache_spinlock); 670 block_hash_p = &cache->c_block_hash[bucket];
 472 list_for_each(l, &cache->c_block_hash[bucket]) { 671 /* First serialize access to the corresponding block hash chain. */
473 ce = list_entry(l, struct mb_cache_entry, e_block_list); 672 hlist_bl_lock(block_hash_p);
673 hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
674 mb_assert(ce->e_block_hash_p == block_hash_p);
474 if (ce->e_bdev == bdev && ce->e_block == block) { 675 if (ce->e_bdev == bdev && ce->e_block == block) {
475 DEFINE_WAIT(wait); 676 /*
677 * Prevent a free from removing the entry.
678 */
679 atomic_inc(&ce->e_refcnt);
680 hlist_bl_unlock(block_hash_p);
681 __spin_lock_mb_cache_entry(ce);
682 atomic_dec(&ce->e_refcnt);
683 if (ce->e_used > 0) {
684 DEFINE_WAIT(wait);
685 while (ce->e_used > 0) {
686 ce->e_queued++;
687 prepare_to_wait(&mb_cache_queue, &wait,
688 TASK_UNINTERRUPTIBLE);
689 __spin_unlock_mb_cache_entry(ce);
690 schedule();
691 __spin_lock_mb_cache_entry(ce);
692 ce->e_queued--;
693 }
694 finish_wait(&mb_cache_queue, &wait);
695 }
696 ce->e_used += 1 + MB_CACHE_WRITER;
697 __spin_unlock_mb_cache_entry(ce);
476 698
477 if (!list_empty(&ce->e_lru_list)) 699 if (!list_empty(&ce->e_lru_list)) {
700 spin_lock(&mb_cache_spinlock);
478 list_del_init(&ce->e_lru_list); 701 list_del_init(&ce->e_lru_list);
479
480 while (ce->e_used > 0) {
481 ce->e_queued++;
482 prepare_to_wait(&mb_cache_queue, &wait,
483 TASK_UNINTERRUPTIBLE);
484 spin_unlock(&mb_cache_spinlock); 702 spin_unlock(&mb_cache_spinlock);
485 schedule();
486 spin_lock(&mb_cache_spinlock);
487 ce->e_queued--;
488 } 703 }
489 finish_wait(&mb_cache_queue, &wait); 704 if (!__mb_cache_entry_is_block_hashed(ce)) {
490 ce->e_used += 1 + MB_CACHE_WRITER; 705 __mb_cache_entry_release(ce);
491
492 if (!__mb_cache_entry_is_hashed(ce)) {
493 __mb_cache_entry_release_unlock(ce);
494 return NULL; 706 return NULL;
495 } 707 }
496 goto cleanup; 708 return ce;
497 } 709 }
498 } 710 }
499 ce = NULL; 711 hlist_bl_unlock(block_hash_p);
500 712 return NULL;
501cleanup:
502 spin_unlock(&mb_cache_spinlock);
503 return ce;
504} 713}
505 714
506#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) 715#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
507 716
508static struct mb_cache_entry * 717static struct mb_cache_entry *
509__mb_cache_entry_find(struct list_head *l, struct list_head *head, 718__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
510 struct block_device *bdev, unsigned int key) 719 struct block_device *bdev, unsigned int key)
511{ 720{
512 while (l != head) { 721
722 /* The index hash chain is alredy acquire by caller. */
723 while (l != NULL) {
513 struct mb_cache_entry *ce = 724 struct mb_cache_entry *ce =
514 list_entry(l, struct mb_cache_entry, e_index.o_list); 725 hlist_bl_entry(l, struct mb_cache_entry,
726 e_index.o_list);
727 mb_assert(ce->e_index_hash_p == head);
515 if (ce->e_bdev == bdev && ce->e_index.o_key == key) { 728 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
516 DEFINE_WAIT(wait); 729 /*
517 730 * Prevent a free from removing the entry.
518 if (!list_empty(&ce->e_lru_list)) 731 */
519 list_del_init(&ce->e_lru_list); 732 atomic_inc(&ce->e_refcnt);
520 733 hlist_bl_unlock(head);
734 __spin_lock_mb_cache_entry(ce);
735 atomic_dec(&ce->e_refcnt);
736 ce->e_used++;
521 /* Incrementing before holding the lock gives readers 737 /* Incrementing before holding the lock gives readers
522 priority over writers. */ 738 priority over writers. */
523 ce->e_used++; 739 if (ce->e_used >= MB_CACHE_WRITER) {
524 while (ce->e_used >= MB_CACHE_WRITER) { 740 DEFINE_WAIT(wait);
525 ce->e_queued++; 741
526 prepare_to_wait(&mb_cache_queue, &wait, 742 while (ce->e_used >= MB_CACHE_WRITER) {
527 TASK_UNINTERRUPTIBLE); 743 ce->e_queued++;
528 spin_unlock(&mb_cache_spinlock); 744 prepare_to_wait(&mb_cache_queue, &wait,
529 schedule(); 745 TASK_UNINTERRUPTIBLE);
530 spin_lock(&mb_cache_spinlock); 746 __spin_unlock_mb_cache_entry(ce);
531 ce->e_queued--; 747 schedule();
748 __spin_lock_mb_cache_entry(ce);
749 ce->e_queued--;
750 }
751 finish_wait(&mb_cache_queue, &wait);
532 } 752 }
533 finish_wait(&mb_cache_queue, &wait); 753 __spin_unlock_mb_cache_entry(ce);
534 754 if (!list_empty(&ce->e_lru_list)) {
535 if (!__mb_cache_entry_is_hashed(ce)) {
536 __mb_cache_entry_release_unlock(ce);
537 spin_lock(&mb_cache_spinlock); 755 spin_lock(&mb_cache_spinlock);
756 list_del_init(&ce->e_lru_list);
757 spin_unlock(&mb_cache_spinlock);
758 }
759 if (!__mb_cache_entry_is_block_hashed(ce)) {
760 __mb_cache_entry_release(ce);
538 return ERR_PTR(-EAGAIN); 761 return ERR_PTR(-EAGAIN);
539 } 762 }
540 return ce; 763 return ce;
541 } 764 }
542 l = l->next; 765 l = l->next;
543 } 766 }
767 hlist_bl_unlock(head);
544 return NULL; 768 return NULL;
545} 769}
546 770
@@ -562,13 +786,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
562 unsigned int key) 786 unsigned int key)
563{ 787{
564 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 788 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
565 struct list_head *l; 789 struct hlist_bl_node *l;
566 struct mb_cache_entry *ce; 790 struct mb_cache_entry *ce = NULL;
567 791 struct hlist_bl_head *index_hash_p;
568 spin_lock(&mb_cache_spinlock); 792
569 l = cache->c_index_hash[bucket].next; 793 index_hash_p = &cache->c_index_hash[bucket];
570 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 794 hlist_bl_lock(index_hash_p);
571 spin_unlock(&mb_cache_spinlock); 795 if (!hlist_bl_empty(index_hash_p)) {
796 l = hlist_bl_first(index_hash_p);
797 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
798 } else
799 hlist_bl_unlock(index_hash_p);
572 return ce; 800 return ce;
573} 801}
574 802
@@ -597,13 +825,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev,
597{ 825{
598 struct mb_cache *cache = prev->e_cache; 826 struct mb_cache *cache = prev->e_cache;
599 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 827 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
600 struct list_head *l; 828 struct hlist_bl_node *l;
601 struct mb_cache_entry *ce; 829 struct mb_cache_entry *ce;
830 struct hlist_bl_head *index_hash_p;
602 831
603 spin_lock(&mb_cache_spinlock); 832 index_hash_p = &cache->c_index_hash[bucket];
833 mb_assert(prev->e_index_hash_p == index_hash_p);
834 hlist_bl_lock(index_hash_p);
835 mb_assert(!hlist_bl_empty(index_hash_p));
604 l = prev->e_index.o_list.next; 836 l = prev->e_index.o_list.next;
605 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 837 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
606 __mb_cache_entry_release_unlock(prev); 838 __mb_cache_entry_release(prev);
607 return ce; 839 return ce;
608} 840}
609 841
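That completes the mbcache rework: mb_cache_spinlock now guards only the global cache and lru lists, each hash chain is an hlist_bl_head whose low bit doubles as the chain lock, and per-entry state (e_used, e_queued) is protected by a spinlock picked out of a shared blockgroup_lock by hashing the entry pointer. A minimal kernel-style sketch of that layering, using the same stock APIs the patch does; the demo_* names are placeholders:

/*
 * Sketch of the layered locking introduced above. Lock order mirrors
 * the comment block in mbcache.c: hash chain bit-lock first, then the
 * hashed per-entry spinlock.
 */
#include <linux/list_bl.h>
#include <linux/blockgroup_lock.h>
#include <linux/hash.h>
#include <linux/spinlock.h>

static struct blockgroup_lock *demo_bg_lock;	/* bgl_lock_init() at setup */

#define DEMO_ENTRY_LOCK_BITS	__builtin_log2(NR_BG_LOCKS)
#define DEMO_ENTRY_LOCK_INDEX(e) \
	hash_long((unsigned long)(e), DEMO_ENTRY_LOCK_BITS)

struct demo_entry {
	struct hlist_bl_node	node;	/* lives on one hash chain */
	unsigned short		used;	/* e_used-style private state */
};

static void demo_get(struct hlist_bl_head *chain, struct demo_entry *e)
{
	/* The chain head's low bit is the chain lock. */
	hlist_bl_lock(chain);
	/* Per-entry lock, selected by hashing the entry pointer. */
	spin_lock(bgl_lock_ptr(demo_bg_lock, DEMO_ENTRY_LOCK_INDEX(e)));
	e->used++;
	spin_unlock(bgl_lock_ptr(demo_bg_lock, DEMO_ENTRY_LOCK_INDEX(e)));
	hlist_bl_unlock(chain);
}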
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 0332109162a5..f007a3355570 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -26,7 +26,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data);
26 26
27static void minix_evict_inode(struct inode *inode) 27static void minix_evict_inode(struct inode *inode)
28{ 28{
29 truncate_inode_pages(&inode->i_data, 0); 29 truncate_inode_pages_final(&inode->i_data);
30 if (!inode->i_nlink) { 30 if (!inode->i_nlink) {
31 inode->i_size = 0; 31 inode->i_size = 0;
32 minix_truncate(inode); 32 minix_truncate(inode);
@@ -86,7 +86,7 @@ static void init_once(void *foo)
86 inode_init_once(&ei->vfs_inode); 86 inode_init_once(&ei->vfs_inode);
87} 87}
88 88
89static int init_inodecache(void) 89static int __init init_inodecache(void)
90{ 90{
91 minix_inode_cachep = kmem_cache_create("minix_inode_cache", 91 minix_inode_cachep = kmem_cache_create("minix_inode_cache",
92 sizeof(struct minix_inode_info), 92 sizeof(struct minix_inode_info),
@@ -123,6 +123,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
123 struct minix_sb_info * sbi = minix_sb(sb); 123 struct minix_sb_info * sbi = minix_sb(sb);
124 struct minix_super_block * ms; 124 struct minix_super_block * ms;
125 125
126 sync_filesystem(sb);
126 ms = sbi->s_ms; 127 ms = sbi->s_ms;
127 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 128 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
128 return 0; 129 return 0;
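The minix hunks show the two conversions repeated throughout this series (logfs above, ncpfs and nfs below): ->evict_inode() switches to truncate_inode_pages_final(), which also tells the page cache that no new pages can be added to the dying mapping, and ->remount_fs() gains an explicit sync_filesystem() call now that the VFS no longer syncs unconditionally in do_remount_sb(). A sketch of the resulting shape, with demofs as a placeholder name; the helpers are the real VFS interfaces:

#include <linux/fs.h>
#include <linux/mm.h>

static void demofs_evict_inode(struct inode *inode)
{
	/*
	 * The _final variant marks the mapping as exiting before
	 * truncating, since no further pages can show up here.
	 */
	truncate_inode_pages_final(&inode->i_data);
	clear_inode(inode);
}

static int demofs_remount(struct super_block *sb, int *flags, char *data)
{
	/* Writeback is now the filesystem's job, not do_remount_sb()'s. */
	sync_filesystem(sb);
	return 0;
}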
diff --git a/fs/namei.c b/fs/namei.c
index 4b491b431990..88339f59efb5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1796,7 +1796,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1796 if (err) 1796 if (err)
1797 return err; 1797 return err;
1798 } 1798 }
1799 if (!d_is_directory(nd->path.dentry)) { 1799 if (!d_can_lookup(nd->path.dentry)) {
1800 err = -ENOTDIR; 1800 err = -ENOTDIR;
1801 break; 1801 break;
1802 } 1802 }
@@ -1817,7 +1817,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1817 struct dentry *root = nd->root.dentry; 1817 struct dentry *root = nd->root.dentry;
1818 struct inode *inode = root->d_inode; 1818 struct inode *inode = root->d_inode;
1819 if (*name) { 1819 if (*name) {
1820 if (!d_is_directory(root)) 1820 if (!d_can_lookup(root))
1821 return -ENOTDIR; 1821 return -ENOTDIR;
1822 retval = inode_permission(inode, MAY_EXEC); 1822 retval = inode_permission(inode, MAY_EXEC);
1823 if (retval) 1823 if (retval)
@@ -1873,7 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1873 dentry = f.file->f_path.dentry; 1873 dentry = f.file->f_path.dentry;
1874 1874
1875 if (*name) { 1875 if (*name) {
1876 if (!d_is_directory(dentry)) { 1876 if (!d_can_lookup(dentry)) {
1877 fdput(f); 1877 fdput(f);
1878 return -ENOTDIR; 1878 return -ENOTDIR;
1879 } 1879 }
@@ -1955,7 +1955,7 @@ static int path_lookupat(int dfd, const char *name,
1955 err = complete_walk(nd); 1955 err = complete_walk(nd);
1956 1956
1957 if (!err && nd->flags & LOOKUP_DIRECTORY) { 1957 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1958 if (!d_is_directory(nd->path.dentry)) { 1958 if (!d_can_lookup(nd->path.dentry)) {
1959 path_put(&nd->path); 1959 path_put(&nd->path);
1960 err = -ENOTDIR; 1960 err = -ENOTDIR;
1961 } 1961 }
@@ -2414,11 +2414,11 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
2414 IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) 2414 IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
2415 return -EPERM; 2415 return -EPERM;
2416 if (isdir) { 2416 if (isdir) {
2417 if (!d_is_directory(victim) && !d_is_autodir(victim)) 2417 if (!d_is_dir(victim))
2418 return -ENOTDIR; 2418 return -ENOTDIR;
2419 if (IS_ROOT(victim)) 2419 if (IS_ROOT(victim))
2420 return -EBUSY; 2420 return -EBUSY;
2421 } else if (d_is_directory(victim) || d_is_autodir(victim)) 2421 } else if (d_is_dir(victim))
2422 return -EISDIR; 2422 return -EISDIR;
2423 if (IS_DEADDIR(dir)) 2423 if (IS_DEADDIR(dir))
2424 return -ENOENT; 2424 return -ENOENT;
@@ -2569,7 +2569,7 @@ static int handle_truncate(struct file *filp)
2569 /* 2569 /*
2570 * Refuse to truncate files with mandatory locks held on them. 2570 * Refuse to truncate files with mandatory locks held on them.
2571 */ 2571 */
2572 error = locks_verify_locked(inode); 2572 error = locks_verify_locked(filp);
2573 if (!error) 2573 if (!error)
2574 error = security_path_truncate(path); 2574 error = security_path_truncate(path);
2575 if (!error) { 2575 if (!error) {
@@ -3016,11 +3016,10 @@ finish_open:
3016 } 3016 }
3017 audit_inode(name, nd->path.dentry, 0); 3017 audit_inode(name, nd->path.dentry, 0);
3018 error = -EISDIR; 3018 error = -EISDIR;
3019 if ((open_flag & O_CREAT) && 3019 if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
3020 (d_is_directory(nd->path.dentry) || d_is_autodir(nd->path.dentry)))
3021 goto out; 3020 goto out;
3022 error = -ENOTDIR; 3021 error = -ENOTDIR;
3023 if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry)) 3022 if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3024 goto out; 3023 goto out;
3025 if (!S_ISREG(nd->inode->i_mode)) 3024 if (!S_ISREG(nd->inode->i_mode))
3026 will_truncate = false; 3025 will_truncate = false;
@@ -3744,7 +3743,7 @@ exit1:
3744slashes: 3743slashes:
3745 if (d_is_negative(dentry)) 3744 if (d_is_negative(dentry))
3746 error = -ENOENT; 3745 error = -ENOENT;
3747 else if (d_is_directory(dentry) || d_is_autodir(dentry)) 3746 else if (d_is_dir(dentry))
3748 error = -EISDIR; 3747 error = -EISDIR;
3749 else 3748 else
3750 error = -ENOTDIR; 3749 error = -ENOTDIR;
@@ -3974,7 +3973,28 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
3974 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 3973 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
3975} 3974}
3976 3975
3977/* 3976/**
3977 * vfs_rename - rename a filesystem object
3978 * @old_dir: parent of source
3979 * @old_dentry: source
3980 * @new_dir: parent of destination
3981 * @new_dentry: destination
3982 * @delegated_inode: returns an inode needing a delegation break
3983 * @flags: rename flags
3984 *
 3985 * The caller must hold multiple mutexes--see lock_rename().
3986 *
3987 * If vfs_rename discovers a delegation in need of breaking at either
3988 * the source or destination, it will return -EWOULDBLOCK and return a
3989 * reference to the inode in delegated_inode. The caller should then
3990 * break the delegation and retry. Because breaking a delegation may
3991 * take a long time, the caller should drop all locks before doing
3992 * so.
3993 *
3994 * Alternatively, a caller may pass NULL for delegated_inode. This may
3995 * be appropriate for callers that expect the underlying filesystem not
3996 * to be NFS exported.
3997 *
3978 * The worst of all namespace operations - renaming directory. "Perverted" 3998 * The worst of all namespace operations - renaming directory. "Perverted"
3979 * doesn't even start to describe it. Somebody in UCB had a heck of a trip... 3999 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
3980 * Problems: 4000 * Problems:
@@ -4002,163 +4022,139 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
4002 * ->i_mutex on parents, which works but leads to some truly excessive 4022 * ->i_mutex on parents, which works but leads to some truly excessive
4003 * locking]. 4023 * locking].
4004 */ 4024 */
4005static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, 4025int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4006 struct inode *new_dir, struct dentry *new_dentry) 4026 struct inode *new_dir, struct dentry *new_dentry,
4027 struct inode **delegated_inode, unsigned int flags)
4007{ 4028{
4008 int error = 0; 4029 int error;
4030 bool is_dir = d_is_dir(old_dentry);
4031 const unsigned char *old_name;
4032 struct inode *source = old_dentry->d_inode;
4009 struct inode *target = new_dentry->d_inode; 4033 struct inode *target = new_dentry->d_inode;
4034 bool new_is_dir = false;
4010 unsigned max_links = new_dir->i_sb->s_max_links; 4035 unsigned max_links = new_dir->i_sb->s_max_links;
4011 4036
4037 if (source == target)
4038 return 0;
4039
4040 error = may_delete(old_dir, old_dentry, is_dir);
4041 if (error)
4042 return error;
4043
4044 if (!target) {
4045 error = may_create(new_dir, new_dentry);
4046 } else {
4047 new_is_dir = d_is_dir(new_dentry);
4048
4049 if (!(flags & RENAME_EXCHANGE))
4050 error = may_delete(new_dir, new_dentry, is_dir);
4051 else
4052 error = may_delete(new_dir, new_dentry, new_is_dir);
4053 }
4054 if (error)
4055 return error;
4056
4057 if (!old_dir->i_op->rename)
4058 return -EPERM;
4059
4060 if (flags && !old_dir->i_op->rename2)
4061 return -EINVAL;
4062
4012 /* 4063 /*
4013 * If we are going to change the parent - check write permissions, 4064 * If we are going to change the parent - check write permissions,
4014 * we'll need to flip '..'. 4065 * we'll need to flip '..'.
4015 */ 4066 */
4016 if (new_dir != old_dir) { 4067 if (new_dir != old_dir) {
4017 error = inode_permission(old_dentry->d_inode, MAY_WRITE); 4068 if (is_dir) {
4018 if (error) 4069 error = inode_permission(source, MAY_WRITE);
4019 return error; 4070 if (error)
4071 return error;
4072 }
4073 if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4074 error = inode_permission(target, MAY_WRITE);
4075 if (error)
4076 return error;
4077 }
4020 } 4078 }
4021 4079
4022 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 4080 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
4081 flags);
4023 if (error) 4082 if (error)
4024 return error; 4083 return error;
4025 4084
4085 old_name = fsnotify_oldname_init(old_dentry->d_name.name);
4026 dget(new_dentry); 4086 dget(new_dentry);
4027 if (target) 4087 if (!is_dir || (flags & RENAME_EXCHANGE))
4088 lock_two_nondirectories(source, target);
4089 else if (target)
4028 mutex_lock(&target->i_mutex); 4090 mutex_lock(&target->i_mutex);
4029 4091
4030 error = -EBUSY; 4092 error = -EBUSY;
4031 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) 4093 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
4032 goto out; 4094 goto out;
4033 4095
4034 error = -EMLINK; 4096 if (max_links && new_dir != old_dir) {
4035 if (max_links && !target && new_dir != old_dir && 4097 error = -EMLINK;
4036 new_dir->i_nlink >= max_links) 4098 if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4037 goto out; 4099 goto out;
4038 4100 if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
4039 if (target) 4101 old_dir->i_nlink >= max_links)
4102 goto out;
4103 }
4104 if (is_dir && !(flags & RENAME_EXCHANGE) && target)
4040 shrink_dcache_parent(new_dentry); 4105 shrink_dcache_parent(new_dentry);
4041 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 4106 if (!is_dir) {
4042 if (error) 4107 error = try_break_deleg(source, delegated_inode);
4043 goto out; 4108 if (error)
4044 4109 goto out;
4045 if (target) {
4046 target->i_flags |= S_DEAD;
4047 dont_mount(new_dentry);
4048 } 4110 }
4049out: 4111 if (target && !new_is_dir) {
4050 if (target)
4051 mutex_unlock(&target->i_mutex);
4052 dput(new_dentry);
4053 if (!error)
4054 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
4055 d_move(old_dentry,new_dentry);
4056 return error;
4057}
4058
4059static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
4060 struct inode *new_dir, struct dentry *new_dentry,
4061 struct inode **delegated_inode)
4062{
4063 struct inode *target = new_dentry->d_inode;
4064 struct inode *source = old_dentry->d_inode;
4065 int error;
4066
4067 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
4068 if (error)
4069 return error;
4070
4071 dget(new_dentry);
4072 lock_two_nondirectories(source, target);
4073
4074 error = -EBUSY;
4075 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
4076 goto out;
4077
4078 error = try_break_deleg(source, delegated_inode);
4079 if (error)
4080 goto out;
4081 if (target) {
4082 error = try_break_deleg(target, delegated_inode); 4112 error = try_break_deleg(target, delegated_inode);
4083 if (error) 4113 if (error)
4084 goto out; 4114 goto out;
4085 } 4115 }
4086 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 4116 if (!flags) {
4117 error = old_dir->i_op->rename(old_dir, old_dentry,
4118 new_dir, new_dentry);
4119 } else {
4120 error = old_dir->i_op->rename2(old_dir, old_dentry,
4121 new_dir, new_dentry, flags);
4122 }
4087 if (error) 4123 if (error)
4088 goto out; 4124 goto out;
4089 4125
4090 if (target) 4126 if (!(flags & RENAME_EXCHANGE) && target) {
4127 if (is_dir)
4128 target->i_flags |= S_DEAD;
4091 dont_mount(new_dentry); 4129 dont_mount(new_dentry);
4092 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 4130 }
4093 d_move(old_dentry, new_dentry); 4131 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
4132 if (!(flags & RENAME_EXCHANGE))
4133 d_move(old_dentry, new_dentry);
4134 else
4135 d_exchange(old_dentry, new_dentry);
4136 }
4094out: 4137out:
4095 unlock_two_nondirectories(source, target); 4138 if (!is_dir || (flags & RENAME_EXCHANGE))
4139 unlock_two_nondirectories(source, target);
4140 else if (target)
4141 mutex_unlock(&target->i_mutex);
4096 dput(new_dentry); 4142 dput(new_dentry);
4097 return error; 4143 if (!error) {
4098}
4099
4100/**
4101 * vfs_rename - rename a filesystem object
4102 * @old_dir: parent of source
4103 * @old_dentry: source
4104 * @new_dir: parent of destination
4105 * @new_dentry: destination
4106 * @delegated_inode: returns an inode needing a delegation break
4107 *
4108 * The caller must hold multiple mutexes--see lock_rename()).
4109 *
4110 * If vfs_rename discovers a delegation in need of breaking at either
4111 * the source or destination, it will return -EWOULDBLOCK and return a
4112 * reference to the inode in delegated_inode. The caller should then
4113 * break the delegation and retry. Because breaking a delegation may
4114 * take a long time, the caller should drop all locks before doing
4115 * so.
4116 *
4117 * Alternatively, a caller may pass NULL for delegated_inode. This may
4118 * be appropriate for callers that expect the underlying filesystem not
4119 * to be NFS exported.
4120 */
4121int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4122 struct inode *new_dir, struct dentry *new_dentry,
4123 struct inode **delegated_inode)
4124{
4125 int error;
4126 int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry);
4127 const unsigned char *old_name;
4128
4129 if (old_dentry->d_inode == new_dentry->d_inode)
4130 return 0;
4131
4132 error = may_delete(old_dir, old_dentry, is_dir);
4133 if (error)
4134 return error;
4135
4136 if (!new_dentry->d_inode)
4137 error = may_create(new_dir, new_dentry);
4138 else
4139 error = may_delete(new_dir, new_dentry, is_dir);
4140 if (error)
4141 return error;
4142
4143 if (!old_dir->i_op->rename)
4144 return -EPERM;
4145
4146 old_name = fsnotify_oldname_init(old_dentry->d_name.name);
4147
4148 if (is_dir)
4149 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
4150 else
4151 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode);
4152 if (!error)
4153 fsnotify_move(old_dir, new_dir, old_name, is_dir, 4144 fsnotify_move(old_dir, new_dir, old_name, is_dir,
4154 new_dentry->d_inode, old_dentry); 4145 !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
4146 if (flags & RENAME_EXCHANGE) {
4147 fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
4148 new_is_dir, NULL, new_dentry);
4149 }
4150 }
4155 fsnotify_oldname_free(old_name); 4151 fsnotify_oldname_free(old_name);
4156 4152
4157 return error; 4153 return error;
4158} 4154}
4159 4155
4160SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 4156SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
4161 int, newdfd, const char __user *, newname) 4157 int, newdfd, const char __user *, newname, unsigned int, flags)
4162{ 4158{
4163 struct dentry *old_dir, *new_dir; 4159 struct dentry *old_dir, *new_dir;
4164 struct dentry *old_dentry, *new_dentry; 4160 struct dentry *old_dentry, *new_dentry;
@@ -4170,6 +4166,13 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
4170 unsigned int lookup_flags = 0; 4166 unsigned int lookup_flags = 0;
4171 bool should_retry = false; 4167 bool should_retry = false;
4172 int error; 4168 int error;
4169
4170 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
4171 return -EINVAL;
4172
4173 if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE))
4174 return -EINVAL;
4175
4173retry: 4176retry:
4174 from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); 4177 from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
4175 if (IS_ERR(from)) { 4178 if (IS_ERR(from)) {
@@ -4193,6 +4196,8 @@ retry:
4193 goto exit2; 4196 goto exit2;
4194 4197
4195 new_dir = newnd.path.dentry; 4198 new_dir = newnd.path.dentry;
4199 if (flags & RENAME_NOREPLACE)
4200 error = -EEXIST;
4196 if (newnd.last_type != LAST_NORM) 4201 if (newnd.last_type != LAST_NORM)
4197 goto exit2; 4202 goto exit2;
4198 4203
@@ -4202,7 +4207,8 @@ retry:
4202 4207
4203 oldnd.flags &= ~LOOKUP_PARENT; 4208 oldnd.flags &= ~LOOKUP_PARENT;
4204 newnd.flags &= ~LOOKUP_PARENT; 4209 newnd.flags &= ~LOOKUP_PARENT;
4205 newnd.flags |= LOOKUP_RENAME_TARGET; 4210 if (!(flags & RENAME_EXCHANGE))
4211 newnd.flags |= LOOKUP_RENAME_TARGET;
4206 4212
4207retry_deleg: 4213retry_deleg:
4208 trap = lock_rename(new_dir, old_dir); 4214 trap = lock_rename(new_dir, old_dir);
@@ -4215,34 +4221,49 @@ retry_deleg:
4215 error = -ENOENT; 4221 error = -ENOENT;
4216 if (d_is_negative(old_dentry)) 4222 if (d_is_negative(old_dentry))
4217 goto exit4; 4223 goto exit4;
4224 new_dentry = lookup_hash(&newnd);
4225 error = PTR_ERR(new_dentry);
4226 if (IS_ERR(new_dentry))
4227 goto exit4;
4228 error = -EEXIST;
4229 if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
4230 goto exit5;
4231 if (flags & RENAME_EXCHANGE) {
4232 error = -ENOENT;
4233 if (d_is_negative(new_dentry))
4234 goto exit5;
4235
4236 if (!d_is_dir(new_dentry)) {
4237 error = -ENOTDIR;
4238 if (newnd.last.name[newnd.last.len])
4239 goto exit5;
4240 }
4241 }
4218 /* unless the source is a directory trailing slashes give -ENOTDIR */ 4242 /* unless the source is a directory trailing slashes give -ENOTDIR */
4219 if (!d_is_directory(old_dentry) && !d_is_autodir(old_dentry)) { 4243 if (!d_is_dir(old_dentry)) {
4220 error = -ENOTDIR; 4244 error = -ENOTDIR;
4221 if (oldnd.last.name[oldnd.last.len]) 4245 if (oldnd.last.name[oldnd.last.len])
4222 goto exit4; 4246 goto exit5;
4223 if (newnd.last.name[newnd.last.len]) 4247 if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
4224 goto exit4; 4248 goto exit5;
4225 } 4249 }
4226 /* source should not be ancestor of target */ 4250 /* source should not be ancestor of target */
4227 error = -EINVAL; 4251 error = -EINVAL;
4228 if (old_dentry == trap) 4252 if (old_dentry == trap)
4229 goto exit4; 4253 goto exit5;
4230 new_dentry = lookup_hash(&newnd);
4231 error = PTR_ERR(new_dentry);
4232 if (IS_ERR(new_dentry))
4233 goto exit4;
4234 /* target should not be an ancestor of source */ 4254 /* target should not be an ancestor of source */
4235 error = -ENOTEMPTY; 4255 if (!(flags & RENAME_EXCHANGE))
4256 error = -ENOTEMPTY;
4236 if (new_dentry == trap) 4257 if (new_dentry == trap)
4237 goto exit5; 4258 goto exit5;
4238 4259
4239 error = security_path_rename(&oldnd.path, old_dentry, 4260 error = security_path_rename(&oldnd.path, old_dentry,
4240 &newnd.path, new_dentry); 4261 &newnd.path, new_dentry, flags);
4241 if (error) 4262 if (error)
4242 goto exit5; 4263 goto exit5;
4243 error = vfs_rename(old_dir->d_inode, old_dentry, 4264 error = vfs_rename(old_dir->d_inode, old_dentry,
4244 new_dir->d_inode, new_dentry, 4265 new_dir->d_inode, new_dentry,
4245 &delegated_inode); 4266 &delegated_inode, flags);
4246exit5: 4267exit5:
4247 dput(new_dentry); 4268 dput(new_dentry);
4248exit4: 4269exit4:
@@ -4272,9 +4293,15 @@ exit:
4272 return error; 4293 return error;
4273} 4294}
4274 4295
4296SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
4297 int, newdfd, const char __user *, newname)
4298{
4299 return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
4300}
4301
4275SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 4302SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
4276{ 4303{
4277 return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); 4304 return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4278} 4305}
4279 4306
4280int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) 4307int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
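For illustration, a minimal userspace sketch of the two flag semantics wired up above, assuming a kernel carrying this patch and calling the syscall directly via SYS_renameat2 (contemporary glibc ships no wrapper); the path names "a", "b", "c" are placeholders:

#include <fcntl.h>          /* AT_FDCWD */
#include <stdio.h>
#include <sys/syscall.h>    /* SYS_renameat2, assumed present in the headers */
#include <unistd.h>

#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0)   /* fail with EEXIST if target exists */
#endif
#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE  (1 << 1)   /* atomically exchange source and target */
#endif

static long my_renameat2(int olddfd, const char *oldpath,
                         int newdfd, const char *newpath, unsigned int flags)
{
        return syscall(SYS_renameat2, olddfd, oldpath, newdfd, newpath, flags);
}

int main(void)
{
        /* Refuses to clobber "b" if it already exists (EEXIST). */
        if (my_renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_NOREPLACE) < 0)
                perror("RENAME_NOREPLACE");

        /* Atomically swaps "b" and "c"; both must exist (ENOENT otherwise). */
        if (my_renameat2(AT_FDCWD, "b", AT_FDCWD, "c", RENAME_EXCHANGE) < 0)
                perror("RENAME_EXCHANGE");
        return 0;
}

RENAME_NOREPLACE turns the classic stat-then-rename race into a single atomic operation, and RENAME_EXCHANGE swaps both names under the same rename lock, which is why the exchange path above skips the -ENOTEMPTY trap check for the target.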
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 2cf2ebecb55f..647d86d2db39 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -99,6 +99,7 @@ static void destroy_inodecache(void)
99 99
100static int ncp_remount(struct super_block *sb, int *flags, char* data) 100static int ncp_remount(struct super_block *sb, int *flags, char* data)
101{ 101{
102 sync_filesystem(sb);
102 *flags |= MS_NODIRATIME; 103 *flags |= MS_NODIRATIME;
103 return 0; 104 return 0;
104} 105}
@@ -296,7 +297,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
296static void 297static void
297ncp_evict_inode(struct inode *inode) 298ncp_evict_inode(struct inode *inode)
298{ 299{
299 truncate_inode_pages(&inode->i_data, 0); 300 truncate_inode_pages_final(&inode->i_data);
300 clear_inode(inode); 301 clear_inode(inode);
301 302
302 if (S_ISDIR(inode->i_mode)) { 303 if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 56ff823ca82e..65d849bdf77a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1213,7 +1213,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
1213 end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); 1213 end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
1214 if (end != NFS_I(inode)->npages) { 1214 if (end != NFS_I(inode)->npages) {
1215 rcu_read_lock(); 1215 rcu_read_lock();
1216 end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX); 1216 end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
1217 rcu_read_unlock(); 1217 rcu_read_unlock();
1218 } 1218 }
1219 1219
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 360114ae8b82..c4702baa22b8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(nfs_clear_inode);
128 128
129void nfs_evict_inode(struct inode *inode) 129void nfs_evict_inode(struct inode *inode)
130{ 130{
131 truncate_inode_pages(&inode->i_data, 0); 131 truncate_inode_pages_final(&inode->i_data);
132 clear_inode(inode); 132 clear_inode(inode);
133 nfs_clear_inode(inode); 133 nfs_clear_inode(inode);
134} 134}
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 808f29574412..6f340f02f2ba 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -90,7 +90,7 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
90 */ 90 */
91static void nfs4_evict_inode(struct inode *inode) 91static void nfs4_evict_inode(struct inode *inode)
92{ 92{
93 truncate_inode_pages(&inode->i_data, 0); 93 truncate_inode_pages_final(&inode->i_data);
94 clear_inode(inode); 94 clear_inode(inode);
95 pnfs_return_layout(inode); 95 pnfs_return_layout(inode);
96 pnfs_destroy_layout(NFS_I(inode)); 96 pnfs_destroy_layout(NFS_I(inode));
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 910ed906eb82..2cb56943e232 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2215,6 +2215,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; 2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version; 2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version;
2217 2217
2218 sync_filesystem(sb);
2219
2218 /* 2220 /*
2219 * Userspace mount programs that send binary options generally send 2221 * Userspace mount programs that send binary options generally send
2220 * them populated with default values. We have no way to know which 2222 * them populated with default values. We have no way to know which
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 06cddd572264..2645be435e75 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -71,10 +71,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
71 if (gid_eq(new->fsgid, INVALID_GID)) 71 if (gid_eq(new->fsgid, INVALID_GID))
72 new->fsgid = exp->ex_anon_gid; 72 new->fsgid = exp->ex_anon_gid;
73 73
74 ret = set_groups(new, gi); 74 set_groups(new, gi);
75 put_group_info(gi); 75 put_group_info(gi);
76 if (ret < 0)
77 goto error;
78 76
79 if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) 77 if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID))
80 new->cap_effective = cap_drop_nfsd_set(new->cap_effective); 78 new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
@@ -89,7 +87,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
89 87
90oom: 88oom:
91 ret = -ENOMEM; 89 ret = -ENOMEM;
92error:
93 abort_creds(new); 90 abort_creds(new);
94 return ret; 91 return ret;
95} 92}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6d7be3f80356..915808b36df7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1694,7 +1694,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1694 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) 1694 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
1695 goto out_dput_new; 1695 goto out_dput_new;
1696 1696
1697 host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); 1697 host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
1698 if (!host_err) { 1698 if (!host_err) {
1699 host_err = commit_metadata(tfhp); 1699 host_err = commit_metadata(tfhp);
1700 if (!host_err) 1700 if (!host_err)
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index deaa3d33a0aa..0d58075f34e2 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -942,6 +942,18 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
942 struct inode *cpfile; 942 struct inode *cpfile;
943 int err; 943 int err;
944 944
945 if (cpsize > sb->s_blocksize) {
946 printk(KERN_ERR
947 "NILFS: too large checkpoint size: %zu bytes.\n",
948 cpsize);
949 return -EINVAL;
950 } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
951 printk(KERN_ERR
952 "NILFS: too small checkpoint size: %zu bytes.\n",
953 cpsize);
954 return -EINVAL;
955 }
956
945 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); 957 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
946 if (unlikely(!cpfile)) 958 if (unlikely(!cpfile))
947 return -ENOMEM; 959 return -ENOMEM;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index fa0f80308c2d..0d5fada91191 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -484,6 +484,18 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
484 struct nilfs_dat_info *di; 484 struct nilfs_dat_info *di;
485 int err; 485 int err;
486 486
487 if (entry_size > sb->s_blocksize) {
488 printk(KERN_ERR
489 "NILFS: too large DAT entry size: %zu bytes.\n",
490 entry_size);
491 return -EINVAL;
492 } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
493 printk(KERN_ERR
494 "NILFS: too small DAT entry size: %zu bytes.\n",
495 entry_size);
496 return -EINVAL;
497 }
498
487 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); 499 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
488 if (unlikely(!dat)) 500 if (unlikely(!dat))
489 return -ENOMEM; 501 return -ENOMEM;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7e350c562e0e..b9c5726120e3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -783,16 +783,14 @@ void nilfs_evict_inode(struct inode *inode)
783 int ret; 783 int ret;
784 784
785 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 785 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
786 if (inode->i_data.nrpages) 786 truncate_inode_pages_final(&inode->i_data);
787 truncate_inode_pages(&inode->i_data, 0);
788 clear_inode(inode); 787 clear_inode(inode);
789 nilfs_clear_inode(inode); 788 nilfs_clear_inode(inode);
790 return; 789 return;
791 } 790 }
792 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 791 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
793 792
794 if (inode->i_data.nrpages) 793 truncate_inode_pages_final(&inode->i_data);
795 truncate_inode_pages(&inode->i_data, 0);
796 794
797 /* TODO: some of the following operations may fail. */ 795 /* TODO: some of the following operations may fail. */
798 nilfs_truncate_bmap(ii, 0); 796 nilfs_truncate_bmap(ii, 0);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 2b34021948e4..422fb54b7377 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1072,6 +1072,48 @@ out:
1072} 1072}
1073 1073
1074/** 1074/**
1075 * nilfs_ioctl_trim_fs() - FITRIM ioctl handler function
1076 * @inode: inode object
1077 * @argp: pointer to argument from userspace
1078 *
1079 * Description: nilfs_ioctl_trim_fs is the FITRIM ioctl handler. It
1080 * checks the arguments from userspace and calls nilfs_sufile_trim_fs,
1081 * which performs the actual trim operation.
1082 *
1083 * Return Value: Returns 0 on success, or a negative error code on failure.
1084 */
1085static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
1086{
1087 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
1088 struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
1089 struct fstrim_range range;
1090 int ret;
1091
1092 if (!capable(CAP_SYS_ADMIN))
1093 return -EPERM;
1094
1095 if (!blk_queue_discard(q))
1096 return -EOPNOTSUPP;
1097
1098 if (copy_from_user(&range, argp, sizeof(range)))
1099 return -EFAULT;
1100
1101 range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity);
1102
1103 down_read(&nilfs->ns_segctor_sem);
1104 ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
1105 up_read(&nilfs->ns_segctor_sem);
1106
1107 if (ret < 0)
1108 return ret;
1109
1110 if (copy_to_user(argp, &range, sizeof(range)))
1111 return -EFAULT;
1112
1113 return 0;
1114}
1115
1116/**
1075 * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated 1117 * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
1076 * @inode: inode object 1118 * @inode: inode object
1077 * @argp: pointer on argument from userspace 1119 * @argp: pointer on argument from userspace
@@ -1163,6 +1205,95 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
1163 return ret; 1205 return ret;
1164} 1206}
1165 1207
1208/**
1209 * nilfs_ioctl_set_suinfo - set segment usage info
1210 * @inode: inode object
1211 * @filp: file object
1212 * @cmd: ioctl's request code
1213 * @argp: pointer to argument from userspace
1214 *
1215 * Description: Expects an array of nilfs_suinfo_update structures
1216 * encapsulated in nilfs_argv and updates the segment usage info
1217 * according to the flags in nilfs_suinfo_update.
1218 *
1219 * Return Value: On success, 0 is returned. On error, one of the
1220 * following negative error codes is returned.
1221 *
1222 * %-EPERM - Not enough permissions
1223 *
1224 * %-EFAULT - Error copying input data
1225 *
1226 * %-EIO - I/O error.
1227 *
1228 * %-ENOMEM - Insufficient amount of memory available.
1229 *
1230 * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
1231 */
1232static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp,
1233 unsigned int cmd, void __user *argp)
1234{
1235 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
1236 struct nilfs_transaction_info ti;
1237 struct nilfs_argv argv;
1238 size_t len;
1239 void __user *base;
1240 void *kbuf;
1241 int ret;
1242
1243 if (!capable(CAP_SYS_ADMIN))
1244 return -EPERM;
1245
1246 ret = mnt_want_write_file(filp);
1247 if (ret)
1248 return ret;
1249
1250 ret = -EFAULT;
1251 if (copy_from_user(&argv, argp, sizeof(argv)))
1252 goto out;
1253
1254 ret = -EINVAL;
1255 if (argv.v_size < sizeof(struct nilfs_suinfo_update))
1256 goto out;
1257
1258 if (argv.v_nmembs > nilfs->ns_nsegments)
1259 goto out;
1260
1261 if (argv.v_nmembs >= UINT_MAX / argv.v_size)
1262 goto out;
1263
1264 len = argv.v_size * argv.v_nmembs;
1265 if (!len) {
1266 ret = 0;
1267 goto out;
1268 }
1269
1270 base = (void __user *)(unsigned long)argv.v_base;
1271 kbuf = vmalloc(len);
1272 if (!kbuf) {
1273 ret = -ENOMEM;
1274 goto out;
1275 }
1276
1277 if (copy_from_user(kbuf, base, len)) {
1278 ret = -EFAULT;
1279 goto out_free;
1280 }
1281
1282 nilfs_transaction_begin(inode->i_sb, &ti, 0);
1283 ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size,
1284 argv.v_nmembs);
1285 if (unlikely(ret < 0))
1286 nilfs_transaction_abort(inode->i_sb);
1287 else
1288 nilfs_transaction_commit(inode->i_sb); /* never fails */
1289
1290out_free:
1291 vfree(kbuf);
1292out:
1293 mnt_drop_write_file(filp);
1294 return ret;
1295}
1296
1166long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 1297long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1167{ 1298{
1168 struct inode *inode = file_inode(filp); 1299 struct inode *inode = file_inode(filp);
@@ -1189,6 +1320,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1189 return nilfs_ioctl_get_info(inode, filp, cmd, argp, 1320 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
1190 sizeof(struct nilfs_suinfo), 1321 sizeof(struct nilfs_suinfo),
1191 nilfs_ioctl_do_get_suinfo); 1322 nilfs_ioctl_do_get_suinfo);
1323 case NILFS_IOCTL_SET_SUINFO:
1324 return nilfs_ioctl_set_suinfo(inode, filp, cmd, argp);
1192 case NILFS_IOCTL_GET_SUSTAT: 1325 case NILFS_IOCTL_GET_SUSTAT:
1193 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); 1326 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
1194 case NILFS_IOCTL_GET_VINFO: 1327 case NILFS_IOCTL_GET_VINFO:
@@ -1205,6 +1338,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1205 return nilfs_ioctl_resize(inode, filp, argp); 1338 return nilfs_ioctl_resize(inode, filp, argp);
1206 case NILFS_IOCTL_SET_ALLOC_RANGE: 1339 case NILFS_IOCTL_SET_ALLOC_RANGE:
1207 return nilfs_ioctl_set_alloc_range(inode, argp); 1340 return nilfs_ioctl_set_alloc_range(inode, argp);
1341 case FITRIM:
1342 return nilfs_ioctl_trim_fs(inode, argp);
1208 default: 1343 default:
1209 return -ENOTTY; 1344 return -ENOTTY;
1210 } 1345 }
@@ -1228,6 +1363,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1228 case NILFS_IOCTL_GET_CPINFO: 1363 case NILFS_IOCTL_GET_CPINFO:
1229 case NILFS_IOCTL_GET_CPSTAT: 1364 case NILFS_IOCTL_GET_CPSTAT:
1230 case NILFS_IOCTL_GET_SUINFO: 1365 case NILFS_IOCTL_GET_SUINFO:
1366 case NILFS_IOCTL_SET_SUINFO:
1231 case NILFS_IOCTL_GET_SUSTAT: 1367 case NILFS_IOCTL_GET_SUSTAT:
1232 case NILFS_IOCTL_GET_VINFO: 1368 case NILFS_IOCTL_GET_VINFO:
1233 case NILFS_IOCTL_GET_BDESCS: 1369 case NILFS_IOCTL_GET_BDESCS:
@@ -1235,6 +1371,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1235 case NILFS_IOCTL_SYNC: 1371 case NILFS_IOCTL_SYNC:
1236 case NILFS_IOCTL_RESIZE: 1372 case NILFS_IOCTL_RESIZE:
1237 case NILFS_IOCTL_SET_ALLOC_RANGE: 1373 case NILFS_IOCTL_SET_ALLOC_RANGE:
1374 case FITRIM:
1238 break; 1375 break;
1239 default: 1376 default:
1240 return -ENOIOCTLCMD; 1377 return -ENOIOCTLCMD;
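A hedged sketch of driving the FITRIM path added above from userspace; struct fstrim_range and the FITRIM request code come from <linux/fs.h>, "/mnt/nilfs" is a placeholder mount point, and CAP_SYS_ADMIN plus a discard-capable device are required, per the checks in nilfs_ioctl_trim_fs():

#include <fcntl.h>
#include <linux/fs.h>       /* FITRIM, struct fstrim_range */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        struct fstrim_range range = {
                .start  = 0,
                .len    = ~0ULL,  /* whole filesystem; kernel clamps the range */
                .minlen = 0,      /* raised to discard_granularity by the handler */
        };
        int fd = open("/mnt/nilfs", O_RDONLY);   /* placeholder mount point */

        if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
                perror("FITRIM");
                return 1;
        }
        /* On return the kernel has updated range.len to the bytes discarded. */
        printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
}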
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3127e9f438a7..2a869c35c362 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -870,6 +870,289 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
870} 870}
871 871
872/** 872/**
873 * nilfs_sufile_set_suinfo - sets segment usage info
874 * @sufile: inode of segment usage file
875 * @buf: array of suinfo_update
876 * @supsz: byte size of suinfo_update
877 * @nsup: number of entries in the suinfo_update array
878 *
879 * Description: Takes an array of nilfs_suinfo_update structs and updates
880 * segment usage accordingly. Only the fields indicated by the sup_flags
881 * are updated.
882 *
883 * Return Value: On success, 0 is returned. On error, one of the
884 * following negative error codes is returned.
885 *
886 * %-EIO - I/O error.
887 *
888 * %-ENOMEM - Insufficient amount of memory available.
889 *
890 * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
891 */
892ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
893 unsigned supsz, size_t nsup)
894{
895 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
896 struct buffer_head *header_bh, *bh;
897 struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup;
898 struct nilfs_segment_usage *su;
899 void *kaddr;
900 unsigned long blkoff, prev_blkoff;
901 int cleansi, cleansu, dirtysi, dirtysu;
902 long ncleaned = 0, ndirtied = 0;
903 int ret = 0;
904
905 if (unlikely(nsup == 0))
906 return ret;
907
908 for (sup = buf; sup < supend; sup = (void *)sup + supsz) {
909 if (sup->sup_segnum >= nilfs->ns_nsegments
910 || (sup->sup_flags &
911 (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS))
912 || (nilfs_suinfo_update_nblocks(sup) &&
913 sup->sup_sui.sui_nblocks >
914 nilfs->ns_blocks_per_segment))
915 return -EINVAL;
916 }
917
918 down_write(&NILFS_MDT(sufile)->mi_sem);
919
920 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
921 if (ret < 0)
922 goto out_sem;
923
924 sup = buf;
925 blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
926 ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
927 if (ret < 0)
928 goto out_header;
929
930 for (;;) {
931 kaddr = kmap_atomic(bh->b_page);
932 su = nilfs_sufile_block_get_segment_usage(
933 sufile, sup->sup_segnum, bh, kaddr);
934
935 if (nilfs_suinfo_update_lastmod(sup))
936 su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod);
937
938 if (nilfs_suinfo_update_nblocks(sup))
939 su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks);
940
941 if (nilfs_suinfo_update_flags(sup)) {
942 /*
943 * The active flag is a virtual flag projected by the
944 * running nilfs kernel code - drop it here so that it
945 * is not written to disk.
946 */
947 sup->sup_sui.sui_flags &=
948 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
949
950 cleansi = nilfs_suinfo_clean(&sup->sup_sui);
951 cleansu = nilfs_segment_usage_clean(su);
952 dirtysi = nilfs_suinfo_dirty(&sup->sup_sui);
953 dirtysu = nilfs_segment_usage_dirty(su);
954
955 if (cleansi && !cleansu)
956 ++ncleaned;
957 else if (!cleansi && cleansu)
958 --ncleaned;
959
960 if (dirtysi && !dirtysu)
961 ++ndirtied;
962 else if (!dirtysi && dirtysu)
963 --ndirtied;
964
965 su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
966 }
967
968 kunmap_atomic(kaddr);
969
970 sup = (void *)sup + supsz;
971 if (sup >= supend)
972 break;
973
974 prev_blkoff = blkoff;
975 blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
976 if (blkoff == prev_blkoff)
977 continue;
978
979 /* get different block */
980 mark_buffer_dirty(bh);
981 put_bh(bh);
982 ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
983 if (unlikely(ret < 0))
984 goto out_mark;
985 }
986 mark_buffer_dirty(bh);
987 put_bh(bh);
988
989 out_mark:
990 if (ncleaned || ndirtied) {
991 nilfs_sufile_mod_counter(header_bh, (u64)ncleaned,
992 (u64)ndirtied);
993 NILFS_SUI(sufile)->ncleansegs += ncleaned;
994 }
995 nilfs_mdt_mark_dirty(sufile);
996 out_header:
997 put_bh(header_bh);
998 out_sem:
999 up_write(&NILFS_MDT(sufile)->mi_sem);
1000 return ret;
1001}
1002
1003/**
1004 * nilfs_sufile_trim_fs() - trim clean segments in a byte range
1005 * @sufile: inode of segment usage file
1006 * @range: fstrim_range structure
1007 *
1008 * start: first byte to trim
1009 * len: number of bytes to trim from start
1010 * minlen: minimum extent length in bytes
1011 *
1012 * Description: nilfs_sufile_trim_fs goes through all segments containing bytes
1013 * from start to start+len. start is rounded up to the next block boundary
1014 * and start+len is rounded down. blkdev_issue_discard is invoked for each
1015 * clean segment.
1016 *
1017 * Return Value: Returns 0 on success, or a negative error code on failure.
1018 */
1019int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
1020{
1021 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
1022 struct buffer_head *su_bh;
1023 struct nilfs_segment_usage *su;
1024 void *kaddr;
1025 size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
1026 sector_t seg_start, seg_end, start_block, end_block;
1027 sector_t start = 0, nblocks = 0;
1028 u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0;
1029 int ret = 0;
1030 unsigned int sects_per_block;
1031
1032 sects_per_block = (1 << nilfs->ns_blocksize_bits) /
1033 bdev_logical_block_size(nilfs->ns_bdev);
1034 len = range->len >> nilfs->ns_blocksize_bits;
1035 minlen = range->minlen >> nilfs->ns_blocksize_bits;
1036 max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment);
1037
1038 if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits)
1039 return -EINVAL;
1040
1041 start_block = (range->start + nilfs->ns_blocksize - 1) >>
1042 nilfs->ns_blocksize_bits;
1043
1044 /*
1045 * range->len can be very large (actually, it is set to
1046 * ULLONG_MAX by default) - truncate upper end of the range
1047 * carefully so as not to overflow.
1048 */
1049 if (max_blocks - start_block < len)
1050 end_block = max_blocks - 1;
1051 else
1052 end_block = start_block + len - 1;
1053
1054 segnum = nilfs_get_segnum_of_block(nilfs, start_block);
1055 segnum_end = nilfs_get_segnum_of_block(nilfs, end_block);
1056
1057 down_read(&NILFS_MDT(sufile)->mi_sem);
1058
1059 while (segnum <= segnum_end) {
1060 n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
1061 segnum_end);
1062
1063 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
1064 &su_bh);
1065 if (ret < 0) {
1066 if (ret != -ENOENT)
1067 goto out_sem;
1068 /* hole */
1069 segnum += n;
1070 continue;
1071 }
1072
1073 kaddr = kmap_atomic(su_bh->b_page);
1074 su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
1075 su_bh, kaddr);
1076 for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
1077 if (!nilfs_segment_usage_clean(su))
1078 continue;
1079
1080 nilfs_get_segment_range(nilfs, segnum, &seg_start,
1081 &seg_end);
1082
1083 if (!nblocks) {
1084 /* start new extent */
1085 start = seg_start;
1086 nblocks = seg_end - seg_start + 1;
1087 continue;
1088 }
1089
1090 if (start + nblocks == seg_start) {
1091 /* add to previous extent */
1092 nblocks += seg_end - seg_start + 1;
1093 continue;
1094 }
1095
1096 /* discard previous extent */
1097 if (start < start_block) {
1098 nblocks -= start_block - start;
1099 start = start_block;
1100 }
1101
1102 if (nblocks >= minlen) {
1103 kunmap_atomic(kaddr);
1104
1105 ret = blkdev_issue_discard(nilfs->ns_bdev,
1106 start * sects_per_block,
1107 nblocks * sects_per_block,
1108 GFP_NOFS, 0);
1109 if (ret < 0) {
1110 put_bh(su_bh);
1111 goto out_sem;
1112 }
1113
1114 ndiscarded += nblocks;
1115 kaddr = kmap_atomic(su_bh->b_page);
1116 su = nilfs_sufile_block_get_segment_usage(
1117 sufile, segnum, su_bh, kaddr);
1118 }
1119
1120 /* start new extent */
1121 start = seg_start;
1122 nblocks = seg_end - seg_start + 1;
1123 }
1124 kunmap_atomic(kaddr);
1125 put_bh(su_bh);
1126 }
1127
1128
1129 if (nblocks) {
1130 /* discard last extent */
1131 if (start < start_block) {
1132 nblocks -= start_block - start;
1133 start = start_block;
1134 }
1135 if (start + nblocks > end_block + 1)
1136 nblocks = end_block - start + 1;
1137
1138 if (nblocks >= minlen) {
1139 ret = blkdev_issue_discard(nilfs->ns_bdev,
1140 start * sects_per_block,
1141 nblocks * sects_per_block,
1142 GFP_NOFS, 0);
1143 if (!ret)
1144 ndiscarded += nblocks;
1145 }
1146 }
1147
1148out_sem:
1149 up_read(&NILFS_MDT(sufile)->mi_sem);
1150
1151 range->len = ndiscarded << nilfs->ns_blocksize_bits;
1152 return ret;
1153}
1154
1155/**
873 * nilfs_sufile_read - read or get sufile inode 1156 * nilfs_sufile_read - read or get sufile inode
874 * @sb: super block instance 1157 * @sb: super block instance
875 * @susize: size of a segment usage entry 1158 * @susize: size of a segment usage entry
@@ -886,6 +1169,18 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
886 void *kaddr; 1169 void *kaddr;
887 int err; 1170 int err;
888 1171
1172 if (susize > sb->s_blocksize) {
1173 printk(KERN_ERR
1174 "NILFS: too large segment usage size: %zu bytes.\n",
1175 susize);
1176 return -EINVAL;
1177 } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
1178 printk(KERN_ERR
1179 "NILFS: too small segment usage size: %zu bytes.\n",
1180 susize);
1181 return -EINVAL;
1182 }
1183
889 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); 1184 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
890 if (unlikely(!sufile)) 1185 if (unlikely(!sufile))
891 return -ENOMEM; 1186 return -ENOMEM;
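A sketch of the matching userspace call for NILFS_IOCTL_SET_SUINFO, assuming the nilfs_argv/nilfs_suinfo_update uapi definitions from this series are exported via <linux/nilfs2_fs.h>; the segment number and mount path are placeholders:

#include <fcntl.h>
#include <linux/nilfs2_fs.h> /* nilfs_argv, nilfs_suinfo_update (this series) */
#include <stdio.h>
#include <sys/ioctl.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        struct nilfs_suinfo_update sup = { 0 };
        struct nilfs_argv argv = { 0 };
        int fd = open("/mnt/nilfs", O_RDONLY);   /* placeholder mount point */

        if (fd < 0)
                return 1;

        sup.sup_segnum = 0;                      /* placeholder segment number */
        sup.sup_flags = 1 << NILFS_SUINFO_UPDATE_LASTMOD;
        sup.sup_sui.sui_lastmod = time(NULL);

        argv.v_base = (unsigned long)&sup;       /* user buffer address */
        argv.v_nmembs = 1;                       /* one entry */
        argv.v_size = sizeof(sup);               /* entry size, validated above */

        if (ioctl(fd, NILFS_IOCTL_SET_SUINFO, &argv) < 0)
                perror("NILFS_IOCTL_SET_SUINFO");
        close(fd);
        return 0;
}

Only the fields selected by sup_flags are applied, mirroring the nilfs_suinfo_update_*() checks in the kernel loop above.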
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index e84bc5b51fc1..b8afd72f2379 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -44,6 +44,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); 44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
45ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, 45ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
46 size_t); 46 size_t);
47ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t);
47 48
48int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, 49int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
49 void (*dofunc)(struct inode *, __u64, 50 void (*dofunc)(struct inode *, __u64,
@@ -65,6 +66,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
65int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); 66int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
66int nilfs_sufile_read(struct super_block *sb, size_t susize, 67int nilfs_sufile_read(struct super_block *sb, size_t susize,
67 struct nilfs_inode *raw_inode, struct inode **inodep); 68 struct nilfs_inode *raw_inode, struct inode **inodep);
69int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);
68 70
69/** 71/**
70 * nilfs_sufile_scrap - make a segment garbage 72 * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7ac2a122ca1d..8c532b2ca3ab 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1129 unsigned long old_mount_opt; 1129 unsigned long old_mount_opt;
1130 int err; 1130 int err;
1131 1131
1132 sync_filesystem(sb);
1132 old_sb_flags = sb->s_flags; 1133 old_sb_flags = sb->s_flags;
1133 old_mount_opt = nilfs->ns_mount_opt; 1134 old_mount_opt = nilfs->ns_mount_opt;
1134 1135
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 94c451ce6d24..8ba8229ba076 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -399,6 +399,16 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
399 return -EINVAL; 399 return -EINVAL;
400 400
401 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); 401 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
402 if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
403 printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n",
404 nilfs->ns_inode_size);
405 return -EINVAL;
406 } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
407 printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n",
408 nilfs->ns_inode_size);
409 return -EINVAL;
410 }
411
402 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); 412 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
403 413
404 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 414 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index dc638f786d5c..ee9cb3795c2b 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -60,8 +60,8 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
60} 60}
61 61
62#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 62#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
63static int fanotify_get_response_from_access(struct fsnotify_group *group, 63static int fanotify_get_response(struct fsnotify_group *group,
64 struct fanotify_event_info *event) 64 struct fanotify_perm_event_info *event)
65{ 65{
66 int ret; 66 int ret;
67 67
@@ -142,6 +142,40 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
142 return false; 142 return false;
143} 143}
144 144
145struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
146 struct path *path)
147{
148 struct fanotify_event_info *event;
149
150#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
151 if (mask & FAN_ALL_PERM_EVENTS) {
152 struct fanotify_perm_event_info *pevent;
153
154 pevent = kmem_cache_alloc(fanotify_perm_event_cachep,
155 GFP_KERNEL);
156 if (!pevent)
157 return NULL;
158 event = &pevent->fae;
159 pevent->response = 0;
160 goto init;
161 }
162#endif
163 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
164 if (!event)
165 return NULL;
166init: __maybe_unused
167 fsnotify_init_event(&event->fse, inode, mask);
168 event->tgid = get_pid(task_tgid(current));
169 if (path) {
170 event->path = *path;
171 path_get(&event->path);
172 } else {
173 event->path.mnt = NULL;
174 event->path.dentry = NULL;
175 }
176 return event;
177}
178
145static int fanotify_handle_event(struct fsnotify_group *group, 179static int fanotify_handle_event(struct fsnotify_group *group,
146 struct inode *inode, 180 struct inode *inode,
147 struct fsnotify_mark *inode_mark, 181 struct fsnotify_mark *inode_mark,
@@ -171,25 +205,11 @@ static int fanotify_handle_event(struct fsnotify_group *group,
171 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, 205 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
172 mask); 206 mask);
173 207
174 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); 208 event = fanotify_alloc_event(inode, mask, data);
175 if (unlikely(!event)) 209 if (unlikely(!event))
176 return -ENOMEM; 210 return -ENOMEM;
177 211
178 fsn_event = &event->fse; 212 fsn_event = &event->fse;
179 fsnotify_init_event(fsn_event, inode, mask);
180 event->tgid = get_pid(task_tgid(current));
181 if (data_type == FSNOTIFY_EVENT_PATH) {
182 struct path *path = data;
183 event->path = *path;
184 path_get(&event->path);
185 } else {
186 event->path.mnt = NULL;
187 event->path.dentry = NULL;
188 }
189#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
190 event->response = 0;
191#endif
192
193 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); 213 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge);
194 if (ret) { 214 if (ret) {
195 /* Permission events shouldn't be merged */ 215 /* Permission events shouldn't be merged */
@@ -202,7 +222,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
202 222
203#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 223#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
204 if (mask & FAN_ALL_PERM_EVENTS) { 224 if (mask & FAN_ALL_PERM_EVENTS) {
205 ret = fanotify_get_response_from_access(group, event); 225 ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event));
206 fsnotify_destroy_event(group, fsn_event); 226 fsnotify_destroy_event(group, fsn_event);
207 } 227 }
208#endif 228#endif
@@ -225,6 +245,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
225 event = FANOTIFY_E(fsn_event); 245 event = FANOTIFY_E(fsn_event);
226 path_put(&event->path); 246 path_put(&event->path);
227 put_pid(event->tgid); 247 put_pid(event->tgid);
248#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
249 if (fsn_event->mask & FAN_ALL_PERM_EVENTS) {
250 kmem_cache_free(fanotify_perm_event_cachep,
251 FANOTIFY_PE(fsn_event));
252 return;
253 }
254#endif
228 kmem_cache_free(fanotify_event_cachep, event); 255 kmem_cache_free(fanotify_event_cachep, event);
229} 256}
230 257
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 32a2f034fb94..2a5fb14115df 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -3,13 +3,12 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4 4
5extern struct kmem_cache *fanotify_event_cachep; 5extern struct kmem_cache *fanotify_event_cachep;
6extern struct kmem_cache *fanotify_perm_event_cachep;
6 7
7/* 8/*
8 * Lifetime of the structure differs for normal and permission events. In both 9 * Structure for normal fanotify events. It gets allocated in
9 * cases the structure is allocated in fanotify_handle_event(). For normal 10 * fanotify_handle_event() and freed when the information is retrieved by
10 * events the structure is freed immediately after reporting it to userspace. 11 * userspace
11 * For permission events we free it only after we receive response from
12 * userspace.
13 */ 12 */
14struct fanotify_event_info { 13struct fanotify_event_info {
15 struct fsnotify_event fse; 14 struct fsnotify_event fse;
@@ -19,12 +18,33 @@ struct fanotify_event_info {
19 */ 18 */
20 struct path path; 19 struct path path;
21 struct pid *tgid; 20 struct pid *tgid;
21};
22
22#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 23#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
23 u32 response; /* userspace answer to question */ 24/*
24#endif 25 * Structure for permission fanotify events. It gets allocated and freed in
26 * fanotify_handle_event() since we wait there for user response. When the
27 * information is retrieved by userspace the structure is moved from
28 * group->notification_list to group->fanotify_data.access_list, where it
29 * stays until the userspace response arrives.
30 */
31struct fanotify_perm_event_info {
32 struct fanotify_event_info fae;
33 int response; /* userspace answer to question */
34 int fd; /* fd we passed to userspace for this event */
25}; 35};
26 36
37static inline struct fanotify_perm_event_info *
38FANOTIFY_PE(struct fsnotify_event *fse)
39{
40 return container_of(fse, struct fanotify_perm_event_info, fae.fse);
41}
42#endif
43
27static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) 44static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
28{ 45{
29 return container_of(fse, struct fanotify_event_info, fse); 46 return container_of(fse, struct fanotify_event_info, fse);
30} 47}
48
49struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
50 struct path *path);
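FANOTIFY_PE() is the usual embedded-base/container_of idiom: the permission event embeds the generic event as its first member, so the containing structure can be recovered from a base pointer. A self-contained sketch of the same layout (simplified struct definitions, not the kernel's):

#include <stddef.h>   /* offsetof */
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fsnotify_event { unsigned int mask; };

struct fanotify_event_info {
        struct fsnotify_event fse;          /* embedded base event */
};

struct fanotify_perm_event_info {
        struct fanotify_event_info fae;     /* base embedded first */
        int response;
        int fd;
};

static struct fanotify_perm_event_info *FANOTIFY_PE(struct fsnotify_event *fse)
{
        return container_of(fse, struct fanotify_perm_event_info, fae.fse);
}

int main(void)
{
        struct fanotify_perm_event_info pevent = { .fd = 42 };
        struct fsnotify_event *base = &pevent.fae.fse;

        /* Recover the containing permission event from the base pointer. */
        printf("fd=%d\n", FANOTIFY_PE(base)->fd);
        return 0;
}

Embedding the base first keeps FANOTIFY_E() valid on both event kinds while letting only permission events pay for the response and fd fields.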
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 287a22c04149..4e565c814309 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -28,14 +28,8 @@
28extern const struct fsnotify_ops fanotify_fsnotify_ops; 28extern const struct fsnotify_ops fanotify_fsnotify_ops;
29 29
30static struct kmem_cache *fanotify_mark_cache __read_mostly; 30static struct kmem_cache *fanotify_mark_cache __read_mostly;
31static struct kmem_cache *fanotify_response_event_cache __read_mostly;
32struct kmem_cache *fanotify_event_cachep __read_mostly; 31struct kmem_cache *fanotify_event_cachep __read_mostly;
33 32struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
34struct fanotify_response_event {
35 struct list_head list;
36 __s32 fd;
37 struct fanotify_event_info *event;
38};
39 33
40/* 34/*
41 * Get an fsnotify notification event if one exists and is small 35 * Get an fsnotify notification event if one exists and is small
@@ -135,33 +129,34 @@ static int fill_event_metadata(struct fsnotify_group *group,
135} 129}
136 130
137#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 131#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
138static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group, 132static struct fanotify_perm_event_info *dequeue_event(
139 __s32 fd) 133 struct fsnotify_group *group, int fd)
140{ 134{
141 struct fanotify_response_event *re, *return_re = NULL; 135 struct fanotify_perm_event_info *event, *return_e = NULL;
142 136
143 mutex_lock(&group->fanotify_data.access_mutex); 137 spin_lock(&group->fanotify_data.access_lock);
144 list_for_each_entry(re, &group->fanotify_data.access_list, list) { 138 list_for_each_entry(event, &group->fanotify_data.access_list,
145 if (re->fd != fd) 139 fae.fse.list) {
140 if (event->fd != fd)
146 continue; 141 continue;
147 142
148 list_del_init(&re->list); 143 list_del_init(&event->fae.fse.list);
149 return_re = re; 144 return_e = event;
150 break; 145 break;
151 } 146 }
152 mutex_unlock(&group->fanotify_data.access_mutex); 147 spin_unlock(&group->fanotify_data.access_lock);
153 148
154 pr_debug("%s: found return_re=%p\n", __func__, return_re); 149 pr_debug("%s: found return_e=%p\n", __func__, return_e);
155 150
156 return return_re; 151 return return_e;
157} 152}
158 153
159static int process_access_response(struct fsnotify_group *group, 154static int process_access_response(struct fsnotify_group *group,
160 struct fanotify_response *response_struct) 155 struct fanotify_response *response_struct)
161{ 156{
162 struct fanotify_response_event *re; 157 struct fanotify_perm_event_info *event;
163 __s32 fd = response_struct->fd; 158 int fd = response_struct->fd;
164 __u32 response = response_struct->response; 159 int response = response_struct->response;
165 160
166 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, 161 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
167 fd, response); 162 fd, response);
@@ -181,58 +176,15 @@ static int process_access_response(struct fsnotify_group *group,
181 if (fd < 0) 176 if (fd < 0)
182 return -EINVAL; 177 return -EINVAL;
183 178
184 re = dequeue_re(group, fd); 179 event = dequeue_event(group, fd);
185 if (!re) 180 if (!event)
186 return -ENOENT; 181 return -ENOENT;
187 182
188 re->event->response = response; 183 event->response = response;
189
190 wake_up(&group->fanotify_data.access_waitq); 184 wake_up(&group->fanotify_data.access_waitq);
191 185
192 kmem_cache_free(fanotify_response_event_cache, re);
193
194 return 0;
195}
196
197static int prepare_for_access_response(struct fsnotify_group *group,
198 struct fsnotify_event *event,
199 __s32 fd)
200{
201 struct fanotify_response_event *re;
202
203 if (!(event->mask & FAN_ALL_PERM_EVENTS))
204 return 0;
205
206 re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
207 if (!re)
208 return -ENOMEM;
209
210 re->event = FANOTIFY_E(event);
211 re->fd = fd;
212
213 mutex_lock(&group->fanotify_data.access_mutex);
214
215 if (atomic_read(&group->fanotify_data.bypass_perm)) {
216 mutex_unlock(&group->fanotify_data.access_mutex);
217 kmem_cache_free(fanotify_response_event_cache, re);
218 FANOTIFY_E(event)->response = FAN_ALLOW;
219 return 0;
220 }
221
222 list_add_tail(&re->list, &group->fanotify_data.access_list);
223 mutex_unlock(&group->fanotify_data.access_mutex);
224
225 return 0;
226}
227
228#else
229static int prepare_for_access_response(struct fsnotify_group *group,
230 struct fsnotify_event *event,
231 __s32 fd)
232{
233 return 0; 186 return 0;
234} 187}
235
236#endif 188#endif
237 189
238static ssize_t copy_event_to_user(struct fsnotify_group *group, 190static ssize_t copy_event_to_user(struct fsnotify_group *group,
@@ -247,7 +199,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
247 199
248 ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); 200 ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
249 if (ret < 0) 201 if (ret < 0)
250 goto out; 202 return ret;
251 203
252 fd = fanotify_event_metadata.fd; 204 fd = fanotify_event_metadata.fd;
253 ret = -EFAULT; 205 ret = -EFAULT;
@@ -255,9 +207,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
255 fanotify_event_metadata.event_len)) 207 fanotify_event_metadata.event_len))
256 goto out_close_fd; 208 goto out_close_fd;
257 209
258 ret = prepare_for_access_response(group, event, fd); 210#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
259 if (ret) 211 if (event->mask & FAN_ALL_PERM_EVENTS)
260 goto out_close_fd; 212 FANOTIFY_PE(event)->fd = fd;
213#endif
261 214
262 if (fd != FAN_NOFD) 215 if (fd != FAN_NOFD)
263 fd_install(fd, f); 216 fd_install(fd, f);
@@ -268,13 +221,6 @@ out_close_fd:
268 put_unused_fd(fd); 221 put_unused_fd(fd);
269 fput(f); 222 fput(f);
270 } 223 }
271out:
272#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
273 if (event->mask & FAN_ALL_PERM_EVENTS) {
274 FANOTIFY_E(event)->response = FAN_DENY;
275 wake_up(&group->fanotify_data.access_waitq);
276 }
277#endif
278 return ret; 224 return ret;
279} 225}
280 226
@@ -314,35 +260,50 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
314 kevent = get_one_event(group, count); 260 kevent = get_one_event(group, count);
315 mutex_unlock(&group->notification_mutex); 261 mutex_unlock(&group->notification_mutex);
316 262
317 if (kevent) { 263 if (IS_ERR(kevent)) {
318 ret = PTR_ERR(kevent); 264 ret = PTR_ERR(kevent);
319 if (IS_ERR(kevent)) 265 break;
266 }
267
268 if (!kevent) {
269 ret = -EAGAIN;
270 if (file->f_flags & O_NONBLOCK)
320 break; 271 break;
321 ret = copy_event_to_user(group, kevent, buf); 272
322 /* 273 ret = -ERESTARTSYS;
323 * Permission events get destroyed after we 274 if (signal_pending(current))
324 * receive response 275 break;
325 */ 276
326 if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) 277 if (start != buf)
327 fsnotify_destroy_event(group, kevent);
328 if (ret < 0)
329 break; 278 break;
330 buf += ret; 279 schedule();
331 count -= ret;
332 continue; 280 continue;
333 } 281 }
334 282
335 ret = -EAGAIN; 283 ret = copy_event_to_user(group, kevent, buf);
336 if (file->f_flags & O_NONBLOCK) 284 /*
337 break; 285 * Permission events get queued to wait for response. Other
338 ret = -ERESTARTSYS; 286 * events can be destroyed now.
339 if (signal_pending(current)) 287 */
340 break; 288 if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) {
341 289 fsnotify_destroy_event(group, kevent);
342 if (start != buf) 290 if (ret < 0)
343 break; 291 break;
344 292 } else {
345 schedule(); 293#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
294 if (ret < 0) {
295 FANOTIFY_PE(kevent)->response = FAN_DENY;
296 wake_up(&group->fanotify_data.access_waitq);
297 break;
298 }
299 spin_lock(&group->fanotify_data.access_lock);
300 list_add_tail(&kevent->list,
301 &group->fanotify_data.access_list);
302 spin_unlock(&group->fanotify_data.access_lock);
303#endif
304 }
305 buf += ret;
306 count -= ret;
346 } 307 }
347 308
348 finish_wait(&group->notification_waitq, &wait); 309 finish_wait(&group->notification_waitq, &wait);
@@ -383,22 +344,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
383 struct fsnotify_group *group = file->private_data; 344 struct fsnotify_group *group = file->private_data;
384 345
385#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 346#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
386 struct fanotify_response_event *re, *lre; 347 struct fanotify_perm_event_info *event, *next;
387 348
388 mutex_lock(&group->fanotify_data.access_mutex); 349 spin_lock(&group->fanotify_data.access_lock);
389 350
390 atomic_inc(&group->fanotify_data.bypass_perm); 351 atomic_inc(&group->fanotify_data.bypass_perm);
391 352
392 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { 353 list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
393 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, 354 fae.fse.list) {
394 re, re->event); 355 pr_debug("%s: found group=%p event=%p\n", __func__, group,
356 event);
395 357
396 list_del_init(&re->list); 358 list_del_init(&event->fae.fse.list);
397 re->event->response = FAN_ALLOW; 359 event->response = FAN_ALLOW;
398
399 kmem_cache_free(fanotify_response_event_cache, re);
400 } 360 }
401 mutex_unlock(&group->fanotify_data.access_mutex); 361 spin_unlock(&group->fanotify_data.access_lock);
402 362
403 wake_up(&group->fanotify_data.access_waitq); 363 wake_up(&group->fanotify_data.access_waitq);
404#endif 364#endif
@@ -731,21 +691,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
731 group->fanotify_data.user = user; 691 group->fanotify_data.user = user;
732 atomic_inc(&user->fanotify_listeners); 692 atomic_inc(&user->fanotify_listeners);
733 693
734 oevent = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); 694 oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL);
735 if (unlikely(!oevent)) { 695 if (unlikely(!oevent)) {
736 fd = -ENOMEM; 696 fd = -ENOMEM;
737 goto out_destroy_group; 697 goto out_destroy_group;
738 } 698 }
739 group->overflow_event = &oevent->fse; 699 group->overflow_event = &oevent->fse;
740 fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
741 oevent->tgid = get_pid(task_tgid(current));
742 oevent->path.mnt = NULL;
743 oevent->path.dentry = NULL;
744 700
745 group->fanotify_data.f_flags = event_f_flags; 701 group->fanotify_data.f_flags = event_f_flags;
746#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 702#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
747 oevent->response = 0; 703 spin_lock_init(&group->fanotify_data.access_lock);
748 mutex_init(&group->fanotify_data.access_mutex);
749 init_waitqueue_head(&group->fanotify_data.access_waitq); 704 init_waitqueue_head(&group->fanotify_data.access_waitq);
750 INIT_LIST_HEAD(&group->fanotify_data.access_list); 705 INIT_LIST_HEAD(&group->fanotify_data.access_list);
751 atomic_set(&group->fanotify_data.bypass_perm, 0); 706 atomic_set(&group->fanotify_data.bypass_perm, 0);
@@ -920,9 +875,11 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
920static int __init fanotify_user_setup(void) 875static int __init fanotify_user_setup(void)
921{ 876{
922 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); 877 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
923 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
924 SLAB_PANIC);
925 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); 878 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
879#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
880 fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info,
881 SLAB_PANIC);
882#endif
926 883
927 return 0; 884 return 0;
928} 885}
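On the userspace side of this flow, a permission listener reads events and answers by writing a fanotify_response back to the fanotify fd, which wakes the waiter in fanotify_get_response(). A minimal sketch, assuming CAP_SYS_ADMIN and CONFIG_FANOTIFY_ACCESS_PERMISSIONS; "/mnt" is a placeholder mount to watch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        int fan = fanotify_init(FAN_CLOEXEC | FAN_CLASS_CONTENT, O_RDONLY);

        if (fan < 0 || fanotify_mark(fan, FAN_MARK_ADD | FAN_MARK_MOUNT,
                                     FAN_OPEN_PERM, AT_FDCWD, "/mnt") < 0) {
                perror("fanotify setup");
                return 1;
        }

        for (;;) {
                ssize_t len = read(fan, buf, sizeof(buf));
                struct fanotify_event_metadata *md = (void *)buf;

                if (len <= 0)
                        break;
                for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
                        if (md->mask & FAN_OPEN_PERM) {
                                /* FAN_DENY here would fail the open instead. */
                                struct fanotify_response resp = {
                                        .fd = md->fd,
                                        .response = FAN_ALLOW,
                                };
                                write(fan, &resp, sizeof(resp));
                        }
                        if (md->fd >= 0)
                                close(md->fd); /* installed by copy_event_to_user() */
                }
        }
        return 0;
}

Until the response arrives, the event sits on group->fanotify_data.access_list exactly as described in the reworked fanotify_read() above; closing the listener instead triggers the FAN_ALLOW bypass in fanotify_release().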
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index ffb9b3675736..9d8153ebacfb 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2259,7 +2259,7 @@ void ntfs_evict_big_inode(struct inode *vi)
2259{ 2259{
2260 ntfs_inode *ni = NTFS_I(vi); 2260 ntfs_inode *ni = NTFS_I(vi);
2261 2261
2262 truncate_inode_pages(&vi->i_data, 0); 2262 truncate_inode_pages_final(&vi->i_data);
2263 clear_inode(vi); 2263 clear_inode(vi);
2264 2264
2265#ifdef NTFS_RW 2265#ifdef NTFS_RW
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 82650d52d916..bd5610d48242 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -468,6 +468,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
468 468
469 ntfs_debug("Entering with remount options string: %s", opt); 469 ntfs_debug("Entering with remount options string: %s", opt);
470 470
471 sync_filesystem(sb);
472
471#ifndef NTFS_RW 473#ifndef NTFS_RW
472 /* For read-only compiled driver, enforce read-only flag. */ 474 /* For read-only compiled driver, enforce read-only flag. */
473 *flags |= MS_RDONLY; 475 *flags |= MS_RDONLY;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 555f4cddefe3..7e8282dcea2a 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -205,6 +205,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
205 di->i_mode = cpu_to_le16(inode->i_mode); 205 di->i_mode = cpu_to_le16(inode->i_mode);
206 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 206 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
207 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 207 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
208 ocfs2_update_inode_fsync_trans(handle, inode, 0);
208 209
209 ocfs2_journal_dirty(handle, di_bh); 210 ocfs2_journal_dirty(handle, di_bh);
210 211
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e2edff38be52..b4deb5f750d9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5728,6 +5728,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5728 } 5728 }
5729 5729
5730 ocfs2_et_update_clusters(et, -len); 5730 ocfs2_et_update_clusters(et, -len);
5731 ocfs2_update_inode_fsync_trans(handle, inode, 1);
5731 5732
5732 ocfs2_journal_dirty(handle, et->et_root_bh); 5733 ocfs2_journal_dirty(handle, et->et_root_bh);
5733 5734
@@ -6932,6 +6933,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6932 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 6933 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6933 spin_unlock(&oi->ip_lock); 6934 spin_unlock(&oi->ip_lock);
6934 6935
6936 ocfs2_update_inode_fsync_trans(handle, inode, 1);
6935 ocfs2_dinode_new_extent_list(inode, di); 6937 ocfs2_dinode_new_extent_list(inode, di);
6936 6938
6937 ocfs2_journal_dirty(handle, di_bh); 6939 ocfs2_journal_dirty(handle, di_bh);
@@ -7208,6 +7210,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7208 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 7210 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7209 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 7211 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7210 7212
7213 ocfs2_update_inode_fsync_trans(handle, inode, 1);
7211 ocfs2_journal_dirty(handle, di_bh); 7214 ocfs2_journal_dirty(handle, di_bh);
7212 7215
7213out_commit: 7216out_commit:
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index aeb44e879c51..d310d12a9adc 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -571,7 +571,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
571{ 571{
572 struct inode *inode = file_inode(iocb->ki_filp); 572 struct inode *inode = file_inode(iocb->ki_filp);
573 int level; 573 int level;
574 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
575 574
576 /* this io's submitter should not have unlocked this before we could */ 575 /* this io's submitter should not have unlocked this before we could */
577 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 576 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -582,10 +581,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
582 if (ocfs2_iocb_is_unaligned_aio(iocb)) { 581 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
583 ocfs2_iocb_clear_unaligned_aio(iocb); 582 ocfs2_iocb_clear_unaligned_aio(iocb);
584 583
585 if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && 584 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
586 waitqueue_active(wq)) {
587 wake_up_all(wq);
588 }
589 } 585 }
590 586
591 ocfs2_iocb_clear_rw_locked(iocb); 587 ocfs2_iocb_clear_rw_locked(iocb);
@@ -2043,6 +2039,7 @@ out_write_size:
2043 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2039 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2044 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 2040 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
2045 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 2041 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
2042 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2046 ocfs2_journal_dirty(handle, wc->w_di_bh); 2043 ocfs2_journal_dirty(handle, wc->w_di_bh);
2047 2044
2048 ocfs2_commit_trans(osb, handle); 2045 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f671e49beb34..6cae155d54df 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -102,9 +102,4 @@ enum ocfs2_iocb_lock_bits {
102#define ocfs2_iocb_is_unaligned_aio(iocb) \ 102#define ocfs2_iocb_is_unaligned_aio(iocb) \
103 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) 103 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
104 104
105#define OCFS2_IOEND_WQ_HASH_SZ 37
106#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
107 OCFS2_IOEND_WQ_HASH_SZ])
108extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
109
110#endif /* OCFS2_FILE_H */ 105#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 5b704c63a103..1edcb141f639 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
90 * information for this bh as it's not marked locally 90 * information for this bh as it's not marked locally
91 * uptodate. */ 91 * uptodate. */
92 ret = -EIO; 92 ret = -EIO;
93 put_bh(bh);
94 mlog_errno(ret); 93 mlog_errno(ret);
95 } 94 }
96 95
@@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
420 419
421 if (!buffer_uptodate(bh)) { 420 if (!buffer_uptodate(bh)) {
422 ret = -EIO; 421 ret = -EIO;
423 put_bh(bh);
424 mlog_errno(ret); 422 mlog_errno(ret);
425 } 423 }
426 424
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2cd2406b4140..eb649d23a4de 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -262,17 +262,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc)
262 262
263#endif /* CONFIG_OCFS2_FS_STATS */ 263#endif /* CONFIG_OCFS2_FS_STATS */
264 264
265static inline int o2net_reconnect_delay(void) 265static inline unsigned int o2net_reconnect_delay(void)
266{ 266{
267 return o2nm_single_cluster->cl_reconnect_delay_ms; 267 return o2nm_single_cluster->cl_reconnect_delay_ms;
268} 268}
269 269
270static inline int o2net_keepalive_delay(void) 270static inline unsigned int o2net_keepalive_delay(void)
271{ 271{
272 return o2nm_single_cluster->cl_keepalive_delay_ms; 272 return o2nm_single_cluster->cl_keepalive_delay_ms;
273} 273}
274 274
275static inline int o2net_idle_timeout(void) 275static inline unsigned int o2net_idle_timeout(void)
276{ 276{
277 return o2nm_single_cluster->cl_idle_timeout_ms; 277 return o2nm_single_cluster->cl_idle_timeout_ms;
278} 278}
@@ -1964,18 +1964,30 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
1964 goto out; 1964 goto out;
1965 } 1965 }
1966 1966
1967 /* ->sk_data_ready is also called for a newly established child socket 1967 /* This callback may be called twice when a new connection
1968 * before it has been accepted and the acceptor has set up their 1968 * is being established, since a child socket inherits everything
1969 * data_ready.. we only want to queue listen work for our listening 1969 * from a parent LISTEN socket, including the data_ready cb of
1970 * socket */ 1970 * the parent. This leads to a hazard. In o2net_accept_one()
1971 * we are still initializing the child socket but have not
1972 * changed the inherited data_ready callback yet when
1973 * data starts arriving.
1974 * We avoid this hazard by checking the state.
1975 * For the listening socket, the state will be TCP_LISTEN; for the new
1976 * socket, it will be TCP_ESTABLISHED. Also, in this case,
1977 * sk->sk_user_data is not a valid function pointer.
1978 */
1979
1971 if (sk->sk_state == TCP_LISTEN) { 1980 if (sk->sk_state == TCP_LISTEN) {
1972 mlog(ML_TCP, "bytes: %d\n", bytes); 1981 mlog(ML_TCP, "bytes: %d\n", bytes);
1973 queue_work(o2net_wq, &o2net_listen_work); 1982 queue_work(o2net_wq, &o2net_listen_work);
1983 } else {
1984 ready = NULL;
1974 } 1985 }
1975 1986
1976out: 1987out:
1977 read_unlock(&sk->sk_callback_lock); 1988 read_unlock(&sk->sk_callback_lock);
1978 ready(sk, bytes); 1989 if (ready != NULL)
1990 ready(sk, bytes);
1979} 1991}
1980 1992
1981static int o2net_open_listening_sock(__be32 addr, __be16 port) 1993static int o2net_open_listening_sock(__be32 addr, __be16 port)
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 0d3a97d2d5f6..e2e05a106beb 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -37,7 +37,6 @@
37#include "dlmglue.h" 37#include "dlmglue.h"
38#include "file.h" 38#include "file.h"
39#include "inode.h" 39#include "inode.h"
40#include "super.h"
41#include "ocfs2_trace.h" 40#include "ocfs2_trace.h"
42 41
43void ocfs2_dentry_attach_gen(struct dentry *dentry) 42void ocfs2_dentry_attach_gen(struct dentry *dentry)
@@ -346,52 +345,6 @@ out_attach:
346 return ret; 345 return ret;
347} 346}
348 347
349DEFINE_SPINLOCK(dentry_list_lock);
350
351/* We limit the number of dentry locks to drop in one go. We have
352 * this limit so that we don't starve other users of ocfs2_wq. */
353#define DL_INODE_DROP_COUNT 64
354
355/* Drop inode references from dentry locks */
356static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
357{
358 struct ocfs2_dentry_lock *dl;
359
360 spin_lock(&dentry_list_lock);
361 while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
362 dl = osb->dentry_lock_list;
363 osb->dentry_lock_list = dl->dl_next;
364 spin_unlock(&dentry_list_lock);
365 iput(dl->dl_inode);
366 kfree(dl);
367 spin_lock(&dentry_list_lock);
368 }
369 spin_unlock(&dentry_list_lock);
370}
371
372void ocfs2_drop_dl_inodes(struct work_struct *work)
373{
374 struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
375 dentry_lock_work);
376
377 __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
378 /*
379 * Don't queue dropping if umount is in progress. We flush the
380 * list in ocfs2_dismount_volume
381 */
382 spin_lock(&dentry_list_lock);
383 if (osb->dentry_lock_list &&
384 !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
385 queue_work(ocfs2_wq, &osb->dentry_lock_work);
386 spin_unlock(&dentry_list_lock);
387}
388
389/* Flush the whole work queue */
390void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
391{
392 __ocfs2_drop_dl_inodes(osb, -1);
393}
394
395/* 348/*
396 * ocfs2_dentry_iput() and friends. 349 * ocfs2_dentry_iput() and friends.
397 * 350 *
@@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
416static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, 369static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
417 struct ocfs2_dentry_lock *dl) 370 struct ocfs2_dentry_lock *dl)
418{ 371{
372 iput(dl->dl_inode);
419 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); 373 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
420 ocfs2_lock_res_free(&dl->dl_lockres); 374 ocfs2_lock_res_free(&dl->dl_lockres);
421 375 kfree(dl);
422 /* We leave dropping of inode reference to ocfs2_wq as that can
423 * possibly lead to inode deletion which gets tricky */
424 spin_lock(&dentry_list_lock);
425 if (!osb->dentry_lock_list &&
426 !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
427 queue_work(ocfs2_wq, &osb->dentry_lock_work);
428 dl->dl_next = osb->dentry_lock_list;
429 osb->dentry_lock_list = dl;
430 spin_unlock(&dentry_list_lock);
431} 376}
432 377
433void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 378void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
434 struct ocfs2_dentry_lock *dl) 379 struct ocfs2_dentry_lock *dl)
435{ 380{
436 int unlock; 381 int unlock = 0;
437 382
438 BUG_ON(dl->dl_count == 0); 383 BUG_ON(dl->dl_count == 0);
439 384
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index b79eff709958..55f58892b153 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,13 +29,8 @@
29extern const struct dentry_operations ocfs2_dentry_ops; 29extern const struct dentry_operations ocfs2_dentry_ops;
30 30
31struct ocfs2_dentry_lock { 31struct ocfs2_dentry_lock {
32 /* Use count of dentry lock */
33 unsigned int dl_count; 32 unsigned int dl_count;
34 union { 33 u64 dl_parent_blkno;
35 /* Linked list of dentry locks to release */
36 struct ocfs2_dentry_lock *dl_next;
37 u64 dl_parent_blkno;
38 };
39 34
40 /* 35 /*
41 * The ocfs2_dentry_lock keeps an inode reference until 36 * The ocfs2_dentry_lock keeps an inode reference until
@@ -49,14 +44,9 @@ struct ocfs2_dentry_lock {
49int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, 44int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
50 u64 parent_blkno); 45 u64 parent_blkno);
51 46
52extern spinlock_t dentry_list_lock;
53
54void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 47void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
55 struct ocfs2_dentry_lock *dl); 48 struct ocfs2_dentry_lock *dl);
56 49
57void ocfs2_drop_dl_inodes(struct work_struct *work);
58void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
59
60struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, 50struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
61 int skip_unhashed); 51 int skip_unhashed);
62 52
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 91a7e85ac8fd..0717662b4aef 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2957 ocfs2_init_dir_trailer(dir, dirdata_bh, i); 2957 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
2958 } 2958 }
2959 2959
2960 ocfs2_update_inode_fsync_trans(handle, dir, 1);
2960 ocfs2_journal_dirty(handle, dirdata_bh); 2961 ocfs2_journal_dirty(handle, dirdata_bh);
2961 2962
2962 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2963 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3005,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3005 di->i_size = cpu_to_le64(sb->s_blocksize); 3006 di->i_size = cpu_to_le64(sb->s_blocksize);
3006 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); 3007 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
3007 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); 3008 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
3009 ocfs2_update_inode_fsync_trans(handle, dir, 1);
3008 3010
3009 /* 3011 /*
3010 * This should never fail as our extent list is empty and all 3012 * This should never fail as our extent list is empty and all
@@ -3338,6 +3340,7 @@ do_extend:
3338 } else { 3340 } else {
3339 de->rec_len = cpu_to_le16(sb->s_blocksize); 3341 de->rec_len = cpu_to_le16(sb->s_blocksize);
3340 } 3342 }
3343 ocfs2_update_inode_fsync_trans(handle, dir, 1);
3341 ocfs2_journal_dirty(handle, new_bh); 3344 ocfs2_journal_dirty(handle, new_bh);
3342 3345
3343 dir_i_size += dir->i_sb->s_blocksize; 3346 dir_i_size += dir->i_sb->s_blocksize;
@@ -3896,6 +3899,7 @@ out_commit:
3896 dquot_free_space_nodirty(dir, 3899 dquot_free_space_nodirty(dir,
3897 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3900 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3898 3901
3902 ocfs2_update_inode_fsync_trans(handle, dir, 1);
3899 ocfs2_commit_trans(osb, handle); 3903 ocfs2_commit_trans(osb, handle);
3900 3904
3901out: 3905out:
@@ -4134,6 +4138,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4134 mlog_errno(ret); 4138 mlog_errno(ret);
4135 did_quota = 0; 4139 did_quota = 0;
4136 4140
4141 ocfs2_update_inode_fsync_trans(handle, dir, 1);
4137 ocfs2_journal_dirty(handle, dx_root_bh); 4142 ocfs2_journal_dirty(handle, dx_root_bh);
4138 4143
4139out_commit: 4144out_commit:
@@ -4401,6 +4406,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4401 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 4406 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4402 spin_unlock(&OCFS2_I(dir)->ip_lock); 4407 spin_unlock(&OCFS2_I(dir)->ip_lock);
4403 di->i_dx_root = cpu_to_le64(0ULL); 4408 di->i_dx_root = cpu_to_le64(0ULL);
4409 ocfs2_update_inode_fsync_trans(handle, dir, 1);
4404 4410
4405 ocfs2_journal_dirty(handle, di_bh); 4411 ocfs2_journal_dirty(handle, di_bh);
4406 4412
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 33660a4a52fa..c973690dc0bc 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1123,7 +1123,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1123 struct dlm_ctxt *dlm = NULL; 1123 struct dlm_ctxt *dlm = NULL;
1124 char *local = NULL; 1124 char *local = NULL;
1125 int status = 0; 1125 int status = 0;
1126 int locked = 0;
1127 1126
1128 qr = (struct dlm_query_region *) msg->buf; 1127 qr = (struct dlm_query_region *) msg->buf;
1129 1128
@@ -1132,10 +1131,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1132 1131
1133 /* buffer used in dlm_mast_regions() */ 1132 /* buffer used in dlm_mast_regions() */
1134 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); 1133 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
1135 if (!local) { 1134 if (!local)
1136 status = -ENOMEM; 1135 return -ENOMEM;
1137 goto bail;
1138 }
1139 1136
1140 status = -EINVAL; 1137 status = -EINVAL;
1141 1138
@@ -1144,16 +1141,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1144 if (!dlm) { 1141 if (!dlm) {
1145 mlog(ML_ERROR, "Node %d queried hb regions on domain %s " 1142 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1146 "before join domain\n", qr->qr_node, qr->qr_domain); 1143 "before join domain\n", qr->qr_node, qr->qr_domain);
1147 goto bail; 1144 goto out_domain_lock;
1148 } 1145 }
1149 1146
1150 spin_lock(&dlm->spinlock); 1147 spin_lock(&dlm->spinlock);
1151 locked = 1;
1152 if (dlm->joining_node != qr->qr_node) { 1148 if (dlm->joining_node != qr->qr_node) {
1153 mlog(ML_ERROR, "Node %d queried hb regions on domain %s " 1149 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1154 "but joining node is %d\n", qr->qr_node, qr->qr_domain, 1150 "but joining node is %d\n", qr->qr_node, qr->qr_domain,
1155 dlm->joining_node); 1151 dlm->joining_node);
1156 goto bail; 1152 goto out_dlm_lock;
1157 } 1153 }
1158 1154
1159 /* Support for global heartbeat was added in 1.1 */ 1155 /* Support for global heartbeat was added in 1.1 */
@@ -1163,14 +1159,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1163 "but active dlm protocol is %d.%d\n", qr->qr_node, 1159 "but active dlm protocol is %d.%d\n", qr->qr_node,
1164 qr->qr_domain, dlm->dlm_locking_proto.pv_major, 1160 qr->qr_domain, dlm->dlm_locking_proto.pv_major,
1165 dlm->dlm_locking_proto.pv_minor); 1161 dlm->dlm_locking_proto.pv_minor);
1166 goto bail; 1162 goto out_dlm_lock;
1167 } 1163 }
1168 1164
1169 status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); 1165 status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
1170 1166
1171bail: 1167out_dlm_lock:
1172 if (locked) 1168 spin_unlock(&dlm->spinlock);
1173 spin_unlock(&dlm->spinlock); 1169
1170out_domain_lock:
1174 spin_unlock(&dlm_domain_lock); 1171 spin_unlock(&dlm_domain_lock);
1175 1172
1176 kfree(local); 1173 kfree(local);
@@ -1877,19 +1874,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1877 goto bail; 1874 goto bail;
1878 } 1875 }
1879 1876
1880 status = dlm_debug_init(dlm); 1877 status = dlm_launch_thread(dlm);
1881 if (status < 0) { 1878 if (status < 0) {
1882 mlog_errno(status); 1879 mlog_errno(status);
1883 goto bail; 1880 goto bail;
1884 } 1881 }
1885 1882
1886 status = dlm_launch_thread(dlm); 1883 status = dlm_launch_recovery_thread(dlm);
1887 if (status < 0) { 1884 if (status < 0) {
1888 mlog_errno(status); 1885 mlog_errno(status);
1889 goto bail; 1886 goto bail;
1890 } 1887 }
1891 1888
1892 status = dlm_launch_recovery_thread(dlm); 1889 status = dlm_debug_init(dlm);
1893 if (status < 0) { 1890 if (status < 0) {
1894 mlog_errno(status); 1891 mlog_errno(status);
1895 goto bail; 1892 goto bail;
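
The dlm_query_region_handler() rework above replaces a "locked" flag with the usual kernel error-unwind idiom: one label per held resource, released in reverse order of acquisition, and an early return while nothing is held yet. A condensed sketch of that shape (outer_lock, inner_lock and the *_ok() checks are hypothetical):

    static DEFINE_SPINLOCK(outer_lock);
    static DEFINE_SPINLOCK(inner_lock);
    static bool outer_ok(void);     /* hypothetical precondition checks */
    static bool inner_ok(void);

    static int example_handler(void)
    {
            int status = -EINVAL;
            char *buf = kmalloc(64, GFP_KERNEL);

            if (!buf)                       /* nothing held yet */
                    return -ENOMEM;

            spin_lock(&outer_lock);
            if (!outer_ok())
                    goto out_outer_lock;

            spin_lock(&inner_lock);
            if (!inner_ok())
                    goto out_inner_lock;

            status = 0;                     /* real work would go here */

    out_inner_lock:
            spin_unlock(&inner_lock);
    out_outer_lock:
            spin_unlock(&outer_lock);
            kfree(buf);
            return status;
    }
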
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7035af09cc03..fe29f7978f81 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -537,7 +537,10 @@ master_here:
537 /* success! see if any other nodes need recovery */ 537 /* success! see if any other nodes need recovery */
538 mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", 538 mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
539 dlm->name, dlm->reco.dead_node, dlm->node_num); 539 dlm->name, dlm->reco.dead_node, dlm->node_num);
540 dlm_reset_recovery(dlm); 540 spin_lock(&dlm->spinlock);
541 __dlm_reset_recovery(dlm);
542 dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
543 spin_unlock(&dlm->spinlock);
541 } 544 }
542 dlm_end_recovery(dlm); 545 dlm_end_recovery(dlm);
543 546
@@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
695 if (all_nodes_done) { 698 if (all_nodes_done) {
696 int ret; 699 int ret;
697 700
 701 /* Set this flag on the recovery master so that a new
 702 * recovery for another dead node cannot start before
 703 * the current recovery is done, otherwise recovery
 704 * may hang. */
705 spin_lock(&dlm->spinlock);
706 dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
707 spin_unlock(&dlm->spinlock);
708
698 /* all nodes are now in DLM_RECO_NODE_DATA_DONE state 709 /* all nodes are now in DLM_RECO_NODE_DATA_DONE state
699 * just send a finalize message to everyone and 710 * just send a finalize message to everyone and
700 * clean up */ 711 * clean up */
@@ -1750,13 +1761,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1750 struct dlm_migratable_lockres *mres) 1761 struct dlm_migratable_lockres *mres)
1751{ 1762{
1752 struct dlm_migratable_lock *ml; 1763 struct dlm_migratable_lock *ml;
1753 struct list_head *queue; 1764 struct list_head *queue, *iter;
1754 struct list_head *tmpq = NULL; 1765 struct list_head *tmpq = NULL;
1755 struct dlm_lock *newlock = NULL; 1766 struct dlm_lock *newlock = NULL;
1756 struct dlm_lockstatus *lksb = NULL; 1767 struct dlm_lockstatus *lksb = NULL;
1757 int ret = 0; 1768 int ret = 0;
1758 int i, j, bad; 1769 int i, j, bad;
1759 struct dlm_lock *lock = NULL; 1770 struct dlm_lock *lock;
1760 u8 from = O2NM_MAX_NODES; 1771 u8 from = O2NM_MAX_NODES;
1761 unsigned int added = 0; 1772 unsigned int added = 0;
1762 __be64 c; 1773 __be64 c;
@@ -1791,14 +1802,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1791 /* MIGRATION ONLY! */ 1802 /* MIGRATION ONLY! */
1792 BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); 1803 BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
1793 1804
1805 lock = NULL;
1794 spin_lock(&res->spinlock); 1806 spin_lock(&res->spinlock);
1795 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { 1807 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
1796 tmpq = dlm_list_idx_to_ptr(res, j); 1808 tmpq = dlm_list_idx_to_ptr(res, j);
1797 list_for_each_entry(lock, tmpq, list) { 1809 list_for_each(iter, tmpq) {
1798 if (lock->ml.cookie != ml->cookie) 1810 lock = list_entry(iter,
1799 lock = NULL; 1811 struct dlm_lock, list);
1800 else 1812 if (lock->ml.cookie == ml->cookie)
1801 break; 1813 break;
1814 lock = NULL;
1802 } 1815 }
1803 if (lock) 1816 if (lock)
1804 break; 1817 break;
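
The iterator change above fixes a subtle list_for_each_entry() pitfall: when the loop runs to completion the cursor does not become NULL, it points at the list head cast to the entry type, so code that sets the cursor to NULL inside the body to mean "not found" ends up dereferencing garbage. Iterating with list_for_each() plus an explicit list_entry(), as the hunk does, restores well-defined "found or NULL" semantics. A sketch of the safe search pattern, using a hypothetical struct:

    struct item {
            u64 cookie;
            struct list_head list;
    };

    static struct item *find_item(struct list_head *head, u64 cookie)
    {
            struct item *it, *found = NULL;

            list_for_each_entry(it, head, list) {
                    if (it->cookie == cookie) {
                            found = it;     /* record the hit explicitly */
                            break;
                    }
            }
            /* 'it' is never NULL after the loop; only 'found' is safe */
            return found;
    }
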
@@ -2882,8 +2895,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2882 BUG(); 2895 BUG();
2883 } 2896 }
2884 dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; 2897 dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
2898 __dlm_reset_recovery(dlm);
2885 spin_unlock(&dlm->spinlock); 2899 spin_unlock(&dlm->spinlock);
2886 dlm_reset_recovery(dlm);
2887 dlm_kick_recovery_thread(dlm); 2900 dlm_kick_recovery_thread(dlm);
2888 break; 2901 break;
2889 default: 2902 default:
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 19986959d149..6bd690b5a061 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3144,22 +3144,60 @@ out:
3144 return 0; 3144 return 0;
3145} 3145}
3146 3146
3147static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3148 struct ocfs2_lock_res *lockres);
3149
3147/* Mark the lockres as being dropped. It will no longer be 3150/* Mark the lockres as being dropped. It will no longer be
3148 * queued if blocking, but we still may have to wait on it 3151 * queued if blocking, but we still may have to wait on it
3149 * being dequeued from the downconvert thread before we can consider 3152 * being dequeued from the downconvert thread before we can consider
3150 * it safe to drop. 3153 * it safe to drop.
3151 * 3154 *
3152 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 3155 * You can *not* attempt to call cluster_lock on this lockres anymore. */
3153void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) 3156void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
3157 struct ocfs2_lock_res *lockres)
3154{ 3158{
3155 int status; 3159 int status;
3156 struct ocfs2_mask_waiter mw; 3160 struct ocfs2_mask_waiter mw;
3157 unsigned long flags; 3161 unsigned long flags, flags2;
3158 3162
3159 ocfs2_init_mask_waiter(&mw); 3163 ocfs2_init_mask_waiter(&mw);
3160 3164
3161 spin_lock_irqsave(&lockres->l_lock, flags); 3165 spin_lock_irqsave(&lockres->l_lock, flags);
3162 lockres->l_flags |= OCFS2_LOCK_FREEING; 3166 lockres->l_flags |= OCFS2_LOCK_FREEING;
3167 if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
3168 /*
3169 * We know the downconvert is queued but not in progress
3170 * because we are the downconvert thread and processing
3171 * different lock. So we can just remove the lock from the
3172 * queue. This is not only an optimization but also a way
3173 * to avoid the following deadlock:
3174 * ocfs2_dentry_post_unlock()
3175 * ocfs2_dentry_lock_put()
3176 * ocfs2_drop_dentry_lock()
3177 * iput()
3178 * ocfs2_evict_inode()
3179 * ocfs2_clear_inode()
3180 * ocfs2_mark_lockres_freeing()
3181 * ... blocks waiting for OCFS2_LOCK_QUEUED
3182 * since we are the downconvert thread which
3183 * should clear the flag.
3184 */
3185 spin_unlock_irqrestore(&lockres->l_lock, flags);
3186 spin_lock_irqsave(&osb->dc_task_lock, flags2);
3187 list_del_init(&lockres->l_blocked_list);
3188 osb->blocked_lock_count--;
3189 spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
3190 /*
3191 * Warn if we recurse into another post_unlock call. Strictly
3192 * speaking it isn't a problem but we need to be careful if
3193 * that happens (stack overflow, deadlocks, ...) so warn if
3194 * ocfs2 grows a path for which this can happen.
3195 */
3196 WARN_ON_ONCE(lockres->l_ops->post_unlock);
3197 /* Since the lock is being freed we don't do much in the fn below */
3198 ocfs2_process_blocked_lock(osb, lockres);
3199 return;
3200 }
3163 while (lockres->l_flags & OCFS2_LOCK_QUEUED) { 3201 while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
3164 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); 3202 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
3165 spin_unlock_irqrestore(&lockres->l_lock, flags); 3203 spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3180,7 +3218,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
3180{ 3218{
3181 int ret; 3219 int ret;
3182 3220
3183 ocfs2_mark_lockres_freeing(lockres); 3221 ocfs2_mark_lockres_freeing(osb, lockres);
3184 ret = ocfs2_drop_lock(osb, lockres); 3222 ret = ocfs2_drop_lock(osb, lockres);
3185 if (ret) 3223 if (ret)
3186 mlog_errno(ret); 3224 mlog_errno(ret);
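
The new branch in ocfs2_mark_lockres_freeing() is an instance of a general rule: a thread that drains a work queue must not block waiting for an item on that same queue, because it is the only thread that could ever complete the wait. Detecting the recursion via current and handling the item inline breaks the cycle. A condensed sketch of the rule with hypothetical names (the real code unlinks from osb->blocked_lock_list under osb->dc_task_lock, as shown above):

    struct my_item {
            struct list_head link;
    };

    struct my_engine {
            struct task_struct *worker;     /* thread draining the queue */
            spinlock_t lock;
            struct list_head queue;
    };

    static void process_item(struct my_engine *e, struct my_item *i);        /* hypothetical */
    static void wait_until_dequeued(struct my_engine *e, struct my_item *i); /* hypothetical */

    static void retire_item(struct my_engine *e, struct my_item *item)
    {
            if (current == e->worker) {
                    /* We are the worker: the item is queued but not in
                     * flight, and waiting on ourselves would deadlock.
                     * Unlink it and process it inline instead. */
                    spin_lock(&e->lock);
                    list_del_init(&item->link);
                    spin_unlock(&e->lock);
                    process_item(e, item);
                    return;
            }
            wait_until_dequeued(e, item);   /* any other thread may sleep */
    }
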
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 1d596d8c4a4a..d293a22c32c5 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
157void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); 157void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
158 158
159 159
160void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 160void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
161 struct ocfs2_lock_res *lockres);
161void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 162void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
162 struct ocfs2_lock_res *lockres); 163 struct ocfs2_lock_res *lockres);
163 164
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 51632c40e896..ff33c5ef87f2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
175 int datasync) 175 int datasync)
176{ 176{
177 int err = 0; 177 int err = 0;
178 journal_t *journal;
179 struct inode *inode = file->f_mapping->host; 178 struct inode *inode = file->f_mapping->host;
180 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 179 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
180 struct ocfs2_inode_info *oi = OCFS2_I(inode);
181 journal_t *journal = osb->journal->j_journal;
182 int ret;
183 tid_t commit_tid;
184 bool needs_barrier = false;
181 185
182 trace_ocfs2_sync_file(inode, file, file->f_path.dentry, 186 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
183 OCFS2_I(inode)->ip_blkno, 187 OCFS2_I(inode)->ip_blkno,
@@ -192,29 +196,19 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
192 if (err) 196 if (err)
193 return err; 197 return err;
194 198
195 /* 199 commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
196 * Probably don't need the i_mutex at all in here, just putting it here 200 if (journal->j_flags & JBD2_BARRIER &&
197 * to be consistent with how fsync used to be called, someone more 201 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
198 * familiar with the fs could possibly remove it. 202 needs_barrier = true;
199 */ 203 err = jbd2_complete_transaction(journal, commit_tid);
200 mutex_lock(&inode->i_mutex); 204 if (needs_barrier) {
201 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 205 ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
202 /* 206 if (!err)
203 * We still have to flush drive's caches to get data to the 207 err = ret;
204 * platter
205 */
206 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
207 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
208 goto bail;
209 } 208 }
210 209
211 journal = osb->journal->j_journal;
212 err = jbd2_journal_force_commit(journal);
213
214bail:
215 if (err) 210 if (err)
216 mlog_errno(err); 211 mlog_errno(err);
217 mutex_unlock(&inode->i_mutex);
218 212
219 return (err < 0) ? -EIO : 0; 213 return (err < 0) ? -EIO : 0;
220} 214}
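
The rewritten ocfs2_sync_file() no longer forces a commit of whatever transaction happens to be running; it waits only for the transaction recorded in i_sync_tid (or i_datasync_tid for fdatasync), and issues an explicit disk-cache flush only when the jbd2 commit of that transaction will not do so itself. The core of the pattern as a sketch (the helper name is hypothetical; the jbd2 calls are the ones used in the hunk):

    static int tid_based_fsync(journal_t *journal, tid_t commit_tid,
                               struct block_device *bdev)
    {
            bool needs_barrier = false;
            int err, ret;

            /* Will the commit of this tid already flush the disk cache? */
            if (journal->j_flags & JBD2_BARRIER &&
                !jbd2_trans_will_send_data_barrier(journal, commit_tid))
                    needs_barrier = true;

            /* Wait for (or force) commit of just that one transaction */
            err = jbd2_complete_transaction(journal, commit_tid);
            if (needs_barrier) {
                    ret = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
                    if (!err)
                            err = ret;
            }
            return err;
    }
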
@@ -292,6 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
292 inode->i_atime = CURRENT_TIME; 286 inode->i_atime = CURRENT_TIME;
293 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 287 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
294 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 288 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
289 ocfs2_update_inode_fsync_trans(handle, inode, 0);
295 ocfs2_journal_dirty(handle, bh); 290 ocfs2_journal_dirty(handle, bh);
296 291
297out_commit: 292out_commit:
@@ -341,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode,
341 if (ret < 0) 336 if (ret < 0)
342 mlog_errno(ret); 337 mlog_errno(ret);
343 338
339 ocfs2_update_inode_fsync_trans(handle, inode, 0);
344 ocfs2_commit_trans(osb, handle); 340 ocfs2_commit_trans(osb, handle);
345out: 341out:
346 return ret; 342 return ret;
@@ -435,6 +431,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
435 di->i_size = cpu_to_le64(new_i_size); 431 di->i_size = cpu_to_le64(new_i_size);
436 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 432 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
437 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 433 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
434 ocfs2_update_inode_fsync_trans(handle, inode, 0);
438 435
439 ocfs2_journal_dirty(handle, fe_bh); 436 ocfs2_journal_dirty(handle, fe_bh);
440 437
@@ -650,7 +647,7 @@ restarted_transaction:
650 mlog_errno(status); 647 mlog_errno(status);
651 goto leave; 648 goto leave;
652 } 649 }
653 650 ocfs2_update_inode_fsync_trans(handle, inode, 1);
654 ocfs2_journal_dirty(handle, bh); 651 ocfs2_journal_dirty(handle, bh);
655 652
656 spin_lock(&OCFS2_I(inode)->ip_lock); 653 spin_lock(&OCFS2_I(inode)->ip_lock);
@@ -743,6 +740,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
743 OCFS2_JOURNAL_ACCESS_WRITE); 740 OCFS2_JOURNAL_ACCESS_WRITE);
744 if (ret) 741 if (ret)
745 mlog_errno(ret); 742 mlog_errno(ret);
743 ocfs2_update_inode_fsync_trans(handle, inode, 1);
746 744
747out: 745out:
748 if (ret) { 746 if (ret) {
@@ -840,6 +838,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
840 di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 838 di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
841 di->i_mtime_nsec = di->i_ctime_nsec; 839 di->i_mtime_nsec = di->i_ctime_nsec;
842 ocfs2_journal_dirty(handle, di_bh); 840 ocfs2_journal_dirty(handle, di_bh);
841 ocfs2_update_inode_fsync_trans(handle, inode, 1);
843 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 842 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
844 } 843 }
845 844
@@ -1344,6 +1343,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1344 1343
1345 di = (struct ocfs2_dinode *) bh->b_data; 1344 di = (struct ocfs2_dinode *) bh->b_data;
1346 di->i_mode = cpu_to_le16(inode->i_mode); 1345 di->i_mode = cpu_to_le16(inode->i_mode);
1346 ocfs2_update_inode_fsync_trans(handle, inode, 0);
1347 1347
1348 ocfs2_journal_dirty(handle, bh); 1348 ocfs2_journal_dirty(handle, bh);
1349 1349
@@ -1576,6 +1576,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
1576 if (ret) 1576 if (ret)
1577 mlog_errno(ret); 1577 mlog_errno(ret);
1578 } 1578 }
1579 ocfs2_update_inode_fsync_trans(handle, inode, 1);
1579 1580
1580 ocfs2_commit_trans(osb, handle); 1581 ocfs2_commit_trans(osb, handle);
1581out: 1582out:
@@ -2061,13 +2062,6 @@ out:
2061 return ret; 2062 return ret;
2062} 2063}
2063 2064
2064static void ocfs2_aiodio_wait(struct inode *inode)
2065{
2066 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
2067
2068 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
2069}
2070
2071static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) 2065static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2072{ 2066{
2073 int blockmask = inode->i_sb->s_blocksize - 1; 2067 int blockmask = inode->i_sb->s_blocksize - 1;
@@ -2345,10 +2339,8 @@ relock:
2345 * Wait on previous unaligned aio to complete before 2339 * Wait on previous unaligned aio to complete before
2346 * proceeding. 2340 * proceeding.
2347 */ 2341 */
2348 ocfs2_aiodio_wait(inode); 2342 mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
2349 2343 /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
2350 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
2351 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
2352 ocfs2_iocb_set_unaligned_aio(iocb); 2344 ocfs2_iocb_set_unaligned_aio(iocb);
2353 } 2345 }
2354 2346
@@ -2428,7 +2420,7 @@ out_dio:
2428 2420
2429 if (unaligned_dio) { 2421 if (unaligned_dio) {
2430 ocfs2_iocb_clear_unaligned_aio(iocb); 2422 ocfs2_iocb_clear_unaligned_aio(iocb);
2431 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); 2423 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
2432 } 2424 }
2433 2425
2434out: 2426out:
@@ -2645,7 +2637,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2645 case SEEK_SET: 2637 case SEEK_SET:
2646 break; 2638 break;
2647 case SEEK_END: 2639 case SEEK_END:
2648 offset += inode->i_size; 2640 /* SEEK_END requires the OCFS2 inode lock for the file
2641 * because it references the file's size.
2642 */
2643 ret = ocfs2_inode_lock(inode, NULL, 0);
2644 if (ret < 0) {
2645 mlog_errno(ret);
2646 goto out;
2647 }
2648 offset += i_size_read(inode);
2649 ocfs2_inode_unlock(inode, 0);
2649 break; 2650 break;
2650 case SEEK_CUR: 2651 case SEEK_CUR:
2651 if (offset == 0) { 2652 if (offset == 0) {
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f29a90fde619..437de7f768c6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
130 struct inode *inode = NULL; 130 struct inode *inode = NULL;
131 struct super_block *sb = osb->sb; 131 struct super_block *sb = osb->sb;
132 struct ocfs2_find_inode_args args; 132 struct ocfs2_find_inode_args args;
133 journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
133 134
134 trace_ocfs2_iget_begin((unsigned long long)blkno, flags, 135 trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
135 sysfile_type); 136 sysfile_type);
@@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
169 goto bail; 170 goto bail;
170 } 171 }
171 172
173 /*
 174 * Set transaction ids of transactions that have to be committed
 175 * to finish f[data]sync. We set them to the currently running transaction
176 * as we cannot be sure that the inode or some of its metadata isn't
177 * part of the transaction - the inode could have been reclaimed and
178 * now it is reread from disk.
179 */
180 if (journal) {
181 transaction_t *transaction;
182 tid_t tid;
183 struct ocfs2_inode_info *oi = OCFS2_I(inode);
184
185 read_lock(&journal->j_state_lock);
186 if (journal->j_running_transaction)
187 transaction = journal->j_running_transaction;
188 else
189 transaction = journal->j_committing_transaction;
190 if (transaction)
191 tid = transaction->t_tid;
192 else
193 tid = journal->j_commit_sequence;
194 read_unlock(&journal->j_state_lock);
195 oi->i_sync_tid = tid;
196 oi->i_datasync_tid = tid;
197 }
198
172bail: 199bail:
173 if (!IS_ERR(inode)) { 200 if (!IS_ERR(inode)) {
174 trace_ocfs2_iget_end(inode, 201 trace_ocfs2_iget_end(inode,
@@ -804,11 +831,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
804 goto bail; 831 goto bail;
805 } 832 }
806 833
807 /* If we're coming from downconvert_thread we can't go into our own 834 /*
808 * voting [hello, deadlock city!], so unforuntately we just 835 * If we're coming from downconvert_thread we can't go into our own
809 * have to skip deleting this guy. That's OK though because 836 * voting [hello, deadlock city!] so we cannot delete the inode. But
810 * the node who's doing the actual deleting should handle it 837 * since we dropped the last inode ref when downconverting the dentry lock,
811 * anyway. */ 838 * we cannot have the file open and thus the node doing unlink will
839 * take care of deleting the inode.
840 */
812 if (current == osb->dc_task) 841 if (current == osb->dc_task)
813 goto bail; 842 goto bail;
814 843
@@ -822,12 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
822 goto bail_unlock; 851 goto bail_unlock;
823 } 852 }
824 853
825 /* If we have allowd wipe of this inode for another node, it
826 * will be marked here so we can safely skip it. Recovery will
827 * cleanup any inodes we might inadvertently skip here. */
828 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
829 goto bail_unlock;
830
831 ret = 1; 854 ret = 1;
832bail_unlock: 855bail_unlock:
833 spin_unlock(&oi->ip_lock); 856 spin_unlock(&oi->ip_lock);
@@ -941,7 +964,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
941 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 964 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
942 if (sync_data) 965 if (sync_data)
943 filemap_write_and_wait(inode->i_mapping); 966 filemap_write_and_wait(inode->i_mapping);
944 truncate_inode_pages(&inode->i_data, 0); 967 truncate_inode_pages_final(&inode->i_data);
945} 968}
946 969
947static void ocfs2_delete_inode(struct inode *inode) 970static void ocfs2_delete_inode(struct inode *inode)
@@ -960,8 +983,6 @@ static void ocfs2_delete_inode(struct inode *inode)
960 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) 983 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
961 goto bail; 984 goto bail;
962 985
963 dquot_initialize(inode);
964
965 if (!ocfs2_inode_is_valid_to_delete(inode)) { 986 if (!ocfs2_inode_is_valid_to_delete(inode)) {
966 /* It's probably not necessary to truncate_inode_pages 987 /* It's probably not necessary to truncate_inode_pages
967 * here but we do it for safety anyway (it will most 988 * here but we do it for safety anyway (it will most
@@ -970,6 +991,8 @@ static void ocfs2_delete_inode(struct inode *inode)
970 goto bail; 991 goto bail;
971 } 992 }
972 993
994 dquot_initialize(inode);
995
973 /* We want to block signals in delete_inode as the lock and 996 /* We want to block signals in delete_inode as the lock and
974 * messaging paths may return us -ERESTARTSYS. Which would 997 * messaging paths may return us -ERESTARTSYS. Which would
975 * cause us to exit early, resulting in inodes being orphaned 998 * cause us to exit early, resulting in inodes being orphaned
@@ -1057,6 +1080,7 @@ static void ocfs2_clear_inode(struct inode *inode)
1057{ 1080{
1058 int status; 1081 int status;
1059 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1082 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1083 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1060 1084
1061 clear_inode(inode); 1085 clear_inode(inode);
1062 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, 1086 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
@@ -1073,9 +1097,9 @@ static void ocfs2_clear_inode(struct inode *inode)
1073 1097
1074 /* Do these before all the other work so that we don't bounce 1098 /* Do these before all the other work so that we don't bounce
1075 * the downconvert thread while waiting to destroy the locks. */ 1099 * the downconvert thread while waiting to destroy the locks. */
1076 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 1100 ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
1077 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); 1101 ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
1078 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1102 ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
1079 1103
1080 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, 1104 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
1081 &oi->ip_la_data_resv); 1105 &oi->ip_la_data_resv);
@@ -1157,7 +1181,7 @@ void ocfs2_evict_inode(struct inode *inode)
1157 (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { 1181 (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
1158 ocfs2_delete_inode(inode); 1182 ocfs2_delete_inode(inode);
1159 } else { 1183 } else {
1160 truncate_inode_pages(&inode->i_data, 0); 1184 truncate_inode_pages_final(&inode->i_data);
1161 } 1185 }
1162 ocfs2_clear_inode(inode); 1186 ocfs2_clear_inode(inode);
1163} 1187}
@@ -1260,6 +1284,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1260 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1284 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1261 1285
1262 ocfs2_journal_dirty(handle, bh); 1286 ocfs2_journal_dirty(handle, bh);
1287 ocfs2_update_inode_fsync_trans(handle, inode, 1);
1263leave: 1288leave:
1264 return status; 1289 return status;
1265} 1290}
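
The truncate_inode_pages() to truncate_inode_pages_final() conversions above follow the new page-cache convention: once an inode is being evicted for good, the _final variant must be used, since it also marks the mapping so that concurrent lockless page-cache lookups cannot re-add pages behind the truncation. The minimal ->evict_inode() shape, as a sketch assuming no fs-specific state:

    static void my_evict_inode(struct inode *inode)
    {
            /* _final: mapping is dying, truncate and fence new lookups */
            truncate_inode_pages_final(&inode->i_data);
            clear_inode(inode);
            /* ...filesystem-specific teardown would follow... */
    }
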
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 621fc73bf23d..a6c991c0fc98 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -44,7 +44,7 @@ struct ocfs2_inode_info
44 struct rw_semaphore ip_xattr_sem; 44 struct rw_semaphore ip_xattr_sem;
45 45
46 /* Number of outstanding AIO's which are not page aligned */ 46 /* Number of outstanding AIO's which are not page aligned */
47 atomic_t ip_unaligned_aio; 47 struct mutex ip_unaligned_aio;
48 48
49 /* These fields are protected by ip_lock */ 49 /* These fields are protected by ip_lock */
50 spinlock_t ip_lock; 50 spinlock_t ip_lock;
@@ -73,6 +73,13 @@ struct ocfs2_inode_info
73 u32 ip_dir_lock_gen; 73 u32 ip_dir_lock_gen;
74 74
75 struct ocfs2_alloc_reservation ip_la_data_resv; 75 struct ocfs2_alloc_reservation ip_la_data_resv;
76
77 /*
78 * Transactions that contain inode's metadata needed to complete
79 * fsync and fdatasync, respectively.
80 */
81 tid_t i_sync_tid;
82 tid_t i_datasync_tid;
76}; 83};
77 84
78/* 85/*
@@ -84,8 +91,6 @@ struct ocfs2_inode_info
84#define OCFS2_INODE_BITMAP 0x00000004 91#define OCFS2_INODE_BITMAP 0x00000004
85/* This inode has been wiped from disk */ 92/* This inode has been wiped from disk */
86#define OCFS2_INODE_DELETED 0x00000008 93#define OCFS2_INODE_DELETED 0x00000008
87/* Another node is deleting, so our delete is a nop */
88#define OCFS2_INODE_SKIP_DELETE 0x00000010
89/* Has the inode been orphaned on another node? 94/* Has the inode been orphaned on another node?
90 * 95 *
91 * This hints to ocfs2_drop_inode that it should clear i_nlink before 96 * This hints to ocfs2_drop_inode that it should clear i_nlink before
@@ -100,11 +105,11 @@ struct ocfs2_inode_info
100 * rely on ocfs2_delete_inode to sort things out under the proper 105 * rely on ocfs2_delete_inode to sort things out under the proper
101 * cluster locks. 106 * cluster locks.
102 */ 107 */
103#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 108#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010
104/* Does someone have the file open O_DIRECT */ 109/* Does someone have the file open O_DIRECT */
105#define OCFS2_INODE_OPEN_DIRECT 0x00000040 110#define OCFS2_INODE_OPEN_DIRECT 0x00000020
106/* Tell the inode wipe code it's not in orphan dir */ 111/* Tell the inode wipe code it's not in orphan dir */
107#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 112#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
108 113
109static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) 114static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
110{ 115{
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8ca3c29accbf..490229f43731 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -413,11 +413,12 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
413 } 413 }
414 414
415 status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); 415 status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
416 if (status < 0)
417 goto bail;
418 416
419 iput(inode_alloc); 417 iput(inode_alloc);
420 inode_alloc = NULL; 418 inode_alloc = NULL;
419
420 if (status < 0)
421 goto bail;
421 } 422 }
422 423
423 o2info_set_request_filled(&oifi->ifi_req); 424 o2info_set_request_filled(&oifi->ifi_req);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 44fc3e530c3d..03ea9314fecd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -2132,12 +2132,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2132 iter = oi->ip_next_orphan; 2132 iter = oi->ip_next_orphan;
2133 2133
2134 spin_lock(&oi->ip_lock); 2134 spin_lock(&oi->ip_lock);
2135 /* The remote delete code may have set these on the
2136 * assumption that the other node would wipe them
2137 * successfully. If they are still in the node's
2138 * orphan dir, we need to reset that state. */
2139 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
2140
2141 /* Set the proper information to get us going into 2135 /* Set the proper information to get us going into
2142 * ocfs2_delete_inode. */ 2136 * ocfs2_delete_inode. */
2143 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 2137 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 9ff4e8cf9d97..7f8cde94abfe 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -626,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
626 new_size); 626 new_size);
627} 627}
628 628
629static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
630 struct inode *inode,
631 int datasync)
632{
633 struct ocfs2_inode_info *oi = OCFS2_I(inode);
634
635 oi->i_sync_tid = handle->h_transaction->t_tid;
636 if (datasync)
637 oi->i_datasync_tid = handle->h_transaction->t_tid;
638}
639
629#endif /* OCFS2_JOURNAL_H */ 640#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index e57c804069ea..6b6d092b0998 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -82,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
82 } 82 }
83 83
84 ret = flock_lock_file_wait(file, fl); 84 ret = flock_lock_file_wait(file, fl);
85 if (ret)
86 ocfs2_file_unlock(file);
85 87
86out: 88out:
87 mutex_unlock(&fp->fp_mutex); 89 mutex_unlock(&fp->fp_mutex);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 64c304d668f0..599eb4c4c8be 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -151,6 +151,7 @@ static int __ocfs2_move_extent(handle_t *handle,
151 old_blkno, len); 151 old_blkno, len);
152 } 152 }
153 153
154 ocfs2_update_inode_fsync_trans(handle, inode, 0);
154out: 155out:
155 ocfs2_free_path(path); 156 ocfs2_free_path(path);
156 return ret; 157 return ret;
@@ -690,8 +691,11 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
690 691
691 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, 692 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
692 goal_bit, len); 693 goal_bit, len);
693 if (ret) 694 if (ret) {
695 ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
696 le16_to_cpu(gd->bg_chain));
694 mlog_errno(ret); 697 mlog_errno(ret);
698 }
695 699
696 /* 700 /*
697 * Here we should write the new page out first if we are 701 * Here we should write the new page out first if we are
@@ -957,6 +961,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
957 inode->i_ctime = CURRENT_TIME; 961 inode->i_ctime = CURRENT_TIME;
958 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 962 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
959 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 963 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
964 ocfs2_update_inode_fsync_trans(handle, inode, 0);
960 965
961 ocfs2_journal_dirty(handle, di_bh); 966 ocfs2_journal_dirty(handle, di_bh);
962 967
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 3683643f3f0e..2060fc398445 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -450,7 +450,6 @@ leave:
450 450
451 brelse(new_fe_bh); 451 brelse(new_fe_bh);
452 brelse(parent_fe_bh); 452 brelse(parent_fe_bh);
453 kfree(si.name);
454 kfree(si.value); 453 kfree(si.value);
455 454
456 ocfs2_free_dir_lookup_result(&lookup); 455 ocfs2_free_dir_lookup_result(&lookup);
@@ -495,6 +494,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
495 struct ocfs2_dinode *fe = NULL; 494 struct ocfs2_dinode *fe = NULL;
496 struct ocfs2_extent_list *fel; 495 struct ocfs2_extent_list *fel;
497 u16 feat; 496 u16 feat;
497 struct ocfs2_inode_info *oi = OCFS2_I(inode);
498 498
499 *new_fe_bh = NULL; 499 *new_fe_bh = NULL;
500 500
@@ -576,8 +576,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
576 mlog_errno(status); 576 mlog_errno(status);
577 } 577 }
578 578
579 status = 0; /* error in ocfs2_create_new_inode_locks is not 579 oi->i_sync_tid = handle->h_transaction->t_tid;
580 * critical */ 580 oi->i_datasync_tid = handle->h_transaction->t_tid;
581 581
582leave: 582leave:
583 if (status < 0) { 583 if (status < 0) {
@@ -1855,7 +1855,6 @@ bail:
1855 1855
1856 brelse(new_fe_bh); 1856 brelse(new_fe_bh);
1857 brelse(parent_fe_bh); 1857 brelse(parent_fe_bh);
1858 kfree(si.name);
1859 kfree(si.value); 1858 kfree(si.value);
1860 ocfs2_free_dir_lookup_result(&lookup); 1859 ocfs2_free_dir_lookup_result(&lookup);
1861 if (inode_ac) 1860 if (inode_ac)
@@ -2481,6 +2480,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2481 di->i_orphaned_slot = 0; 2480 di->i_orphaned_slot = 0;
2482 set_nlink(inode, 1); 2481 set_nlink(inode, 1);
2483 ocfs2_set_links_count(di, inode->i_nlink); 2482 ocfs2_set_links_count(di, inode->i_nlink);
2483 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2484 ocfs2_journal_dirty(handle, di_bh); 2484 ocfs2_journal_dirty(handle, di_bh);
2485 2485
2486 status = ocfs2_add_entry(handle, dentry, inode, 2486 status = ocfs2_add_entry(handle, dentry, inode,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 553f53cc73ae..8d64a97a9d5e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -30,6 +30,7 @@
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/wait.h> 31#include <linux/wait.h>
32#include <linux/list.h> 32#include <linux/list.h>
33#include <linux/llist.h>
33#include <linux/rbtree.h> 34#include <linux/rbtree.h>
34#include <linux/workqueue.h> 35#include <linux/workqueue.h>
35#include <linux/kref.h> 36#include <linux/kref.h>
@@ -274,19 +275,16 @@ enum ocfs2_mount_options
274 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ 275 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
275}; 276};
276 277
277#define OCFS2_OSB_SOFT_RO 0x0001 278#define OCFS2_OSB_SOFT_RO 0x0001
278#define OCFS2_OSB_HARD_RO 0x0002 279#define OCFS2_OSB_HARD_RO 0x0002
279#define OCFS2_OSB_ERROR_FS 0x0004 280#define OCFS2_OSB_ERROR_FS 0x0004
280#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 281#define OCFS2_DEFAULT_ATIME_QUANTUM 60
281
282#define OCFS2_DEFAULT_ATIME_QUANTUM 60
283 282
284struct ocfs2_journal; 283struct ocfs2_journal;
285struct ocfs2_slot_info; 284struct ocfs2_slot_info;
286struct ocfs2_recovery_map; 285struct ocfs2_recovery_map;
287struct ocfs2_replay_map; 286struct ocfs2_replay_map;
288struct ocfs2_quota_recovery; 287struct ocfs2_quota_recovery;
289struct ocfs2_dentry_lock;
290struct ocfs2_super 288struct ocfs2_super
291{ 289{
292 struct task_struct *commit_task; 290 struct task_struct *commit_task;
@@ -414,10 +412,9 @@ struct ocfs2_super
414 struct list_head blocked_lock_list; 412 struct list_head blocked_lock_list;
415 unsigned long blocked_lock_count; 413 unsigned long blocked_lock_count;
416 414
417 /* List of dentry locks to release. Anyone can add locks to 415 /* List of dquot structures to drop last reference to */
418 * the list, ocfs2_wq processes the list */ 416 struct llist_head dquot_drop_list;
419 struct ocfs2_dentry_lock *dentry_lock_list; 417 struct work_struct dquot_drop_work;
420 struct work_struct dentry_lock_work;
421 418
422 wait_queue_head_t osb_mount_event; 419 wait_queue_head_t osb_mount_event;
423 420
@@ -449,6 +446,8 @@ struct ocfs2_super
449 /* rb tree root for refcount lock. */ 446 /* rb tree root for refcount lock. */
450 struct rb_root osb_rf_lock_tree; 447 struct rb_root osb_rf_lock_tree;
451 struct ocfs2_refcount_tree *osb_ref_tree_lru; 448 struct ocfs2_refcount_tree *osb_ref_tree_lru;
449
450 struct mutex system_file_mutex;
452}; 451};
453 452
454#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 453#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -579,18 +578,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
579 spin_unlock(&osb->osb_lock); 578 spin_unlock(&osb->osb_lock);
580} 579}
581 580
582
583static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
584 unsigned long flag)
585{
586 unsigned long ret;
587
588 spin_lock(&osb->osb_lock);
589 ret = osb->osb_flags & flag;
590 spin_unlock(&osb->osb_lock);
591 return ret;
592}
593
594static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, 581static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
595 int hard) 582 int hard)
596{ 583{
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index d5ab56cbe5c5..f266d67df3c6 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -28,6 +28,7 @@ struct ocfs2_dquot {
28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
29 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
30 s64 dq_originodes; /* Last globally synced inode usage */ 30 s64 dq_originodes; /* Last globally synced inode usage */
31 struct llist_node list; /* Member of list of dquots to drop */
31}; 32};
32 33
33/* Description of one chunk to recover in memory */ 34/* Description of one chunk to recover in memory */
@@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
110int ocfs2_create_local_dquot(struct dquot *dquot); 111int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); 112int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot); 113int ocfs2_local_write_dquot(struct dquot *dquot);
114void ocfs2_drop_dquot_refs(struct work_struct *work);
113 115
114extern const struct dquot_operations ocfs2_quota_operations; 116extern const struct dquot_operations ocfs2_quota_operations;
115extern struct quota_format_type ocfs2_quota_format; 117extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index d7b5108789e2..b990a62cff50 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -10,6 +10,7 @@
10#include <linux/jiffies.h> 10#include <linux/jiffies.h>
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/workqueue.h> 12#include <linux/workqueue.h>
13#include <linux/llist.h>
13 14
14#include <cluster/masklog.h> 15#include <cluster/masklog.h>
15 16
@@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
679 OCFS2_INODE_UPDATE_CREDITS; 680 OCFS2_INODE_UPDATE_CREDITS;
680} 681}
681 682
683void ocfs2_drop_dquot_refs(struct work_struct *work)
684{
685 struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
686 dquot_drop_work);
687 struct llist_node *list;
688 struct ocfs2_dquot *odquot, *next_odquot;
689
690 list = llist_del_all(&osb->dquot_drop_list);
691 llist_for_each_entry_safe(odquot, next_odquot, list, list) {
692 /* Drop the reference we acquired in ocfs2_dquot_release() */
693 dqput(&odquot->dq_dquot);
694 }
695}
696
697/*
698 * Called when the last reference to dquot is dropped. If we are called from
699 * downconvert thread, we cannot do all the handling here because grabbing
700 * quota lock could deadlock (the node holding the quota lock could need some
701 * other cluster lock to proceed but with blocked downconvert thread we cannot
702 * release any lock).
703 */
682static int ocfs2_release_dquot(struct dquot *dquot) 704static int ocfs2_release_dquot(struct dquot *dquot)
683{ 705{
684 handle_t *handle; 706 handle_t *handle;
@@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot)
694 /* Check whether we are not racing with some other dqget() */ 716 /* Check whether we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1) 717 if (atomic_read(&dquot->dq_count) > 1)
696 goto out; 718 goto out;
719 /* Running from downconvert thread? Postpone quota processing to wq */
720 if (current == osb->dc_task) {
721 /*
722 * Grab our own reference to dquot and queue it for delayed
723 * dropping. Quota code rechecks after calling
724 * ->release_dquot() and won't free dquot structure.
725 */
726 dqgrab(dquot);
727 /* First entry on list -> queue work */
728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
729 queue_work(ocfs2_wq, &osb->dquot_drop_work);
730 goto out;
731 }
697 status = ocfs2_lock_global_qf(oinfo, 1); 732 status = ocfs2_lock_global_qf(oinfo, 1);
698 if (status < 0) 733 if (status < 0)
699 goto out; 734 goto out;
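
ocfs2_release_dquot() above defers the final dqput() to a workqueue when running in the downconvert thread, using a lock-free llist. The trick is that llist_add() returns true only when it installed the first node on an empty list, so the work item is queued exactly once per batch, and the worker drains everything at once with llist_del_all(). A self-contained sketch of the pattern with hypothetical names:

    struct my_obj {
            struct kref ref;
            struct llist_node lnode;
    };

    static LLIST_HEAD(my_drop_list);
    static struct workqueue_struct *my_wq;          /* created at init */
    static void my_obj_free(struct kref *ref);      /* final release hook */

    static void my_drop_worker(struct work_struct *work)
    {
            struct llist_node *batch = llist_del_all(&my_drop_list);
            struct my_obj *obj, *next;

            llist_for_each_entry_safe(obj, next, batch, lnode)
                    kref_put(&obj->ref, my_obj_free);
    }
    static DECLARE_WORK(my_drop_work, my_drop_worker);

    /* Safe from contexts that must not sleep or take other locks */
    static void my_defer_release(struct my_obj *obj)
    {
            if (llist_add(&obj->lnode, &my_drop_list))
                    queue_work(my_wq, &my_drop_work);
    }
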
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index ca5ce14cbddc..5c8343fe7438 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -603,11 +603,25 @@ static struct kobj_attribute ocfs2_attr_cluster_stack =
603 ocfs2_cluster_stack_show, 603 ocfs2_cluster_stack_show,
604 ocfs2_cluster_stack_store); 604 ocfs2_cluster_stack_store);
605 605
606
607
608static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
609 struct kobj_attribute *attr,
610 char *buf)
611{
612 return snprintf(buf, PAGE_SIZE, "1\n");
613}
614
615static struct kobj_attribute ocfs2_attr_dlm_recover_support =
616 __ATTR(dlm_recover_callback_support, S_IRUGO,
617 ocfs2_dlm_recover_show, NULL);
618
606static struct attribute *ocfs2_attrs[] = { 619static struct attribute *ocfs2_attrs[] = {
607 &ocfs2_attr_max_locking_protocol.attr, 620 &ocfs2_attr_max_locking_protocol.attr,
608 &ocfs2_attr_loaded_cluster_plugins.attr, 621 &ocfs2_attr_loaded_cluster_plugins.attr,
609 &ocfs2_attr_active_cluster_plugin.attr, 622 &ocfs2_attr_active_cluster_plugin.attr,
610 &ocfs2_attr_cluster_stack.attr, 623 &ocfs2_attr_cluster_stack.attr,
624 &ocfs2_attr_dlm_recover_support.attr,
611 NULL, 625 NULL,
612}; 626};
613 627
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 47ae2663a6f5..0cb889a17ae1 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -771,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
771 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 771 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
772 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 772 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
773 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 773 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
774 ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
774 775
775 status = 0; 776 status = 0;
776 777
@@ -1607,6 +1608,21 @@ out:
1607 return ret; 1608 return ret;
1608} 1609}
1609 1610
1611void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1612 struct buffer_head *di_bh,
1613 u32 num_bits,
1614 u16 chain)
1615{
1616 u32 tmp_used;
1617 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1618 struct ocfs2_chain_list *cl;
1619
1620 cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1621 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1622 di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1623 le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1624}
1625
1610static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, 1626static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1611 struct ocfs2_extent_rec *rec, 1627 struct ocfs2_extent_rec *rec,
1612 struct ocfs2_chain_list *cl) 1628 struct ocfs2_chain_list *cl)
@@ -1707,8 +1723,12 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1707 1723
1708 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1724 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1709 res->sr_bit_offset, res->sr_bits); 1725 res->sr_bit_offset, res->sr_bits);
1710 if (ret < 0) 1726 if (ret < 0) {
1727 ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1728 res->sr_bits,
1729 le16_to_cpu(gd->bg_chain));
1711 mlog_errno(ret); 1730 mlog_errno(ret);
1731 }
1712 1732
1713out_loc_only: 1733out_loc_only:
1714 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1734 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
@@ -1838,6 +1858,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1838 res->sr_bit_offset, 1858 res->sr_bit_offset,
1839 res->sr_bits); 1859 res->sr_bits);
1840 if (status < 0) { 1860 if (status < 0) {
1861 ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1862 ac->ac_bh, res->sr_bits, chain);
1841 mlog_errno(status); 1863 mlog_errno(status);
1842 goto bail; 1864 goto bail;
1843 } 1865 }
@@ -2091,7 +2113,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir,
2091 2113
2092 ac->ac_find_loc_priv = res; 2114 ac->ac_find_loc_priv = res;
2093 *fe_blkno = res->sr_blkno; 2115 *fe_blkno = res->sr_blkno;
2094 2116 ocfs2_update_inode_fsync_trans(handle, dir, 0);
2095out: 2117out:
2096 if (handle) 2118 if (handle)
2097 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); 2119 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
@@ -2149,6 +2171,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2149 res->sr_bit_offset, 2171 res->sr_bit_offset,
2150 res->sr_bits); 2172 res->sr_bits);
2151 if (ret < 0) { 2173 if (ret < 0) {
2174 ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
2175 ac->ac_bh, res->sr_bits, chain);
2152 mlog_errno(ret); 2176 mlog_errno(ret);
2153 goto out; 2177 goto out;
2154 } 2178 }
@@ -2870,6 +2894,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2870 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); 2894 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2871 if (status < 0) { 2895 if (status < 0) {
2872 mutex_unlock(&inode_alloc_inode->i_mutex); 2896 mutex_unlock(&inode_alloc_inode->i_mutex);
2897 iput(inode_alloc_inode);
2873 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", 2898 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2874 (u32)suballoc_slot, status); 2899 (u32)suballoc_slot, status);
2875 goto bail; 2900 goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 218d8036b3e7..2d2501767c0c 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -91,6 +91,10 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode,
91 struct buffer_head *di_bh, 91 struct buffer_head *di_bh,
92 u32 num_bits, 92 u32 num_bits,
93 u16 chain); 93 u16 chain);
94void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
95 struct buffer_head *di_bh,
96 u32 num_bits,
97 u16 chain);
94int ocfs2_block_group_set_bits(handle_t *handle, 98int ocfs2_block_group_set_bits(handle_t *handle,
95 struct inode *alloc_inode, 99 struct inode *alloc_inode,
96 struct ocfs2_group_desc *bg, 100 struct ocfs2_group_desc *bg,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 49d84f80f36c..a7cdd56f4c79 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
561 if (!oi) 561 if (!oi)
562 return NULL; 562 return NULL;
563 563
564 oi->i_sync_tid = 0;
565 oi->i_datasync_tid = 0;
566
564 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); 567 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
565 return &oi->vfs_inode; 568 return &oi->vfs_inode;
566} 569}
@@ -631,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
631 struct ocfs2_super *osb = OCFS2_SB(sb); 634 struct ocfs2_super *osb = OCFS2_SB(sb);
632 u32 tmp; 635 u32 tmp;
633 636
637 sync_filesystem(sb);
638
634 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
635 !ocfs2_check_set_options(sb, &parsed_options)) { 640 !ocfs2_check_set_options(sb, &parsed_options)) {
636 ret = -EINVAL; 641 ret = -EINVAL;
@@ -1238,30 +1243,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
1238 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); 1243 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
1239} 1244}
1240 1245
1241static void ocfs2_kill_sb(struct super_block *sb)
1242{
1243 struct ocfs2_super *osb = OCFS2_SB(sb);
1244
1245 /* Failed mount? */
1246 if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
1247 goto out;
1248
1249 /* Prevent further queueing of inode drop events */
1250 spin_lock(&dentry_list_lock);
1251 ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
1252 spin_unlock(&dentry_list_lock);
1253 /* Wait for work to finish and/or remove it */
1254 cancel_work_sync(&osb->dentry_lock_work);
1255out:
1256 kill_block_super(sb);
1257}
1258
1259static struct file_system_type ocfs2_fs_type = { 1246static struct file_system_type ocfs2_fs_type = {
1260 .owner = THIS_MODULE, 1247 .owner = THIS_MODULE,
1261 .name = "ocfs2", 1248 .name = "ocfs2",
1262 .mount = ocfs2_mount, 1249 .mount = ocfs2_mount,
1263 .kill_sb = ocfs2_kill_sb, 1250 .kill_sb = kill_block_super,
1264
1265 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1251 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
1266 .next = NULL 1252 .next = NULL
1267}; 1253};
@@ -1612,14 +1598,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1612 return 0; 1598 return 0;
1613} 1599}
1614 1600
1615wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
1616
1617static int __init ocfs2_init(void) 1601static int __init ocfs2_init(void)
1618{ 1602{
1619 int status, i; 1603 int status;
1620
1621 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1622 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1623 1604
1624 status = init_ocfs2_uptodate_cache(); 1605 status = init_ocfs2_uptodate_cache();
1625 if (status < 0) 1606 if (status < 0)
@@ -1761,7 +1742,7 @@ static void ocfs2_inode_init_once(void *data)
1761 ocfs2_extent_map_init(&oi->vfs_inode); 1742 ocfs2_extent_map_init(&oi->vfs_inode);
1762 INIT_LIST_HEAD(&oi->ip_io_markers); 1743 INIT_LIST_HEAD(&oi->ip_io_markers);
1763 oi->ip_dir_start_lookup = 0; 1744 oi->ip_dir_start_lookup = 0;
1764 atomic_set(&oi->ip_unaligned_aio, 0); 1745 mutex_init(&oi->ip_unaligned_aio);
1765 init_rwsem(&oi->ip_alloc_sem); 1746 init_rwsem(&oi->ip_alloc_sem);
1766 init_rwsem(&oi->ip_xattr_sem); 1747 init_rwsem(&oi->ip_xattr_sem);
1767 mutex_init(&oi->ip_io_mutex); 1748 mutex_init(&oi->ip_io_mutex);
@@ -1932,17 +1913,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1932 1913
1933 debugfs_remove(osb->osb_ctxt); 1914 debugfs_remove(osb->osb_ctxt);
1934 1915
1935 /*
1936 * Flush inode dropping work queue so that deletes are
1937 * performed while the filesystem is still working
1938 */
1939 ocfs2_drop_all_dl_inodes(osb);
1940
1941 /* Orphan scan should be stopped as early as possible */ 1916 /* Orphan scan should be stopped as early as possible */
1942 ocfs2_orphan_scan_stop(osb); 1917 ocfs2_orphan_scan_stop(osb);
1943 1918
1944 ocfs2_disable_quotas(osb); 1919 ocfs2_disable_quotas(osb);
1945 1920
1921 /* All dquots should be freed by now */
1922 WARN_ON(!llist_empty(&osb->dquot_drop_list));
1923 /* Wait for worker to be done with the work structure in osb */
1924 cancel_work_sync(&osb->dquot_drop_work);
1925
1946 ocfs2_shutdown_local_alloc(osb); 1926 ocfs2_shutdown_local_alloc(osb);
1947 1927
1948 /* This will disable recovery and flush any recovery work. */ 1928 /* This will disable recovery and flush any recovery work. */
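The dentry_lock_work machinery removed earlier (including ocfs2_kill_sb) gives way to a lock-free llist of dquots whose references must be dropped from process context; dismount now just drains and cancels that one work item. A sketch of the drain side of such a pattern, assuming ocfs2_drop_dquot_refs() walks dquot_drop_list with llist_del_all() — member names beyond those visible in the diff are hypothetical:

static void ocfs2_drop_dquot_refs(struct work_struct *work)
{
	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
					       dquot_drop_work);
	struct llist_node *list = llist_del_all(&osb->dquot_drop_list);
	struct ocfs2_dquot *odquot, *next;

	/* Drop references queued by contexts that could not call dqput(). */
	llist_for_each_entry_safe(odquot, next, list, list)	/* 'list' member is hypothetical */
		dqput(&odquot->dq_dquot);
}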
@@ -2077,7 +2057,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2077 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 2057 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2078 struct inode *inode = NULL; 2058 struct inode *inode = NULL;
2079 struct ocfs2_journal *journal; 2059 struct ocfs2_journal *journal;
2080 __le32 uuid_net_key;
2081 struct ocfs2_super *osb; 2060 struct ocfs2_super *osb;
2082 u64 total_blocks; 2061 u64 total_blocks;
2083 2062
@@ -2123,6 +2102,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2123 spin_lock_init(&osb->osb_xattr_lock); 2102 spin_lock_init(&osb->osb_xattr_lock);
2124 ocfs2_init_steal_slots(osb); 2103 ocfs2_init_steal_slots(osb);
2125 2104
2105 mutex_init(&osb->system_file_mutex);
2106
2126 atomic_set(&osb->alloc_stats.moves, 0); 2107 atomic_set(&osb->alloc_stats.moves, 0);
2127 atomic_set(&osb->alloc_stats.local_data, 0); 2108 atomic_set(&osb->alloc_stats.local_data, 0);
2128 atomic_set(&osb->alloc_stats.bitmap_data, 0); 2109 atomic_set(&osb->alloc_stats.bitmap_data, 0);
@@ -2276,8 +2257,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2276 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 2257 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
2277 journal->j_state = OCFS2_JOURNAL_FREE; 2258 journal->j_state = OCFS2_JOURNAL_FREE;
2278 2259
2279 INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); 2260 INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
2280 osb->dentry_lock_list = NULL; 2261 init_llist_head(&osb->dquot_drop_list);
2281 2262
2282 /* get some pseudo constants for clustersize bits */ 2263 /* get some pseudo constants for clustersize bits */
2283 osb->s_clustersize_bits = 2264 osb->s_clustersize_bits =
@@ -2311,8 +2292,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2311 goto bail; 2292 goto bail;
2312 } 2293 }
2313 2294
2314 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
2315
2316 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 2295 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
2317 osb->vol_label[63] = '\0'; 2296 osb->vol_label[63] = '\0';
2318 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); 2297 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index f053688d22a3..af155c183123 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -113,9 +113,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
113 } else 113 } else
114 arr = get_local_system_inode(osb, type, slot); 114 arr = get_local_system_inode(osb, type, slot);
115 115
116 mutex_lock(&osb->system_file_mutex);
116 if (arr && ((inode = *arr) != NULL)) { 117 if (arr && ((inode = *arr) != NULL)) {
117 /* get a ref in addition to the array ref */ 118 /* get a ref in addition to the array ref */
118 inode = igrab(inode); 119 inode = igrab(inode);
120 mutex_unlock(&osb->system_file_mutex);
119 BUG_ON(!inode); 121 BUG_ON(!inode);
120 122
121 return inode; 123 return inode;
@@ -129,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
129 *arr = igrab(inode); 131 *arr = igrab(inode);
130 BUG_ON(!*arr); 132 BUG_ON(!*arr);
131 } 133 }
134 mutex_unlock(&osb->system_file_mutex);
132 return inode; 135 return inode;
133} 136}
134 137
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 185fa3b7f962..016f01df3825 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -369,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
369 * them fully. 369 * them fully.
370 */ 370 */
371static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, 371static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
372 u64 xb_blkno) 372 u64 xb_blkno, int new)
373{ 373{
374 int i, rc = 0; 374 int i, rc = 0;
375 375
@@ -383,9 +383,16 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
383 } 383 }
384 384
385 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), 385 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
386 bucket->bu_bhs[i])) 386 bucket->bu_bhs[i])) {
387 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), 387 if (new)
388 bucket->bu_bhs[i]); 388 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
389 bucket->bu_bhs[i]);
390 else {
391 set_buffer_uptodate(bucket->bu_bhs[i]);
392 ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
393 bucket->bu_bhs[i]);
394 }
395 }
389 } 396 }
390 397
391 if (rc) 398 if (rc)
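For reference, the three call-site flavors the new flag produces later in this file; passing 0 for a bucket that already exists avoids ocfs2_set_new_buffer_uptodate(), which (judging by its name and the split above) must not be applied to buffers already tracked in the uptodate cache:

	ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);		/* brand-new bucket */
	ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);	/* maybe reused */
	ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);	/* copy target */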
@@ -2602,6 +2609,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
2602 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); 2609 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
2603 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2610 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2604 spin_unlock(&oi->ip_lock); 2611 spin_unlock(&oi->ip_lock);
2612 ocfs2_update_inode_fsync_trans(handle, inode, 0);
2605 2613
2606 ocfs2_journal_dirty(handle, di_bh); 2614 ocfs2_journal_dirty(handle, di_bh);
2607out_commit: 2615out_commit:
@@ -3200,8 +3208,15 @@ meta_guess:
3200 clusters_add += 1; 3208 clusters_add += 1;
3201 } 3209 }
3202 } else { 3210 } else {
3203 meta_add += 1;
3204 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 3211 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
3212 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
3213 struct ocfs2_extent_list *el = &def_xv.xv.xr_list;
3214 meta_add += ocfs2_extend_meta_needed(el);
3215 credits += ocfs2_calc_extend_credits(inode->i_sb,
3216 el);
3217 } else {
3218 meta_add += 1;
3219 }
3205 } 3220 }
3206out: 3221out:
3207 if (clusters_need) 3222 if (clusters_need)
@@ -3614,6 +3629,7 @@ int ocfs2_xattr_set(struct inode *inode,
3614 } 3629 }
3615 3630
3616 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); 3631 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
3632 ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);
3617 3633
3618 ocfs2_commit_trans(osb, ctxt.handle); 3634 ocfs2_commit_trans(osb, ctxt.handle);
3619 3635
@@ -4294,7 +4310,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4294 4310
4295 trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); 4311 trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
4296 4312
4297 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); 4313 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);
4298 if (ret) { 4314 if (ret) {
4299 mlog_errno(ret); 4315 mlog_errno(ret);
4300 goto out; 4316 goto out;
@@ -4638,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4638 * Even if !new_bucket_head, we're overwriting t_bucket. Thus, 4654 * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
4639 * there's no need to read it. 4655 * there's no need to read it.
4640 */ 4656 */
4641 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); 4657 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);
4642 if (ret) { 4658 if (ret) {
4643 mlog_errno(ret); 4659 mlog_errno(ret);
4644 goto out; 4660 goto out;
@@ -4804,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
4804 * Even if !t_is_new, we're overwriting t_bucket. Thus, 4820 * Even if !t_is_new, we're overwriting t_bucket. Thus,
4805 * there's no need to read it. 4821 * there's no need to read it.
4806 */ 4822 */
4807 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); 4823 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);
4808 if (ret) 4824 if (ret)
4809 goto out; 4825 goto out;
4810 4826
@@ -5476,6 +5492,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
5476 ret = ocfs2_truncate_log_append(osb, handle, blkno, len); 5492 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
5477 if (ret) 5493 if (ret)
5478 mlog_errno(ret); 5494 mlog_errno(ret);
5495 ocfs2_update_inode_fsync_trans(handle, inode, 0);
5479 5496
5480out_commit: 5497out_commit:
5481 ocfs2_commit_trans(osb, handle); 5498 ocfs2_commit_trans(osb, handle);
@@ -6830,7 +6847,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle,
6830 break; 6847 break;
6831 } 6848 }
6832 6849
6833 ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); 6850 ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);
6834 if (ret) { 6851 if (ret) {
6835 mlog_errno(ret); 6852 mlog_errno(ret);
6836 break; 6853 break;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index d8b0afde2179..ec58c7659183 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -183,7 +183,7 @@ int omfs_sync_inode(struct inode *inode)
183 */ 183 */
184static void omfs_evict_inode(struct inode *inode) 184static void omfs_evict_inode(struct inode *inode)
185{ 185{
186 truncate_inode_pages(&inode->i_data, 0); 186 truncate_inode_pages_final(&inode->i_data);
187 clear_inode(inode); 187 clear_inode(inode);
188 188
189 if (inode->i_nlink) 189 if (inode->i_nlink)
diff --git a/fs/open.c b/fs/open.c
index b9ed8b25c108..631aea815def 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -231,7 +231,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
231 return -EINVAL; 231 return -EINVAL;
232 232
233 /* Return error if mode is not supported */ 233 /* Return error if mode is not supported */
234 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 234 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
235 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
236 return -EOPNOTSUPP;
237
238 /* Punch hole and zero range are mutually exclusive */
239 if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
240 (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
235 return -EOPNOTSUPP; 241 return -EOPNOTSUPP;
236 242
237 /* Punch hole must have keep size set */ 243 /* Punch hole must have keep size set */
@@ -239,11 +245,20 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
239 !(mode & FALLOC_FL_KEEP_SIZE)) 245 !(mode & FALLOC_FL_KEEP_SIZE))
240 return -EOPNOTSUPP; 246 return -EOPNOTSUPP;
241 247
248 /* Collapse range should only be used exclusively. */
249 if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
250 (mode & ~FALLOC_FL_COLLAPSE_RANGE))
251 return -EINVAL;
252
242 if (!(file->f_mode & FMODE_WRITE)) 253 if (!(file->f_mode & FMODE_WRITE))
243 return -EBADF; 254 return -EBADF;
244 255
245 /* It's not possible punch hole on append only file */ 256 /*
246 if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) 257 * It's not possible to punch hole or perform collapse range
258 * on an append-only file
259 */
260 if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)
261 && IS_APPEND(inode))
247 return -EPERM; 262 return -EPERM;
248 263
249 if (IS_IMMUTABLE(inode)) 264 if (IS_IMMUTABLE(inode))
@@ -271,6 +286,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
271 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 286 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
272 return -EFBIG; 287 return -EFBIG;
273 288
289 /*
290 * A collapse range must not reach or extend past EOF; collapsing
291 * such a range would effectively be a truncate operation
292 */
293 if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
294 (offset + len >= i_size_read(inode)))
295 return -EINVAL;
296
274 if (!file->f_op->fallocate) 297 if (!file->f_op->fallocate)
275 return -EOPNOTSUPP; 298 return -EOPNOTSUPP;
276 299
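Read from the caller's side, the new checks say: FALLOC_FL_COLLAPSE_RANGE must be used alone, must not reach EOF, and (like punch hole) is refused on append-only files, while punch hole and zero range are mutually exclusive. A hedged userspace sketch against a filesystem that implements both new modes (this series wires up ext4 and XFS; offsets usually must be block-aligned):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("data.bin", O_RDWR);
	if (fd < 0)
		return 1;

	/* Remove bytes [4096, 8192) and shift the tail down: the file shrinks. */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 4096) < 0)
		perror("collapse");	/* EINVAL if combined with any other flag
					 * or if offset+len reaches EOF */

	/* Zero a range in place without changing the file size. */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
		      0, 4096) < 0)
		perror("zero_range");

	close(fd);
	return 0;
}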
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 8c0ceb8dd1f7..15e4500cda3e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
368 368
369static int openprom_remount(struct super_block *sb, int *flags, char *data) 369static int openprom_remount(struct super_block *sb, int *flags, char *data)
370{ 370{
371 sync_filesystem(sb);
371 *flags |= MS_NOATIME; 372 *flags |= MS_NOATIME;
372 return 0; 373 return 0;
373} 374}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 11c54fd51e16..9e363e41dacc 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -723,7 +723,7 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
723 void *buffer, size_t size) 723 void *buffer, size_t size)
724{ 724{
725 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; 725 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
726 posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; 726 posix_acl_xattr_entry *ext_entry;
727 int real_size, n; 727 int real_size, n;
728 728
729 real_size = posix_acl_xattr_size(acl->a_count); 729 real_size = posix_acl_xattr_size(acl->a_count);
@@ -731,7 +731,8 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
731 return real_size; 731 return real_size;
732 if (real_size > size) 732 if (real_size > size)
733 return -ERANGE; 733 return -ERANGE;
734 734
735 ext_entry = ext_acl->a_entries;
735 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); 736 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
736 737
737 for (n=0; n < acl->a_count; n++, ext_entry++) { 738 for (n=0; n < acl->a_count; n++, ext_entry++) {
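Moving the ext_entry assignment below the size checks matters because callers may legally pass buffer == NULL to query the required size, and the old code dereferenced ext_acl->a_entries before that case was handled. The usual two-call pattern (sketch; error handling elided):

	int size = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);	/* size query */
	void *buf = NULL;

	if (size > 0)
		buf = kmalloc(size, GFP_KERNEL);
	if (buf)
		size = posix_acl_to_xattr(&init_user_ns, acl, buf, size);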
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ab30716584f5..239493ec718e 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -27,6 +27,5 @@ proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
27proc-$(CONFIG_NET) += proc_net.o 27proc-$(CONFIG_NET) += proc_net.o
28proc-$(CONFIG_PROC_KCORE) += kcore.o 28proc-$(CONFIG_PROC_KCORE) += kcore.o
29proc-$(CONFIG_PROC_VMCORE) += vmcore.o 29proc-$(CONFIG_PROC_VMCORE) += vmcore.o
30proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
31proc-$(CONFIG_PRINTK) += kmsg.o 30proc-$(CONFIG_PRINTK) += kmsg.o
32proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o 31proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 124fc43c7090..8f20e3404fd2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,7 +35,7 @@ static void proc_evict_inode(struct inode *inode)
35 const struct proc_ns_operations *ns_ops; 35 const struct proc_ns_operations *ns_ops;
36 void *ns; 36 void *ns;
37 37
38 truncate_inode_pages(&inode->i_data, 0); 38 truncate_inode_pages_final(&inode->i_data);
39 clear_inode(inode); 39 clear_inode(inode);
40 40
41 /* Stop tracking associated processes */ 41 /* Stop tracking associated processes */
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 651d09a11dde..3ab6d14e71c5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -211,13 +211,6 @@ extern int proc_fill_super(struct super_block *);
211extern void proc_entry_rundown(struct proc_dir_entry *); 211extern void proc_entry_rundown(struct proc_dir_entry *);
212 212
213/* 213/*
214 * proc_devtree.c
215 */
216#ifdef CONFIG_PROC_DEVICETREE
217extern void proc_device_tree_init(void);
218#endif
219
220/*
221 * proc_namespaces.c 214 * proc_namespaces.c
222 */ 215 */
223extern const struct inode_operations proc_ns_dir_inode_operations; 216extern const struct inode_operations proc_ns_dir_inode_operations;
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
deleted file mode 100644
index c82dd5147845..000000000000
--- a/fs/proc/proc_devtree.c
+++ /dev/null
@@ -1,241 +0,0 @@
1/*
2 * proc_devtree.c - handles /proc/device-tree
3 *
4 * Copyright 1997 Paul Mackerras
5 */
6#include <linux/errno.h>
7#include <linux/init.h>
8#include <linux/time.h>
9#include <linux/proc_fs.h>
10#include <linux/seq_file.h>
11#include <linux/printk.h>
12#include <linux/stat.h>
13#include <linux/string.h>
14#include <linux/of.h>
15#include <linux/export.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h>
18#include "internal.h"
19
20static inline void set_node_proc_entry(struct device_node *np,
21 struct proc_dir_entry *de)
22{
23 np->pde = de;
24}
25
26static struct proc_dir_entry *proc_device_tree;
27
28/*
29 * Supply data on a read from /proc/device-tree/node/property.
30 */
31static int property_proc_show(struct seq_file *m, void *v)
32{
33 struct property *pp = m->private;
34
35 seq_write(m, pp->value, pp->length);
36 return 0;
37}
38
39static int property_proc_open(struct inode *inode, struct file *file)
40{
41 return single_open(file, property_proc_show, __PDE_DATA(inode));
42}
43
44static const struct file_operations property_proc_fops = {
45 .owner = THIS_MODULE,
46 .open = property_proc_open,
47 .read = seq_read,
48 .llseek = seq_lseek,
49 .release = single_release,
50};
51
52/*
53 * For a node with a name like "gc@10", we make symlinks called "gc"
54 * and "@10" to it.
55 */
56
57/*
58 * Add a property to a node
59 */
60static struct proc_dir_entry *
61__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
62 const char *name)
63{
64 struct proc_dir_entry *ent;
65
66 /*
67 * Unfortunately proc_register puts each new entry
68 * at the beginning of the list. So we rearrange them.
69 */
70 ent = proc_create_data(name,
71 strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
72 de, &property_proc_fops, pp);
73 if (ent == NULL)
74 return NULL;
75
76 if (!strncmp(name, "security-", 9))
77 proc_set_size(ent, 0); /* don't leak number of password chars */
78 else
79 proc_set_size(ent, pp->length);
80
81 return ent;
82}
83
84
85void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop)
86{
87 __proc_device_tree_add_prop(pde, prop, prop->name);
88}
89
90void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
91 struct property *prop)
92{
93 remove_proc_entry(prop->name, pde);
94}
95
96void proc_device_tree_update_prop(struct proc_dir_entry *pde,
97 struct property *newprop,
98 struct property *oldprop)
99{
100 struct proc_dir_entry *ent;
101
102 if (!oldprop) {
103 proc_device_tree_add_prop(pde, newprop);
104 return;
105 }
106
107 for (ent = pde->subdir; ent != NULL; ent = ent->next)
108 if (ent->data == oldprop)
109 break;
110 if (ent == NULL) {
111 pr_warn("device-tree: property \"%s\" does not exist\n",
112 oldprop->name);
113 } else {
114 ent->data = newprop;
115 ent->size = newprop->length;
116 }
117}
118
119/*
120 * Various dodgy firmware might give us nodes and/or properties with
121 * conflicting names. That's generally ok, except for exporting via /proc,
122 * so munge names here to ensure they're unique.
123 */
124
125static int duplicate_name(struct proc_dir_entry *de, const char *name)
126{
127 struct proc_dir_entry *ent;
128 int found = 0;
129
130 spin_lock(&proc_subdir_lock);
131
132 for (ent = de->subdir; ent != NULL; ent = ent->next) {
133 if (strcmp(ent->name, name) == 0) {
134 found = 1;
135 break;
136 }
137 }
138
139 spin_unlock(&proc_subdir_lock);
140
141 return found;
142}
143
144static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de,
145 const char *name)
146{
147 char *fixed_name;
148 int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */
149 int i = 1, size;
150
151realloc:
152 fixed_name = kmalloc(fixup_len, GFP_KERNEL);
153 if (fixed_name == NULL) {
154 pr_err("device-tree: Out of memory trying to fixup "
155 "name \"%s\"\n", name);
156 return name;
157 }
158
159retry:
160 size = snprintf(fixed_name, fixup_len, "%s#%d", name, i);
161 size++; /* account for NULL */
162
163 if (size > fixup_len) {
164 /* We ran out of space, free and reallocate. */
165 kfree(fixed_name);
166 fixup_len = size;
167 goto realloc;
168 }
169
170 if (duplicate_name(de, fixed_name)) {
171 /* Multiple duplicates. Retry with a different offset. */
172 i++;
173 goto retry;
174 }
175
176 pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n",
177 np->full_name, fixed_name);
178
179 return fixed_name;
180}
181
182/*
183 * Process a node, adding entries for its children and its properties.
184 */
185void proc_device_tree_add_node(struct device_node *np,
186 struct proc_dir_entry *de)
187{
188 struct property *pp;
189 struct proc_dir_entry *ent;
190 struct device_node *child;
191 const char *p;
192
193 set_node_proc_entry(np, de);
194 for (child = NULL; (child = of_get_next_child(np, child));) {
195 /* Use everything after the last slash, or the full name */
196 p = kbasename(child->full_name);
197
198 if (duplicate_name(de, p))
199 p = fixup_name(np, de, p);
200
201 ent = proc_mkdir(p, de);
202 if (ent == NULL)
203 break;
204 proc_device_tree_add_node(child, ent);
205 }
206 of_node_put(child);
207
208 for (pp = np->properties; pp != NULL; pp = pp->next) {
209 p = pp->name;
210
211 if (strchr(p, '/'))
212 continue;
213
214 if (duplicate_name(de, p))
215 p = fixup_name(np, de, p);
216
217 ent = __proc_device_tree_add_prop(de, pp, p);
218 if (ent == NULL)
219 break;
220 }
221}
222
223/*
224 * Called on initialization to set up the /proc/device-tree subtree
225 */
226void __init proc_device_tree_init(void)
227{
228 struct device_node *root;
229
230 proc_device_tree = proc_mkdir("device-tree", NULL);
231 if (proc_device_tree == NULL)
232 return;
233 root = of_find_node_by_path("/");
234 if (root == NULL) {
235 remove_proc_entry("device-tree", NULL);
236 pr_debug("/proc/device-tree: can't find root\n");
237 return;
238 }
239 proc_device_tree_add_node(root, proc_device_tree);
240 of_node_put(root);
241}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 87dbcbef7fe4..5dbadecb234d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
92int proc_remount(struct super_block *sb, int *flags, char *data) 92int proc_remount(struct super_block *sb, int *flags, char *data)
93{ 93{
94 struct pid_namespace *pid = sb->s_fs_info; 94 struct pid_namespace *pid = sb->s_fs_info;
95
96 sync_filesystem(sb);
95 return !proc_parse_options(data, pid); 97 return !proc_parse_options(data, pid);
96} 98}
97 99
@@ -183,9 +185,6 @@ void __init proc_root_init(void)
183 proc_mkdir("openprom", NULL); 185 proc_mkdir("openprom", NULL);
184#endif 186#endif
185 proc_tty_init(); 187 proc_tty_init();
186#ifdef CONFIG_PROC_DEVICETREE
187 proc_device_tree_init();
188#endif
189 proc_mkdir("bus", NULL); 188 proc_mkdir("bus", NULL);
190 proc_sys_init(); 189 proc_sys_init();
191} 190}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6f599c62f0cc..9d231e9e5f0e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,7 +9,7 @@
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/irqnr.h> 11#include <linux/irqnr.h>
12#include <asm/cputime.h> 12#include <linux/cputime.h>
13#include <linux/tick.h> 13#include <linux/tick.h>
14 14
15#ifndef arch_irq_stat_cpu 15#ifndef arch_irq_stat_cpu
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 7141b8d0ca9e..33de567c25af 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,7 +5,7 @@
5#include <linux/seq_file.h> 5#include <linux/seq_file.h>
6#include <linux/time.h> 6#include <linux/time.h>
7#include <linux/kernel_stat.h> 7#include <linux/kernel_stat.h>
8#include <asm/cputime.h> 8#include <linux/cputime.h>
9 9
10static int uptime_proc_show(struct seq_file *m, void *v) 10static int uptime_proc_show(struct seq_file *m, void *v)
11{ 11{
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 12823845d324..192297b0090d 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -249,6 +249,7 @@ static void parse_options(char *options)
249 249
250static int pstore_remount(struct super_block *sb, int *flags, char *data) 250static int pstore_remount(struct super_block *sb, int *flags, char *data)
251{ 251{
252 sync_filesystem(sb);
252 parse_options(data); 253 parse_options(data);
253 254
254 return 0; 255 return 0;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 78c3c2097787..46d269e38706 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -497,6 +497,7 @@ void pstore_get_records(int quiet)
497 big_oops_buf_sz); 497 big_oops_buf_sz);
498 498
499 if (unzipped_len > 0) { 499 if (unzipped_len > 0) {
500 kfree(buf);
500 buf = big_oops_buf; 501 buf = big_oops_buf;
501 size = unzipped_len; 502 size = unzipped_len;
502 compressed = false; 503 compressed = false;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index fa8cef2cca3a..3b5744306ed8 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -86,6 +86,7 @@ struct ramoops_context {
86 struct persistent_ram_ecc_info ecc_info; 86 struct persistent_ram_ecc_info ecc_info;
87 unsigned int max_dump_cnt; 87 unsigned int max_dump_cnt;
88 unsigned int dump_write_cnt; 88 unsigned int dump_write_cnt;
89 /* the *_read_cnt counters need to be cleared in ramoops_pstore_open() */
89 unsigned int dump_read_cnt; 90 unsigned int dump_read_cnt;
90 unsigned int console_read_cnt; 91 unsigned int console_read_cnt;
91 unsigned int ftrace_read_cnt; 92 unsigned int ftrace_read_cnt;
@@ -101,6 +102,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
101 102
102 cxt->dump_read_cnt = 0; 103 cxt->dump_read_cnt = 0;
103 cxt->console_read_cnt = 0; 104 cxt->console_read_cnt = 0;
105 cxt->ftrace_read_cnt = 0;
104 return 0; 106 return 0;
105} 107}
106 108
@@ -117,13 +119,15 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
117 return NULL; 119 return NULL;
118 120
119 prz = przs[i]; 121 prz = przs[i];
122 if (!prz)
123 return NULL;
120 124
121 if (update) { 125 /* Update old/shadowed buffer. */
122 /* Update old/shadowed buffer. */ 126 if (update)
123 persistent_ram_save_old(prz); 127 persistent_ram_save_old(prz);
124 if (!persistent_ram_old_size(prz)) 128
125 return NULL; 129 if (!persistent_ram_old_size(prz))
126 } 130 return NULL;
127 131
128 *typep = type; 132 *typep = type;
129 *id = i; 133 *id = i;
@@ -316,6 +320,7 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
316{ 320{
317 int i; 321 int i;
318 322
323 cxt->max_dump_cnt = 0;
319 if (!cxt->przs) 324 if (!cxt->przs)
320 return; 325 return;
321 326
@@ -346,7 +351,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
346 GFP_KERNEL); 351 GFP_KERNEL);
347 if (!cxt->przs) { 352 if (!cxt->przs) {
348 dev_err(dev, "failed to initialize a prz array for dumps\n"); 353 dev_err(dev, "failed to initialize a prz array for dumps\n");
349 return -ENOMEM; 354 goto fail_prz;
350 } 355 }
351 356
352 for (i = 0; i < cxt->max_dump_cnt; i++) { 357 for (i = 0; i < cxt->max_dump_cnt; i++) {
@@ -428,7 +433,6 @@ static int ramoops_probe(struct platform_device *pdev)
428 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) 433 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
429 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); 434 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
430 435
431 cxt->dump_read_cnt = 0;
432 cxt->size = pdata->mem_size; 436 cxt->size = pdata->mem_size;
433 cxt->phys_addr = pdata->mem_address; 437 cxt->phys_addr = pdata->mem_address;
434 cxt->record_size = pdata->record_size; 438 cxt->record_size = pdata->record_size;
@@ -505,7 +509,6 @@ fail_buf:
505 kfree(cxt->pstore.buf); 509 kfree(cxt->pstore.buf);
506fail_clear: 510fail_clear:
507 cxt->pstore.bufsize = 0; 511 cxt->pstore.bufsize = 0;
508 cxt->max_dump_cnt = 0;
509fail_cnt: 512fail_cnt:
510 kfree(cxt->fprz); 513 kfree(cxt->fprz);
511fail_init_fprz: 514fail_init_fprz:
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index de272d426763..ff7e3d4df5a1 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -54,7 +54,7 @@ static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
54 do { 54 do {
55 old = atomic_read(&prz->buffer->start); 55 old = atomic_read(&prz->buffer->start);
56 new = old + a; 56 new = old + a;
57 while (unlikely(new > prz->buffer_size)) 57 while (unlikely(new >= prz->buffer_size))
58 new -= prz->buffer_size; 58 new -= prz->buffer_size;
59 } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old); 59 } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old);
60 60
@@ -91,7 +91,7 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
91 91
92 old = atomic_read(&prz->buffer->start); 92 old = atomic_read(&prz->buffer->start);
93 new = old + a; 93 new = old + a;
94 while (unlikely(new > prz->buffer_size)) 94 while (unlikely(new >= prz->buffer_size))
95 new -= prz->buffer_size; 95 new -= prz->buffer_size;
96 atomic_set(&prz->buffer->start, new); 96 atomic_set(&prz->buffer->start, new);
97 97
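The '>' to '>=' change in both helpers is a classic ring-buffer off-by-one: start indexes a buffer of buffer_size bytes, so the value buffer_size itself is already out of bounds and must wrap to 0. In isolation:

/* Invariant: 0 <= start < buffer_size */
size_t advance(size_t start, size_t a, size_t buffer_size)
{
	size_t new = start + a;

	while (new >= buffer_size)	/* new == buffer_size must wrap too */
		new -= buffer_size;
	return new;
}
/* e.g. buffer_size = 8, start = 6, a = 2: new = 8. The old '>' test
 * left that untouched, producing an out-of-range start of 8. */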
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 89558810381c..c4bcb778886e 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -44,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
44{ 44{
45 struct qnx4_sb_info *qs; 45 struct qnx4_sb_info *qs;
46 46
47 sync_filesystem(sb);
47 qs = qnx4_sb(sb); 48 qs = qnx4_sb(sb);
48 qs->Version = QNX4_VERSION; 49 qs->Version = QNX4_VERSION;
49 *flags |= MS_RDONLY; 50 *flags |= MS_RDONLY;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 8d941edfefa1..65cdaab3ed49 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
55 55
56static int qnx6_remount(struct super_block *sb, int *flags, char *data) 56static int qnx6_remount(struct super_block *sb, int *flags, char *data)
57{ 57{
58 sync_filesystem(sb);
58 *flags |= MS_RDONLY; 59 *flags |= MS_RDONLY;
59 return 0; 60 return 0;
60} 61}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index cfc8dcc16043..9cd5f63715c0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -528,7 +528,7 @@ restart:
528 if (atomic_read(&dquot->dq_count)) { 528 if (atomic_read(&dquot->dq_count)) {
529 DEFINE_WAIT(wait); 529 DEFINE_WAIT(wait);
530 530
531 atomic_inc(&dquot->dq_count); 531 dqgrab(dquot);
532 prepare_to_wait(&dquot->dq_wait_unused, &wait, 532 prepare_to_wait(&dquot->dq_wait_unused, &wait,
533 TASK_UNINTERRUPTIBLE); 533 TASK_UNINTERRUPTIBLE);
534 spin_unlock(&dq_list_lock); 534 spin_unlock(&dq_list_lock);
@@ -632,7 +632,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
632 /* Now we have active dquot from which someone is 632 /* Now we have active dquot from which someone is
633 * holding reference so we can safely just increase 633 * holding reference so we can safely just increase
634 * use count */ 634 * use count */
635 atomic_inc(&dquot->dq_count); 635 dqgrab(dquot);
636 spin_unlock(&dq_list_lock); 636 spin_unlock(&dq_list_lock);
637 dqstats_inc(DQST_LOOKUPS); 637 dqstats_inc(DQST_LOOKUPS);
638 err = sb->dq_op->write_dquot(dquot); 638 err = sb->dq_op->write_dquot(dquot);
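Both call sites hold dq_list_lock and have just verified that the dquot is still referenced, so a cheap "grab another reference" helper suffices; dqget() would be wrong here since it can sleep and re-look-up. A sketch of what dqgrab() amounts to (the real inline lives in include/linux/quotaops.h, outside this diff):

/* Take an extra reference on a dquot known to be active and already
 * referenced; never use this in place of dqget(). */
static inline void dqgrab(struct dquot *dquot)
{
	atomic_inc(&dquot->dq_count);
}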
diff --git a/fs/read_write.c b/fs/read_write.c
index 28cc9c810744..31c6efa43183 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -994,9 +994,9 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
994 return ret; 994 return ret;
995} 995}
996 996
997COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, 997static long __compat_sys_preadv64(unsigned long fd,
998 const struct compat_iovec __user *,vec, 998 const struct compat_iovec __user *vec,
999 unsigned long, vlen, loff_t, pos) 999 unsigned long vlen, loff_t pos)
1000{ 1000{
1001 struct fd f; 1001 struct fd f;
1002 ssize_t ret; 1002 ssize_t ret;
@@ -1013,12 +1013,22 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1013 return ret; 1013 return ret;
1014} 1014}
1015 1015
1016#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1017COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1018 const struct compat_iovec __user *,vec,
1019 unsigned long, vlen, loff_t, pos)
1020{
1021 return __compat_sys_preadv64(fd, vec, vlen, pos);
1022}
1023#endif
1024
1016COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, 1025COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1017 const struct compat_iovec __user *,vec, 1026 const struct compat_iovec __user *,vec,
1018 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1027 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1019{ 1028{
1020 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1029 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1021 return compat_sys_preadv64(fd, vec, vlen, pos); 1030
1031 return __compat_sys_preadv64(fd, vec, vlen, pos);
1022} 1032}
1023 1033
1024static size_t compat_writev(struct file *file, 1034static size_t compat_writev(struct file *file,
@@ -1061,9 +1071,9 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1061 return ret; 1071 return ret;
1062} 1072}
1063 1073
1064COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, 1074static long __compat_sys_pwritev64(unsigned long fd,
1065 const struct compat_iovec __user *,vec, 1075 const struct compat_iovec __user *vec,
1066 unsigned long, vlen, loff_t, pos) 1076 unsigned long vlen, loff_t pos)
1067{ 1077{
1068 struct fd f; 1078 struct fd f;
1069 ssize_t ret; 1079 ssize_t ret;
@@ -1080,12 +1090,22 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1080 return ret; 1090 return ret;
1081} 1091}
1082 1092
1093#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1094COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1095 const struct compat_iovec __user *,vec,
1096 unsigned long, vlen, loff_t, pos)
1097{
1098 return __compat_sys_pwritev64(fd, vec, vlen, pos);
1099}
1100#endif
1101
1083COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, 1102COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1084 const struct compat_iovec __user *,vec, 1103 const struct compat_iovec __user *,vec,
1085 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1104 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1086{ 1105{
1087 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1106 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1088 return compat_sys_pwritev64(fd, vec, vlen, pos); 1107
1108 return __compat_sys_pwritev64(fd, vec, vlen, pos);
1089} 1109}
1090#endif 1110#endif
1091 1111
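Funneling both entry points through a static helper removes the direct in-kernel call to a COMPAT_SYSCALL_DEFINE symbol, which only exists on architectures that ask for it; an architecture opts in to the 64-bit compat variants roughly like this (sketch; the exact header varies per arch):

/* arch/<arch>/include/asm/unistd.h */
#define __ARCH_WANT_COMPAT_SYS_PREADV64
#define __ARCH_WANT_COMPAT_SYS_PWRITEV64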
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ad62bdbb451e..bc8b8009897d 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -35,7 +35,7 @@ void reiserfs_evict_inode(struct inode *inode)
35 if (!inode->i_nlink && !is_bad_inode(inode)) 35 if (!inode->i_nlink && !is_bad_inode(inode))
36 dquot_initialize(inode); 36 dquot_initialize(inode);
37 37
38 truncate_inode_pages(&inode->i_data, 0); 38 truncate_inode_pages_final(&inode->i_data);
39 if (inode->i_nlink) 39 if (inode->i_nlink)
40 goto no_delete; 40 goto no_delete;
41 41
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 8d06adf89948..83d4eac8059a 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2831,6 +2831,7 @@ void reiserfs_init_alloc_options(struct super_block *s);
2831 */ 2831 */
2832__le32 reiserfs_choose_packing(struct inode *dir); 2832__le32 reiserfs_choose_packing(struct inode *dir);
2833 2833
2834void show_alloc_options(struct seq_file *seq, struct super_block *s);
2834int reiserfs_init_bitmap_cache(struct super_block *sb); 2835int reiserfs_init_bitmap_cache(struct super_block *sb);
2835void reiserfs_free_bitmap_cache(struct super_block *sb); 2836void reiserfs_free_bitmap_cache(struct super_block *sb);
2836void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); 2837void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2c803353f8ac..9fb20426005e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -62,7 +62,6 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
62 62
63static int reiserfs_remount(struct super_block *s, int *flags, char *data); 63static int reiserfs_remount(struct super_block *s, int *flags, char *data);
64static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); 64static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
65void show_alloc_options(struct seq_file *seq, struct super_block *s);
66 65
67static int reiserfs_sync_fs(struct super_block *s, int wait) 66static int reiserfs_sync_fs(struct super_block *s, int wait)
68{ 67{
@@ -597,7 +596,7 @@ static void init_once(void *foo)
597 inode_init_once(&ei->vfs_inode); 596 inode_init_once(&ei->vfs_inode);
598} 597}
599 598
600static int init_inodecache(void) 599static int __init init_inodecache(void)
601{ 600{
602 reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", 601 reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
603 sizeof(struct 602 sizeof(struct
@@ -1319,6 +1318,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1319 int i; 1318 int i;
1320#endif 1319#endif
1321 1320
1321 sync_filesystem(s);
1322 reiserfs_write_lock(s); 1322 reiserfs_write_lock(s);
1323 1323
1324#ifdef CONFIG_QUOTA 1324#ifdef CONFIG_QUOTA
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index d8418782862b..ef90e8bca95a 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
432 */ 432 */
433static int romfs_remount(struct super_block *sb, int *flags, char *data) 433static int romfs_remount(struct super_block *sb, int *flags, char *data)
434{ 434{
435 sync_filesystem(sb);
435 *flags |= MS_RDONLY; 436 *flags |= MS_RDONLY;
436 return 0; 437 return 0;
437} 438}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 202df6312d4e..031c8d67fd51 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
371 371
372static int squashfs_remount(struct super_block *sb, int *flags, char *data) 372static int squashfs_remount(struct super_block *sb, int *flags, char *data)
373{ 373{
374 sync_filesystem(sb);
374 *flags |= MS_RDONLY; 375 *flags |= MS_RDONLY;
375 return 0; 376 return 0;
376} 377}
diff --git a/fs/super.c b/fs/super.c
index 80d5cf2ca765..e9dc3c3fe159 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -719,8 +719,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
719 } 719 }
720 } 720 }
721 721
722 sync_filesystem(sb);
723
724 if (sb->s_op->remount_fs) { 722 if (sb->s_op->remount_fs) {
725 retval = sb->s_op->remount_fs(sb, &flags, data); 723 retval = sb->s_op->remount_fs(sb, &flags, data);
726 if (retval) { 724 if (retval) {
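This do_remount_sb() hunk is the pivot of the whole series: the unconditional pre-remount sync leaves the VFS, and every filesystem that needs its dirty state flushed before option parsing now calls sync_filesystem() itself, as the dozens of one-line hunks above and below show. The minimal handler shape that results (sketch; examplefs is hypothetical):

static int examplefs_remount(struct super_block *sb, int *flags, char *data)
{
	sync_filesystem(sb);	/* formerly done unconditionally by do_remount_sb() */
	*flags |= MS_RDONLY;	/* filesystem-specific option handling goes here */
	return 0;
}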
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index 8c41feacbac5..b2756014508c 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,6 +1,7 @@
1config SYSFS 1config SYSFS
2 bool "sysfs file system support" if EXPERT 2 bool "sysfs file system support" if EXPERT
3 default y 3 default y
4 select KERNFS
4 help 5 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to 6 The sysfs filesystem is a virtual filesystem that the kernel uses to
6 export internal kernel objects, their attributes, and their 7 export internal kernel objects, their attributes, and their
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ee0d761c3179..0b45ff42f374 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -19,39 +19,18 @@
19 19
20DEFINE_SPINLOCK(sysfs_symlink_target_lock); 20DEFINE_SPINLOCK(sysfs_symlink_target_lock);
21 21
22/**
23 * sysfs_pathname - return full path to sysfs dirent
24 * @kn: kernfs_node whose path we want
25 * @path: caller allocated buffer of size PATH_MAX
26 *
27 * Gives the name "/" to the sysfs_root entry; any path returned
28 * is relative to wherever sysfs is mounted.
29 */
30static char *sysfs_pathname(struct kernfs_node *kn, char *path)
31{
32 if (kn->parent) {
33 sysfs_pathname(kn->parent, path);
34 strlcat(path, "/", PATH_MAX);
35 }
36 strlcat(path, kn->name, PATH_MAX);
37 return path;
38}
39
40void sysfs_warn_dup(struct kernfs_node *parent, const char *name) 22void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
41{ 23{
42 char *path; 24 char *buf, *path = NULL;
43 25
44 path = kzalloc(PATH_MAX, GFP_KERNEL); 26 buf = kzalloc(PATH_MAX, GFP_KERNEL);
45 if (path) { 27 if (buf)
46 sysfs_pathname(parent, path); 28 path = kernfs_path(parent, buf, PATH_MAX);
47 strlcat(path, "/", PATH_MAX);
48 strlcat(path, name, PATH_MAX);
49 }
50 29
51 WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n", 30 WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n",
52 path ? path : name); 31 path, name);
53 32
54 kfree(path); 33 kfree(buf);
55} 34}
56 35
57/** 36/**
@@ -122,9 +101,13 @@ void sysfs_remove_dir(struct kobject *kobj)
122int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, 101int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
123 const void *new_ns) 102 const void *new_ns)
124{ 103{
125 struct kernfs_node *parent = kobj->sd->parent; 104 struct kernfs_node *parent;
105 int ret;
126 106
127 return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); 107 parent = kernfs_get_parent(kobj->sd);
108 ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
109 kernfs_put(parent);
110 return ret;
128} 111}
129 112
130int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, 113int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
@@ -133,7 +116,6 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
133 struct kernfs_node *kn = kobj->sd; 116 struct kernfs_node *kn = kobj->sd;
134 struct kernfs_node *new_parent; 117 struct kernfs_node *new_parent;
135 118
136 BUG_ON(!kn->parent);
137 new_parent = new_parent_kobj && new_parent_kobj->sd ? 119 new_parent = new_parent_kobj && new_parent_kobj->sd ?
138 new_parent_kobj->sd : sysfs_root_kn; 120 new_parent_kobj->sd : sysfs_root_kn;
139 121
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 810cf6e613e5..1b8b91b67fdb 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -372,6 +372,29 @@ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
372} 372}
373EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); 373EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
374 374
375/**
376 * sysfs_remove_file_self - remove an object attribute from its own method
377 * @kobj: object we're acting for
378 * @attr: attribute descriptor
379 *
380 * See kernfs_remove_self() for details.
381 */
382bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
383{
384 struct kernfs_node *parent = kobj->sd;
385 struct kernfs_node *kn;
386 bool ret;
387
388 kn = kernfs_find_and_get(parent, attr->name);
389 if (WARN_ON_ONCE(!kn))
390 return false;
391
392 ret = kernfs_remove_self(kn);
393
394 kernfs_put(kn);
395 return ret;
396}
397
375void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) 398void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
376{ 399{
377 int i; 400 int i;
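sysfs_remove_file_self() exists so an attribute's own store() method can delete that attribute without deadlocking on its active reference; kernfs_remove_self() arbitrates the self-removal race so teardown runs at most once. Hedged usage sketch (names hypothetical):

static ssize_t delete_store(struct kobject *kobj, struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_remove_file_self(kobj, &attr->attr))
		example_teardown(kobj);	/* hypothetical: only one writer gets here */
	return count;
}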
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 6b579387c67a..aa0406895b53 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -70,8 +70,11 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
70 if (grp->bin_attrs) { 70 if (grp->bin_attrs) {
71 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { 71 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
72 if (update) 72 if (update)
73 sysfs_remove_bin_file(kobj, *bin_attr); 73 kernfs_remove_by_name(parent,
74 error = sysfs_create_bin_file(kobj, *bin_attr); 74 (*bin_attr)->attr.name);
75 error = sysfs_add_file_mode_ns(parent,
76 &(*bin_attr)->attr, true,
77 (*bin_attr)->attr.mode, NULL);
75 if (error) 78 if (error)
76 break; 79 break;
77 } 80 }
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 3eaf5c6622eb..a66ad6196f59 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -63,7 +63,7 @@ int __init sysfs_init(void)
63{ 63{
64 int err; 64 int err;
65 65
66 sysfs_root = kernfs_create_root(NULL, NULL); 66 sysfs_root = kernfs_create_root(NULL, 0, NULL);
67 if (IS_ERR(sysfs_root)) 67 if (IS_ERR(sysfs_root))
68 return PTR_ERR(sysfs_root); 68 return PTR_ERR(sysfs_root);
69 69
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index c327d4ee1235..88956309cc86 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
60{ 60{
61 struct sysv_sb_info *sbi = SYSV_SB(sb); 61 struct sysv_sb_info *sbi = SYSV_SB(sb);
62 62
63 sync_filesystem(sb);
63 if (sbi->s_forced_ro) 64 if (sbi->s_forced_ro)
64 *flags |= MS_RDONLY; 65 *flags |= MS_RDONLY;
65 return 0; 66 return 0;
@@ -295,7 +296,7 @@ int sysv_sync_inode(struct inode *inode)
295 296
296static void sysv_evict_inode(struct inode *inode) 297static void sysv_evict_inode(struct inode *inode)
297{ 298{
298 truncate_inode_pages(&inode->i_data, 0); 299 truncate_inode_pages_final(&inode->i_data);
299 if (!inode->i_nlink) { 300 if (!inode->i_nlink) {
300 inode->i_size = 0; 301 inode->i_size = 0;
301 sysv_truncate(inode); 302 sysv_truncate(inode);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 929312180dd0..0013142c0475 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -317,6 +317,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
317 (clockid != CLOCK_MONOTONIC && 317 (clockid != CLOCK_MONOTONIC &&
318 clockid != CLOCK_REALTIME && 318 clockid != CLOCK_REALTIME &&
319 clockid != CLOCK_REALTIME_ALARM && 319 clockid != CLOCK_REALTIME_ALARM &&
320 clockid != CLOCK_BOOTTIME &&
320 clockid != CLOCK_BOOTTIME_ALARM)) 321 clockid != CLOCK_BOOTTIME_ALARM))
321 return -EINVAL; 322 return -EINVAL;
322 323
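From userspace the change simply makes a fourth clock valid for timerfd_create(); CLOCK_BOOTTIME behaves like CLOCK_MONOTONIC but keeps counting across suspend:

#include <sys/timerfd.h>

int fd = timerfd_create(CLOCK_BOOTTIME, TFD_CLOEXEC);
/* Before this patch plain CLOCK_BOOTTIME failed with EINVAL here,
 * even though CLOCK_BOOTTIME_ALARM was already accepted. */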
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 5ded8490c0c6..a1266089eca1 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -351,7 +351,7 @@ static void ubifs_evict_inode(struct inode *inode)
351 dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); 351 dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
352 ubifs_assert(!atomic_read(&inode->i_count)); 352 ubifs_assert(!atomic_read(&inode->i_count));
353 353
354 truncate_inode_pages(&inode->i_data, 0); 354 truncate_inode_pages_final(&inode->i_data);
355 355
356 if (inode->i_nlink) 356 if (inode->i_nlink)
357 goto done; 357 goto done;
@@ -1827,6 +1827,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1827 int err; 1827 int err;
1828 struct ubifs_info *c = sb->s_fs_info; 1828 struct ubifs_info *c = sb->s_fs_info;
1829 1829
1830 sync_filesystem(sb);
1830 dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); 1831 dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
1831 1832
1832 err = ubifs_parse_options(c, data, 1); 1833 err = ubifs_parse_options(c, data, 1);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 982ce05c87ed..5d643706212f 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -146,8 +146,8 @@ void udf_evict_inode(struct inode *inode)
146 want_delete = 1; 146 want_delete = 1;
147 udf_setsize(inode, 0); 147 udf_setsize(inode, 0);
148 udf_update_inode(inode, IS_SYNC(inode)); 148 udf_update_inode(inode, IS_SYNC(inode));
149 } else 149 }
150 truncate_inode_pages(&inode->i_data, 0); 150 truncate_inode_pages_final(&inode->i_data);
151 invalidate_inode_buffers(inode); 151 invalidate_inode_buffers(inode);
152 clear_inode(inode); 152 clear_inode(inode);
153 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 153 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3306b9f69bed..64f2b7334d08 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -646,6 +646,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
646 int error = 0; 646 int error = 0;
647 struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); 647 struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
648 648
649 sync_filesystem(sb);
649 if (lvidiu) { 650 if (lvidiu) {
650 int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); 651 int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
651 if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) 652 if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c8ca96086784..61e8a9b021dd 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -885,7 +885,7 @@ void ufs_evict_inode(struct inode * inode)
885 if (!inode->i_nlink && !is_bad_inode(inode)) 885 if (!inode->i_nlink && !is_bad_inode(inode))
886 want_delete = 1; 886 want_delete = 1;
887 887
888 truncate_inode_pages(&inode->i_data, 0); 888 truncate_inode_pages_final(&inode->i_data);
889 if (want_delete) { 889 if (want_delete) {
890 loff_t old_i_size; 890 loff_t old_i_size;
891 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 891 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 329f2f53b7ed..b8c6791f046f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1280,6 +1280,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1280 unsigned new_mount_opt, ufstype; 1280 unsigned new_mount_opt, ufstype;
1281 unsigned flags; 1281 unsigned flags;
1282 1282
1283 sync_filesystem(sb);
1283 lock_ufs(sb); 1284 lock_ufs(sb);
1284 mutex_lock(&UFS_SB(sb)->s_lock); 1285 mutex_lock(&UFS_SB(sb)->s_lock);
1285 uspi = UFS_SB(sb)->s_uspi; 1286 uspi = UFS_SB(sb)->s_uspi;
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 66a36befc5c0..844e288b9576 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -65,12 +65,31 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
65void * 65void *
66kmem_zalloc_large(size_t size, xfs_km_flags_t flags) 66kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
67{ 67{
68 unsigned noio_flag = 0;
68 void *ptr; 69 void *ptr;
70 gfp_t lflags;
69 71
70 ptr = kmem_zalloc(size, flags | KM_MAYFAIL); 72 ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
71 if (ptr) 73 if (ptr)
72 return ptr; 74 return ptr;
73 return vzalloc(size); 75
76 /*
77 * __vmalloc() will allocate data pages and auxiliary structures (e.g.
78 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
79 * here. Hence we need to tell memory reclaim that we are in such a
80 * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
81 * the filesystem here and potentially deadlocking.
82 */
83 if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
84 noio_flag = memalloc_noio_save();
85
86 lflags = kmem_flags_convert(flags);
87 ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
88
89 if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
90 memalloc_noio_restore(noio_flag);
91
92 return ptr;
74} 93}
75 94
76void 95void
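The save/restore pair above is the generic recipe for calling an allocator that internally uses GFP_KERNEL from GFP_NOFS context; condensed into a standalone helper (sketch; in_fs_context stands in for the PF_FSTRANS/KM_NOFS tests):

static void *fs_safe_vzalloc(size_t size, bool in_fs_context)
{
	unsigned noio_flag = 0;
	void *ptr;

	if (in_fs_context)
		noio_flag = memalloc_noio_save();  /* reclaim must not recurse into FS/IO */

	ptr = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);

	if (in_fs_context)
		memalloc_noio_restore(noio_flag);
	return ptr;
}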
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 0ecec1896f25..6888ad886ff6 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -281,7 +281,7 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
281 if (!acl) 281 if (!acl)
282 goto set_acl; 282 goto set_acl;
283 283
284 error = -EINVAL; 284 error = -E2BIG;
285 if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) 285 if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
286 return error; 286 return error;
287 287
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 3fc109819c34..0fdd4109c624 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -89,6 +89,8 @@ typedef struct xfs_agf {
89 /* structure must be padded to 64 bit alignment */ 89 /* structure must be padded to 64 bit alignment */
90} xfs_agf_t; 90} xfs_agf_t;
91 91
92#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
93
92#define XFS_AGF_MAGICNUM 0x00000001 94#define XFS_AGF_MAGICNUM 0x00000001
93#define XFS_AGF_VERSIONNUM 0x00000002 95#define XFS_AGF_VERSIONNUM 0x00000002
94#define XFS_AGF_SEQNO 0x00000004 96#define XFS_AGF_SEQNO 0x00000004
@@ -167,6 +169,8 @@ typedef struct xfs_agi {
167 /* structure must be padded to 64 bit alignment */ 169 /* structure must be padded to 64 bit alignment */
168} xfs_agi_t; 170} xfs_agi_t;
169 171
172#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
173
170#define XFS_AGI_MAGICNUM 0x00000001 174#define XFS_AGI_MAGICNUM 0x00000001
171#define XFS_AGI_VERSIONNUM 0x00000002 175#define XFS_AGI_VERSIONNUM 0x00000002
172#define XFS_AGI_SEQNO 0x00000004 176#define XFS_AGI_SEQNO 0x00000004
@@ -222,6 +226,8 @@ typedef struct xfs_agfl {
222 __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ 226 __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
223} xfs_agfl_t; 227} xfs_agfl_t;
224 228
229#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
230
225/* 231/*
226 * tags for inode radix tree 232 * tags for inode radix tree
227 */ 233 */
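These *_CRC_OFF constants feed the new xfs_buf_verify_cksum()/xfs_buf_update_cksum() helpers used throughout the verifier rewrites that follow, which also start distinguishing a bad on-disk CRC (EFSBADCRC) from a structurally invalid block (EFSCORRUPTED) and report both through one xfs_verifier_error() path. The resulting read-verifier shape, abstracted from the xfs_agf hunk below (example_structure_ok is hypothetical):

static void example_read_verify(struct xfs_buf *bp)
{
	if (!xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
		xfs_buf_ioerror(bp, EFSBADCRC);		/* media-level damage */
	else if (!example_structure_ok(bp))
		xfs_buf_ioerror(bp, EFSCORRUPTED);	/* on-disk contents invalid */

	if (bp->b_error)
		xfs_verifier_error(bp);			/* common corruption report */
}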
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 9eab2dfdcbb5..c1cf6a336a72 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -474,7 +474,6 @@ xfs_agfl_read_verify(
474 struct xfs_buf *bp) 474 struct xfs_buf *bp)
475{ 475{
476 struct xfs_mount *mp = bp->b_target->bt_mount; 476 struct xfs_mount *mp = bp->b_target->bt_mount;
477 int agfl_ok = 1;
478 477
479 /* 478 /*
480 * There is no verification of non-crc AGFLs because mkfs does not 479 * There is no verification of non-crc AGFLs because mkfs does not
@@ -485,15 +484,13 @@ xfs_agfl_read_verify(
485 if (!xfs_sb_version_hascrc(&mp->m_sb)) 484 if (!xfs_sb_version_hascrc(&mp->m_sb))
486 return; 485 return;
487 486
488 agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 487 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
489 offsetof(struct xfs_agfl, agfl_crc)); 488 xfs_buf_ioerror(bp, EFSBADCRC);
490 489 else if (!xfs_agfl_verify(bp))
491 agfl_ok = agfl_ok && xfs_agfl_verify(bp);
492
493 if (!agfl_ok) {
494 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
495 xfs_buf_ioerror(bp, EFSCORRUPTED); 490 xfs_buf_ioerror(bp, EFSCORRUPTED);
496 } 491
492 if (bp->b_error)
493 xfs_verifier_error(bp);
497} 494}
498 495
499static void 496static void
@@ -508,16 +505,15 @@ xfs_agfl_write_verify(
508 return; 505 return;
509 506
510 if (!xfs_agfl_verify(bp)) { 507 if (!xfs_agfl_verify(bp)) {
511 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
512 xfs_buf_ioerror(bp, EFSCORRUPTED); 508 xfs_buf_ioerror(bp, EFSCORRUPTED);
509 xfs_verifier_error(bp);
513 return; 510 return;
514 } 511 }
515 512
516 if (bip) 513 if (bip)
517 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); 514 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
518 515
519 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 516 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
520 offsetof(struct xfs_agfl, agfl_crc));
521} 517}
522 518
523const struct xfs_buf_ops xfs_agfl_buf_ops = { 519const struct xfs_buf_ops xfs_agfl_buf_ops = {
@@ -2238,19 +2234,17 @@ xfs_agf_read_verify(
2238 struct xfs_buf *bp) 2234 struct xfs_buf *bp)
2239{ 2235{
2240 struct xfs_mount *mp = bp->b_target->bt_mount; 2236 struct xfs_mount *mp = bp->b_target->bt_mount;
2241 int agf_ok = 1;
2242
2243 if (xfs_sb_version_hascrc(&mp->m_sb))
2244 agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
2245 offsetof(struct xfs_agf, agf_crc));
2246 2237
2247 agf_ok = agf_ok && xfs_agf_verify(mp, bp); 2238 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2248 2239 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2249 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, 2240 xfs_buf_ioerror(bp, EFSBADCRC);
2250 XFS_RANDOM_ALLOC_READ_AGF))) { 2241 else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
2251 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 2242 XFS_ERRTAG_ALLOC_READ_AGF,
2243 XFS_RANDOM_ALLOC_READ_AGF))
2252 xfs_buf_ioerror(bp, EFSCORRUPTED); 2244 xfs_buf_ioerror(bp, EFSCORRUPTED);
2253 } 2245
2246 if (bp->b_error)
2247 xfs_verifier_error(bp);
2254} 2248}
2255 2249
2256static void 2250static void
@@ -2261,8 +2255,8 @@ xfs_agf_write_verify(
2261 struct xfs_buf_log_item *bip = bp->b_fspriv; 2255 struct xfs_buf_log_item *bip = bp->b_fspriv;
2262 2256
2263 if (!xfs_agf_verify(mp, bp)) { 2257 if (!xfs_agf_verify(mp, bp)) {
2264 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
2265 xfs_buf_ioerror(bp, EFSCORRUPTED); 2258 xfs_buf_ioerror(bp, EFSCORRUPTED);
2259 xfs_verifier_error(bp);
2266 return; 2260 return;
2267 } 2261 }
2268 2262
@@ -2272,8 +2266,7 @@ xfs_agf_write_verify(
2272 if (bip) 2266 if (bip)
2273 XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); 2267 XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2274 2268
2275 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 2269 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
2276 offsetof(struct xfs_agf, agf_crc));
2277} 2270}
2278 2271
2279const struct xfs_buf_ops xfs_agf_buf_ops = { 2272const struct xfs_buf_ops xfs_agf_buf_ops = {
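Note the shape every read verifier in this series converges on: a checksum check that latches EFSBADCRC, then a structural check that latches EFSCORRUPTED, with xfs_verifier_error() reporting whatever ended up in bp->b_error. A userspace model of that control flow; all names below are illustrative stand-ins, not kernel APIs:

#include <stdbool.h>
#include <stdio.h>

#define EFSBADCRC	74	/* stand-in values for the XFS error codes */
#define EFSCORRUPTED	117

struct buf { int b_error; };

/* Stand-ins for xfs_buf_verify_cksum() and the per-type verifier. */
static bool crc_ok(struct buf *bp)       { (void)bp; return true; }
static bool structure_ok(struct buf *bp) { (void)bp; return false; }

static void read_verify(struct buf *bp)
{
	if (!crc_ok(bp))
		bp->b_error = EFSBADCRC;	/* media/IO damage */
	else if (!structure_ok(bp))
		bp->b_error = EFSCORRUPTED;	/* format damage */

	if (bp->b_error)	/* plays the role of xfs_verifier_error() */
		printf("verifier error %d\n", bp->b_error);
}

int main(void)
{
	struct buf bp = { 0 };

	read_verify(&bp);
	return 0;
}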
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 13085429e523..cc1eadcbb049 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -355,12 +355,14 @@ static void
355xfs_allocbt_read_verify( 355xfs_allocbt_read_verify(
356 struct xfs_buf *bp) 356 struct xfs_buf *bp)
357{ 357{
358 if (!(xfs_btree_sblock_verify_crc(bp) && 358 if (!xfs_btree_sblock_verify_crc(bp))
359 xfs_allocbt_verify(bp))) { 359 xfs_buf_ioerror(bp, EFSBADCRC);
360 trace_xfs_btree_corrupt(bp, _RET_IP_); 360 else if (!xfs_allocbt_verify(bp))
361 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
362 bp->b_target->bt_mount, bp->b_addr);
363 xfs_buf_ioerror(bp, EFSCORRUPTED); 361 xfs_buf_ioerror(bp, EFSCORRUPTED);
362
363 if (bp->b_error) {
364 trace_xfs_btree_corrupt(bp, _RET_IP_);
365 xfs_verifier_error(bp);
364 } 366 }
365} 367}
366 368
@@ -370,9 +372,9 @@ xfs_allocbt_write_verify(
370{ 372{
371 if (!xfs_allocbt_verify(bp)) { 373 if (!xfs_allocbt_verify(bp)) {
372 trace_xfs_btree_corrupt(bp, _RET_IP_); 374 trace_xfs_btree_corrupt(bp, _RET_IP_);
373 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
374 bp->b_target->bt_mount, bp->b_addr);
375 xfs_buf_ioerror(bp, EFSCORRUPTED); 375 xfs_buf_ioerror(bp, EFSCORRUPTED);
376 xfs_verifier_error(bp);
377 return;
376 } 378 }
377 xfs_btree_sblock_calc_crc(bp); 379 xfs_btree_sblock_calc_crc(bp);
378 380
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index db2cfb067d0b..75df77d09f75 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -632,38 +632,46 @@ xfs_map_at_offset(
632} 632}
633 633
634/* 634/*
635 * Test if a given page is suitable for writing as part of an unwritten 635 * Test if a given page contains at least one buffer of a given @type.
636 * or delayed allocate extent. 636 * If @check_all_buffers is true, then we walk all the buffers in the page to
637 * try to find one of the type passed in. If it is not set, then the caller only
638 * needs to check the first buffer on the page for a match.
637 */ 639 */
638STATIC int 640STATIC bool
639xfs_check_page_type( 641xfs_check_page_type(
640 struct page *page, 642 struct page *page,
641 unsigned int type) 643 unsigned int type,
644 bool check_all_buffers)
642{ 645{
643 if (PageWriteback(page)) 646 struct buffer_head *bh;
644 return 0; 647 struct buffer_head *head;
645 648
646 if (page->mapping && page_has_buffers(page)) { 649 if (PageWriteback(page))
647 struct buffer_head *bh, *head; 650 return false;
648 int acceptable = 0; 651 if (!page->mapping)
652 return false;
653 if (!page_has_buffers(page))
654 return false;
649 655
650 bh = head = page_buffers(page); 656 bh = head = page_buffers(page);
651 do { 657 do {
652 if (buffer_unwritten(bh)) 658 if (buffer_unwritten(bh)) {
653 acceptable += (type == XFS_IO_UNWRITTEN); 659 if (type == XFS_IO_UNWRITTEN)
654 else if (buffer_delay(bh)) 660 return true;
655 acceptable += (type == XFS_IO_DELALLOC); 661 } else if (buffer_delay(bh)) {
656 else if (buffer_dirty(bh) && buffer_mapped(bh)) 662 if (type == XFS_IO_DELALLOC)
657 acceptable += (type == XFS_IO_OVERWRITE); 663 return true;
658 else 664 } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
659 break; 665 if (type == XFS_IO_OVERWRITE)
660 } while ((bh = bh->b_this_page) != head); 666 return true;
667 }
661 668
662 if (acceptable) 669 /* If we are only checking the first buffer, we are done now. */
663 return 1; 670 if (!check_all_buffers)
664 } 671 break;
672 } while ((bh = bh->b_this_page) != head);
665 673
666 return 0; 674 return false;
667} 675}
668 676
669/* 677/*
@@ -697,7 +705,7 @@ xfs_convert_page(
697 goto fail_unlock_page; 705 goto fail_unlock_page;
698 if (page->mapping != inode->i_mapping) 706 if (page->mapping != inode->i_mapping)
699 goto fail_unlock_page; 707 goto fail_unlock_page;
700 if (!xfs_check_page_type(page, (*ioendp)->io_type)) 708 if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
701 goto fail_unlock_page; 709 goto fail_unlock_page;
702 710
703 /* 711 /*
@@ -742,6 +750,15 @@ xfs_convert_page(
742 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; 750 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
743 page_dirty = p_offset / len; 751 page_dirty = p_offset / len;
744 752
753 /*
754 * The moment we find a buffer that doesn't match our current type
755 * specification or can't be written, abort the loop and start
756 * writeback. As per the above xfs_imap_valid() check, only
757 * xfs_vm_writepage() can handle partial page writeback fully - we are
758 * limited here to the buffers that are contiguous with the current
759 * ioend, and hence a buffer we can't write breaks that contiguity and
760 * we have to defer the rest of the IO to xfs_vm_writepage().
761 */
745 bh = head = page_buffers(page); 762 bh = head = page_buffers(page);
746 do { 763 do {
747 if (offset >= end_offset) 764 if (offset >= end_offset)
@@ -750,7 +767,7 @@ xfs_convert_page(
750 uptodate = 0; 767 uptodate = 0;
751 if (!(PageUptodate(page) || buffer_uptodate(bh))) { 768 if (!(PageUptodate(page) || buffer_uptodate(bh))) {
752 done = 1; 769 done = 1;
753 continue; 770 break;
754 } 771 }
755 772
756 if (buffer_unwritten(bh) || buffer_delay(bh) || 773 if (buffer_unwritten(bh) || buffer_delay(bh) ||
@@ -762,10 +779,11 @@ xfs_convert_page(
762 else 779 else
763 type = XFS_IO_OVERWRITE; 780 type = XFS_IO_OVERWRITE;
764 781
765 if (!xfs_imap_valid(inode, imap, offset)) { 782 /*
766 done = 1; 783 * imap should always be valid because of the above
767 continue; 784 * partial page end_offset check on the imap.
768 } 785 */
786 ASSERT(xfs_imap_valid(inode, imap, offset));
769 787
770 lock_buffer(bh); 788 lock_buffer(bh);
771 if (type != XFS_IO_OVERWRITE) 789 if (type != XFS_IO_OVERWRITE)
@@ -777,6 +795,7 @@ xfs_convert_page(
777 count++; 795 count++;
778 } else { 796 } else {
779 done = 1; 797 done = 1;
798 break;
780 } 799 }
781 } while (offset += len, (bh = bh->b_this_page) != head); 800 } while (offset += len, (bh = bh->b_this_page) != head);
782 801
@@ -868,7 +887,7 @@ xfs_aops_discard_page(
868 struct buffer_head *bh, *head; 887 struct buffer_head *bh, *head;
869 loff_t offset = page_offset(page); 888 loff_t offset = page_offset(page);
870 889
871 if (!xfs_check_page_type(page, XFS_IO_DELALLOC)) 890 if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
872 goto out_invalidate; 891 goto out_invalidate;
873 892
874 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 893 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1441,7 +1460,8 @@ xfs_vm_direct_IO(
1441 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1460 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1442 offset, nr_segs, 1461 offset, nr_segs,
1443 xfs_get_blocks_direct, 1462 xfs_get_blocks_direct,
1444 xfs_end_io_direct_write, NULL, 0); 1463 xfs_end_io_direct_write, NULL,
1464 DIO_ASYNC_EXTEND);
1445 if (ret != -EIOCBQUEUED && iocb->private) 1465 if (ret != -EIOCBQUEUED && iocb->private)
1446 goto out_destroy_ioend; 1466 goto out_destroy_ioend;
1447 } else { 1467 } else {
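xfs_check_page_type() above walks the circular list of buffer_heads attached to a page and returns as soon as it finds a buffer of the requested I/O type; the new check_all_buffers flag controls whether the walk continues past the first buffer. A userspace model of that circular walk, with illustrative types:

#include <stdbool.h>
#include <stdio.h>

enum io_type { IO_DELALLOC, IO_UNWRITTEN, IO_OVERWRITE };

struct bh {
	enum io_type	type;
	struct bh	*b_this_page;	/* circular, like buffer_heads */
};

static bool page_has_type(struct bh *head, enum io_type type,
			  bool check_all_buffers)
{
	struct bh *bh = head;

	do {
		if (bh->type == type)
			return true;
		if (!check_all_buffers)
			break;	/* only the first buffer matters */
	} while ((bh = bh->b_this_page) != head);

	return false;
}

int main(void)
{
	struct bh a = { IO_OVERWRITE, NULL }, b = { IO_DELALLOC, NULL };

	a.b_this_page = &b;
	b.b_this_page = &a;	/* two-buffer circular list */

	printf("%d %d\n", page_has_type(&a, IO_DELALLOC, true),
			  page_has_type(&a, IO_DELALLOC, false));
	return 0;
}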
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 7b126f46a2f9..fe9587fab17a 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -213,8 +213,8 @@ xfs_attr3_leaf_write_verify(
213 struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; 213 struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
214 214
215 if (!xfs_attr3_leaf_verify(bp)) { 215 if (!xfs_attr3_leaf_verify(bp)) {
216 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
217 xfs_buf_ioerror(bp, EFSCORRUPTED); 216 xfs_buf_ioerror(bp, EFSCORRUPTED);
217 xfs_verifier_error(bp);
218 return; 218 return;
219 } 219 }
220 220
@@ -224,7 +224,7 @@ xfs_attr3_leaf_write_verify(
224 if (bip) 224 if (bip)
225 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); 225 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
226 226
227 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF); 227 xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
228} 228}
229 229
230/* 230/*
@@ -239,13 +239,14 @@ xfs_attr3_leaf_read_verify(
239{ 239{
240 struct xfs_mount *mp = bp->b_target->bt_mount; 240 struct xfs_mount *mp = bp->b_target->bt_mount;
241 241
242 if ((xfs_sb_version_hascrc(&mp->m_sb) && 242 if (xfs_sb_version_hascrc(&mp->m_sb) &&
243 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 243 !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
244 XFS_ATTR3_LEAF_CRC_OFF)) || 244 xfs_buf_ioerror(bp, EFSBADCRC);
245 !xfs_attr3_leaf_verify(bp)) { 245 else if (!xfs_attr3_leaf_verify(bp))
246 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
247 xfs_buf_ioerror(bp, EFSCORRUPTED); 246 xfs_buf_ioerror(bp, EFSCORRUPTED);
248 } 247
248 if (bp->b_error)
249 xfs_verifier_error(bp);
249} 250}
250 251
251const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { 252const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 5549d69ddb45..6e37823e2932 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -125,7 +125,6 @@ xfs_attr3_rmt_read_verify(
125 struct xfs_mount *mp = bp->b_target->bt_mount; 125 struct xfs_mount *mp = bp->b_target->bt_mount;
126 char *ptr; 126 char *ptr;
127 int len; 127 int len;
128 bool corrupt = false;
129 xfs_daddr_t bno; 128 xfs_daddr_t bno;
130 129
131 /* no verification of non-crc buffers */ 130 /* no verification of non-crc buffers */
@@ -140,11 +139,11 @@ xfs_attr3_rmt_read_verify(
140 while (len > 0) { 139 while (len > 0) {
141 if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), 140 if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp),
142 XFS_ATTR3_RMT_CRC_OFF)) { 141 XFS_ATTR3_RMT_CRC_OFF)) {
143 corrupt = true; 142 xfs_buf_ioerror(bp, EFSBADCRC);
144 break; 143 break;
145 } 144 }
146 if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { 145 if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
147 corrupt = true; 146 xfs_buf_ioerror(bp, EFSCORRUPTED);
148 break; 147 break;
149 } 148 }
150 len -= XFS_LBSIZE(mp); 149 len -= XFS_LBSIZE(mp);
@@ -152,10 +151,9 @@ xfs_attr3_rmt_read_verify(
152 bno += mp->m_bsize; 151 bno += mp->m_bsize;
153 } 152 }
154 153
155 if (corrupt) { 154 if (bp->b_error)
156 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 155 xfs_verifier_error(bp);
157 xfs_buf_ioerror(bp, EFSCORRUPTED); 156 else
158 } else
159 ASSERT(len == 0); 157 ASSERT(len == 0);
160} 158}
161 159
@@ -180,9 +178,8 @@ xfs_attr3_rmt_write_verify(
180 178
181 while (len > 0) { 179 while (len > 0) {
182 if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { 180 if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
183 XFS_CORRUPTION_ERROR(__func__,
184 XFS_ERRLEVEL_LOW, mp, bp->b_addr);
185 xfs_buf_ioerror(bp, EFSCORRUPTED); 181 xfs_buf_ioerror(bp, EFSCORRUPTED);
182 xfs_verifier_error(bp);
186 return; 183 return;
187 } 184 }
188 if (bip) { 185 if (bip) {
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 152543c4ca70..5b6092ef51ef 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5378,3 +5378,196 @@ error0:
5378 } 5378 }
5379 return error; 5379 return error;
5380} 5380}
5381
5382/*
5383 * Shift extent records to the left to cover a hole.
5384 *
5385 * The maximum number of extents to be shifted in a single operation
5386 * is @num_exts, and @current_ext keeps track of the current extent
5387 * index we have shifted. @offset_shift_fsb is the length by which each
5388 * extent is shifted. If there is no hole to shift the extents
5389 * into, this is considered an invalid operation and we abort immediately.
5390 */
5391int
5392xfs_bmap_shift_extents(
5393 struct xfs_trans *tp,
5394 struct xfs_inode *ip,
5395 int *done,
5396 xfs_fileoff_t start_fsb,
5397 xfs_fileoff_t offset_shift_fsb,
5398 xfs_extnum_t *current_ext,
5399 xfs_fsblock_t *firstblock,
5400 struct xfs_bmap_free *flist,
5401 int num_exts)
5402{
5403 struct xfs_btree_cur *cur;
5404 struct xfs_bmbt_rec_host *gotp;
5405 struct xfs_bmbt_irec got;
5406 struct xfs_bmbt_irec left;
5407 struct xfs_mount *mp = ip->i_mount;
5408 struct xfs_ifork *ifp;
5409 xfs_extnum_t nexts = 0;
5410 xfs_fileoff_t startoff;
5411 int error = 0;
5412 int i;
5413 int whichfork = XFS_DATA_FORK;
5414 int logflags;
5415 xfs_filblks_t blockcount = 0;
5416
5417 if (unlikely(XFS_TEST_ERROR(
5418 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
5419 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
5420 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
5421 XFS_ERROR_REPORT("xfs_bmap_shift_extents",
5422 XFS_ERRLEVEL_LOW, mp);
5423 return XFS_ERROR(EFSCORRUPTED);
5424 }
5425
5426 if (XFS_FORCED_SHUTDOWN(mp))
5427 return XFS_ERROR(EIO);
5428
5429 ASSERT(current_ext != NULL);
5430
5431 ifp = XFS_IFORK_PTR(ip, whichfork);
5432
5433 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
5434 /* Read in all the extents */
5435 error = xfs_iread_extents(tp, ip, whichfork);
5436 if (error)
5437 return error;
5438 }
5439
5440 /*
5441 * If *current_ext is 0, we need to look up the extent from
5442 * which to start shifting, and store it in gotp.
5443 */
5444 if (!*current_ext) {
5445 gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
5446 /*
5447 * gotp can be null in 2 cases: 1) if there are no extents
5448 * or 2) start_fsb lies in a hole beyond which there are
5449 * no extents. Either way, we are done.
5450 */
5451 if (!gotp) {
5452 *done = 1;
5453 return 0;
5454 }
5455 }
5456
5457 /* We are going to change core inode */
5458 logflags = XFS_ILOG_CORE;
5459
5460 if (ifp->if_flags & XFS_IFBROOT) {
5461 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
5462 cur->bc_private.b.firstblock = *firstblock;
5463 cur->bc_private.b.flist = flist;
5464 cur->bc_private.b.flags = 0;
5465 } else {
5466 cur = NULL;
5467 logflags |= XFS_ILOG_DEXT;
5468 }
5469
5470 while (nexts++ < num_exts &&
5471 *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) {
5472
5473 gotp = xfs_iext_get_ext(ifp, *current_ext);
5474 xfs_bmbt_get_all(gotp, &got);
5475 startoff = got.br_startoff - offset_shift_fsb;
5476
5477 /*
5478 * Before shifting an extent into the hole, make sure that the
5479 * hole is large enough to accommodate the shift.
5480 */
5481 if (*current_ext) {
5482 xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
5483 *current_ext - 1), &left);
5484
5485 if (startoff < left.br_startoff + left.br_blockcount)
5486 error = XFS_ERROR(EINVAL);
5487 } else if (offset_shift_fsb > got.br_startoff) {
5488 /*
5489 * When the first extent is shifted, offset_shift_fsb
5490 * must not exceed the starting offset of
5491 * the first extent.
5492 */
5493 error = XFS_ERROR(EINVAL);
5494 }
5495
5496 if (error)
5497 goto del_cursor;
5498
5499 if (cur) {
5500 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5501 got.br_startblock,
5502 got.br_blockcount,
5503 &i);
5504 if (error)
5505 goto del_cursor;
5506 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5507 }
5508
5509 /* Check if we can merge 2 adjacent extents */
5510 if (*current_ext &&
5511 left.br_startoff + left.br_blockcount == startoff &&
5512 left.br_startblock + left.br_blockcount ==
5513 got.br_startblock &&
5514 left.br_state == got.br_state &&
5515 left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
5516 blockcount = left.br_blockcount +
5517 got.br_blockcount;
5518 xfs_iext_remove(ip, *current_ext, 1, 0);
5519 if (cur) {
5520 error = xfs_btree_delete(cur, &i);
5521 if (error)
5522 goto del_cursor;
5523 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5524 }
5525 XFS_IFORK_NEXT_SET(ip, whichfork,
5526 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
5527 gotp = xfs_iext_get_ext(ifp, --*current_ext);
5528 xfs_bmbt_get_all(gotp, &got);
5529
5530 /* Make cursor point to the extent we will update */
5531 if (cur) {
5532 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5533 got.br_startblock,
5534 got.br_blockcount,
5535 &i);
5536 if (error)
5537 goto del_cursor;
5538 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5539 }
5540
5541 xfs_bmbt_set_blockcount(gotp, blockcount);
5542 got.br_blockcount = blockcount;
5543 } else {
5544 /* We have to update the startoff */
5545 xfs_bmbt_set_startoff(gotp, startoff);
5546 got.br_startoff = startoff;
5547 }
5548
5549 if (cur) {
5550 error = xfs_bmbt_update(cur, got.br_startoff,
5551 got.br_startblock,
5552 got.br_blockcount,
5553 got.br_state);
5554 if (error)
5555 goto del_cursor;
5556 }
5557
5558 (*current_ext)++;
5559 }
5560
5561 /* Check if we are done */
5562 if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork))
5563 *done = 1;
5564
5565del_cursor:
5566 if (cur)
5567 xfs_btree_del_cursor(cur,
5568 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5569
5570 xfs_trans_log_inode(tp, ip, logflags);
5571
5572 return error;
5573}
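The heart of xfs_bmap_shift_extents() above is two decisions: whether the hole to the left of an extent is large enough to absorb the shift, and whether the shifted extent becomes logically and physically contiguous with its left neighbour and can be merged. A simplified userspace model of both checks, using hypothetical extent records:

#include <stdbool.h>
#include <stdio.h>

struct ext {
	unsigned long	startoff;	/* logical start */
	unsigned long	startblock;	/* physical start */
	unsigned long	blockcount;
};

/* Can 'got' be shifted left by 'shift' without overlapping 'left'? */
static bool shift_fits(const struct ext *left, const struct ext *got,
		       unsigned long shift)
{
	return got->startoff - shift >= left->startoff + left->blockcount;
}

/* After the shift, does 'got' merge with 'left'? */
static bool can_merge(const struct ext *left, const struct ext *got,
		      unsigned long shift)
{
	return left->startoff + left->blockcount == got->startoff - shift &&
	       left->startblock + left->blockcount == got->startblock;
}

int main(void)
{
	struct ext left = { 0, 100, 10 };	/* logical blocks 0-9   */
	struct ext got  = { 30, 110, 5 };	/* logical blocks 30-34 */

	printf("fits: %d, merges: %d\n",
	       shift_fits(&left, &got, 20), can_merge(&left, &got, 20));
	return 0;
}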
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 33b41f351225..f84bd7af43be 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -127,6 +127,16 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
127 { BMAP_RIGHT_FILLING, "RF" }, \ 127 { BMAP_RIGHT_FILLING, "RF" }, \
128 { BMAP_ATTRFORK, "ATTR" } 128 { BMAP_ATTRFORK, "ATTR" }
129 129
130
131/*
132 * This macro determines how many extents will be shifted in one
133 * write transaction. A single shift may require two btree updates:
134 * an extent move on the first and an extent merge on the second,
135 * so it is appropriate to shift only one extent per write
136 * transaction at a time.
137 */
138#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
139
130#ifdef DEBUG 140#ifdef DEBUG
131void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, 141void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
132 int whichfork, unsigned long caller_ip); 142 int whichfork, unsigned long caller_ip);
@@ -169,5 +179,10 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
169int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, 179int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
170 xfs_extnum_t num); 180 xfs_extnum_t num);
171uint xfs_default_attroffset(struct xfs_inode *ip); 181uint xfs_default_attroffset(struct xfs_inode *ip);
182int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
183 int *done, xfs_fileoff_t start_fsb,
184 xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
185 xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
186 int num_exts);
172 187
173#endif /* __XFS_BMAP_H__ */ 188#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 706bc3f777cb..818d546664e7 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -780,12 +780,14 @@ static void
780xfs_bmbt_read_verify( 780xfs_bmbt_read_verify(
781 struct xfs_buf *bp) 781 struct xfs_buf *bp)
782{ 782{
783 if (!(xfs_btree_lblock_verify_crc(bp) && 783 if (!xfs_btree_lblock_verify_crc(bp))
784 xfs_bmbt_verify(bp))) { 784 xfs_buf_ioerror(bp, EFSBADCRC);
785 trace_xfs_btree_corrupt(bp, _RET_IP_); 785 else if (!xfs_bmbt_verify(bp))
786 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
787 bp->b_target->bt_mount, bp->b_addr);
788 xfs_buf_ioerror(bp, EFSCORRUPTED); 786 xfs_buf_ioerror(bp, EFSCORRUPTED);
787
788 if (bp->b_error) {
789 trace_xfs_btree_corrupt(bp, _RET_IP_);
790 xfs_verifier_error(bp);
789 } 791 }
790} 792}
791 793
@@ -794,11 +796,9 @@ xfs_bmbt_write_verify(
794 struct xfs_buf *bp) 796 struct xfs_buf *bp)
795{ 797{
796 if (!xfs_bmbt_verify(bp)) { 798 if (!xfs_bmbt_verify(bp)) {
797 xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn);
798 trace_xfs_btree_corrupt(bp, _RET_IP_); 799 trace_xfs_btree_corrupt(bp, _RET_IP_);
799 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
800 bp->b_target->bt_mount, bp->b_addr);
801 xfs_buf_ioerror(bp, EFSCORRUPTED); 800 xfs_buf_ioerror(bp, EFSCORRUPTED);
801 xfs_verifier_error(bp);
802 return; 802 return;
803 } 803 }
804 xfs_btree_lblock_calc_crc(bp); 804 xfs_btree_lblock_calc_crc(bp);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index f264616080ca..01f6a646caa1 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1349,7 +1349,6 @@ xfs_free_file_space(
1349 * the freeing of the space succeeds at ENOSPC. 1349 * the freeing of the space succeeds at ENOSPC.
1350 */ 1350 */
1351 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 1351 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1352 tp->t_flags |= XFS_TRANS_RESERVE;
1353 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); 1352 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
1354 1353
1355 /* 1354 /*
@@ -1468,6 +1467,102 @@ out:
1468} 1467}
1469 1468
1470/* 1469/*
1470 * xfs_collapse_file_space()
1471 * This routine frees disk space and shifts extents for the given file.
1472 * It first frees the data blocks in the specified range by calling
1473 * xfs_free_file_space(), which also syncs dirty data and invalidates
1474 * the page cache over the region being collapsed. It then shifts
1475 * extent records to the left to cover the resulting hole.
1476 * RETURNS:
1477 * 0 on success
1478 * errno on error
1479 *
1480 */
1481int
1482xfs_collapse_file_space(
1483 struct xfs_inode *ip,
1484 xfs_off_t offset,
1485 xfs_off_t len)
1486{
1487 int done = 0;
1488 struct xfs_mount *mp = ip->i_mount;
1489 struct xfs_trans *tp;
1490 int error;
1491 xfs_extnum_t current_ext = 0;
1492 struct xfs_bmap_free free_list;
1493 xfs_fsblock_t first_block;
1494 int committed;
1495 xfs_fileoff_t start_fsb;
1496 xfs_fileoff_t shift_fsb;
1497
1498 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1499
1500 trace_xfs_collapse_file_space(ip);
1501
1502 start_fsb = XFS_B_TO_FSB(mp, offset + len);
1503 shift_fsb = XFS_B_TO_FSB(mp, len);
1504
1505 error = xfs_free_file_space(ip, offset, len);
1506 if (error)
1507 return error;
1508
1509 while (!error && !done) {
1510 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1511 tp->t_flags |= XFS_TRANS_RESERVE;
1512 /*
1513 * We need to reserve a permanent block for the transaction.
1514 * This comes into play when, after shifting an extent into the
1515 * hole, we find that adjacent extents can be merged, which may
1516 * free a block during the record update.
1517 */
1518 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1519 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
1520 if (error) {
1521 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1522 xfs_trans_cancel(tp, 0);
1523 break;
1524 }
1525
1526 xfs_ilock(ip, XFS_ILOCK_EXCL);
1527 error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
1528 ip->i_gdquot, ip->i_pdquot,
1529 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
1530 XFS_QMOPT_RES_REGBLKS);
1531 if (error)
1532 goto out;
1533
1534 xfs_trans_ijoin(tp, ip, 0);
1535
1536 xfs_bmap_init(&free_list, &first_block);
1537
1538 /*
1539 * We are using the write transaction, in which at most two
1540 * bmbt updates are allowed.
1541 */
1542 error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
1543 shift_fsb, &current_ext,
1544 &first_block, &free_list,
1545 XFS_BMAP_MAX_SHIFT_EXTENTS);
1546 if (error)
1547 goto out;
1548
1549 error = xfs_bmap_finish(&tp, &free_list, &committed);
1550 if (error)
1551 goto out;
1552
1553 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1554 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1555 }
1556
1557 return error;
1558
1559out:
1560 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1561 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1562 return error;
1563}
1564
1565/*
1471 * We need to check that the format of the data fork in the temporary inode is 1566 * We need to check that the format of the data fork in the temporary inode is
1472 * valid for the target inode before doing the swap. This is not a problem with 1567 * valid for the target inode before doing the swap. This is not a problem with
1473 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized 1568 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
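xfs_collapse_file_space() is the filesystem half of the collapse-range operation; userspace reaches it through fallocate(2) with FALLOC_FL_COLLAPSE_RANGE. A minimal usage sketch, assuming a kernel that supports the flag and remembering that offset and length must be multiples of the filesystem block size:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#ifndef FALLOC_FL_COLLAPSE_RANGE
#define FALLOC_FL_COLLAPSE_RANGE	0x08
#endif

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDWR);
	if (fd < 0)
		return 1;

	/* Remove bytes [4096, 8192) and shift the rest of the file down. */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 4096) < 0)
		perror("fallocate");

	close(fd);
	return 0;
}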
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 900747b25772..935ed2b24edf 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -99,6 +99,8 @@ int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
99 xfs_off_t len); 99 xfs_off_t len);
100int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, 100int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
101 xfs_off_t len); 101 xfs_off_t len);
102int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
103 xfs_off_t len);
102 104
103/* EOF block manipulation functions */ 105/* EOF block manipulation functions */
104bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); 106bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 9adaae4f3e2f..e80d59fdf89a 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -234,8 +234,7 @@ xfs_btree_lblock_calc_crc(
234 return; 234 return;
235 if (bip) 235 if (bip)
236 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 236 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
237 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 237 xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
238 XFS_BTREE_LBLOCK_CRC_OFF);
239} 238}
240 239
241bool 240bool
@@ -243,8 +242,8 @@ xfs_btree_lblock_verify_crc(
243 struct xfs_buf *bp) 242 struct xfs_buf *bp)
244{ 243{
245 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) 244 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
246 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 245 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
247 XFS_BTREE_LBLOCK_CRC_OFF); 246
248 return true; 247 return true;
249} 248}
250 249
@@ -267,8 +266,7 @@ xfs_btree_sblock_calc_crc(
267 return; 266 return;
268 if (bip) 267 if (bip)
269 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 268 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
270 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 269 xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
271 XFS_BTREE_SBLOCK_CRC_OFF);
272} 270}
273 271
274bool 272bool
@@ -276,8 +274,8 @@ xfs_btree_sblock_verify_crc(
276 struct xfs_buf *bp) 274 struct xfs_buf *bp)
277{ 275{
278 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) 276 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
279 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 277 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
280 XFS_BTREE_SBLOCK_CRC_OFF); 278
281 return true; 279 return true;
282} 280}
283 281
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9c061ef2b0d9..107f2fdfe41f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -396,7 +396,17 @@ _xfs_buf_map_pages(
396 bp->b_addr = NULL; 396 bp->b_addr = NULL;
397 } else { 397 } else {
398 int retried = 0; 398 int retried = 0;
399 unsigned noio_flag;
399 400
401 /*
402 * vm_map_ram() will allocate auxiliary structures (e.g.
403 * pagetables) with GFP_KERNEL, yet we are likely to be under
404 * GFP_NOFS context here. Hence we need to tell memory reclaim
405 * that we are in such a context via PF_MEMALLOC_NOIO to prevent
406 * memory reclaim re-entering the filesystem here and
407 * potentially deadlocking.
408 */
409 noio_flag = memalloc_noio_save();
400 do { 410 do {
401 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 411 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
402 -1, PAGE_KERNEL); 412 -1, PAGE_KERNEL);
@@ -404,6 +414,7 @@ _xfs_buf_map_pages(
404 break; 414 break;
405 vm_unmap_aliases(); 415 vm_unmap_aliases();
406 } while (retried++ <= 1); 416 } while (retried++ <= 1);
417 memalloc_noio_restore(noio_flag);
407 418
408 if (!bp->b_addr) 419 if (!bp->b_addr)
409 return -ENOMEM; 420 return -ENOMEM;
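memalloc_noio_save()/memalloc_noio_restore() above latch a per-task flag so every allocation made underneath vm_map_ram() behaves as GFP_NOIO. A userspace model of the save/restore discipline, showing why returning and later restoring the old value nests correctly (the static flag stands in for current->flags):

#include <stdbool.h>
#include <stdio.h>

static bool task_noio;	/* stand-in for PF_MEMALLOC_NOIO in current->flags */

static bool noio_save(void)
{
	bool old = task_noio;

	task_noio = true;
	return old;
}

static void noio_restore(bool old)
{
	task_noio = old;
}

static void allocate(const char *who)
{
	/* A real allocator would strip __GFP_IO when task_noio is set. */
	printf("%s allocates with%s IO\n", who, task_noio ? "out" : "");
}

int main(void)
{
	bool outer = noio_save();	/* enter NOIO context */

	allocate("vm_map_ram");
	noio_restore(outer);		/* restores, even when nested */
	allocate("later caller");
	return 0;
}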
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 995339534db6..b8a3abf6cf47 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -369,6 +369,20 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
369 xfs_buf_rele(bp); 369 xfs_buf_rele(bp);
370} 370}
371 371
372static inline int
373xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
374{
375 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
376 cksum_offset);
377}
378
379static inline void
380xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
381{
382 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
383 cksum_offset);
384}
385
372/* 386/*
373 * Handling of buftargs. 387 * Handling of buftargs.
374 */ 388 */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 33149113e333..8752821443be 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -796,20 +796,6 @@ xfs_buf_item_init(
796 bip->bli_formats[i].blf_map_size = map_size; 796 bip->bli_formats[i].blf_map_size = map_size;
797 } 797 }
798 798
799#ifdef XFS_TRANS_DEBUG
800 /*
801 * Allocate the arrays for tracking what needs to be logged
802 * and what our callers request to be logged. bli_orig
803 * holds a copy of the original, clean buffer for comparison
804 * against, and bli_logged keeps a 1 bit flag per byte in
805 * the buffer to indicate which bytes the callers have asked
806 * to have logged.
807 */
808 bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP);
809 memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length));
810 bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP);
811#endif
812
813 /* 799 /*
814 * Put the buf item into the list of items attached to the 800 * Put the buf item into the list of items attached to the
815 * buffer at the front. 801 * buffer at the front.
@@ -957,11 +943,6 @@ STATIC void
957xfs_buf_item_free( 943xfs_buf_item_free(
958 xfs_buf_log_item_t *bip) 944 xfs_buf_log_item_t *bip)
959{ 945{
960#ifdef XFS_TRANS_DEBUG
961 kmem_free(bip->bli_orig);
962 kmem_free(bip->bli_logged);
963#endif /* XFS_TRANS_DEBUG */
964
965 xfs_buf_item_free_format(bip); 946 xfs_buf_item_free_format(bip);
966 kmem_zone_free(xfs_buf_item_zone, bip); 947 kmem_zone_free(xfs_buf_item_zone, bip);
967} 948}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 796272a2e129..6cc5f6785a77 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -185,8 +185,8 @@ xfs_da3_node_write_verify(
185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr; 185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
186 186
187 if (!xfs_da3_node_verify(bp)) { 187 if (!xfs_da3_node_verify(bp)) {
188 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
189 xfs_buf_ioerror(bp, EFSCORRUPTED); 188 xfs_buf_ioerror(bp, EFSCORRUPTED);
189 xfs_verifier_error(bp);
190 return; 190 return;
191 } 191 }
192 192
@@ -196,7 +196,7 @@ xfs_da3_node_write_verify(
196 if (bip) 196 if (bip)
197 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); 197 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
198 198
199 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF); 199 xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
200} 200}
201 201
202/* 202/*
@@ -209,18 +209,20 @@ static void
209xfs_da3_node_read_verify( 209xfs_da3_node_read_verify(
210 struct xfs_buf *bp) 210 struct xfs_buf *bp)
211{ 211{
212 struct xfs_mount *mp = bp->b_target->bt_mount;
213 struct xfs_da_blkinfo *info = bp->b_addr; 212 struct xfs_da_blkinfo *info = bp->b_addr;
214 213
215 switch (be16_to_cpu(info->magic)) { 214 switch (be16_to_cpu(info->magic)) {
216 case XFS_DA3_NODE_MAGIC: 215 case XFS_DA3_NODE_MAGIC:
217 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 216 if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
218 XFS_DA3_NODE_CRC_OFF)) 217 xfs_buf_ioerror(bp, EFSBADCRC);
219 break; 218 break;
219 }
220 /* fall through */ 220 /* fall through */
221 case XFS_DA_NODE_MAGIC: 221 case XFS_DA_NODE_MAGIC:
222 if (!xfs_da3_node_verify(bp)) 222 if (!xfs_da3_node_verify(bp)) {
223 xfs_buf_ioerror(bp, EFSCORRUPTED);
223 break; 224 break;
225 }
224 return; 226 return;
225 case XFS_ATTR_LEAF_MAGIC: 227 case XFS_ATTR_LEAF_MAGIC:
226 case XFS_ATTR3_LEAF_MAGIC: 228 case XFS_ATTR3_LEAF_MAGIC:
@@ -237,8 +239,7 @@ xfs_da3_node_read_verify(
237 } 239 }
238 240
239 /* corrupt block */ 241 /* corrupt block */
240 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 242 xfs_verifier_error(bp);
241 xfs_buf_ioerror(bp, EFSCORRUPTED);
242} 243}
243 244
244const struct xfs_buf_ops xfs_da3_node_buf_ops = { 245const struct xfs_buf_ops xfs_da3_node_buf_ops = {
@@ -1295,7 +1296,7 @@ xfs_da3_fixhashpath(
1295 node = blk->bp->b_addr; 1296 node = blk->bp->b_addr;
1296 dp->d_ops->node_hdr_from_disk(&nodehdr, node); 1297 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1297 btree = dp->d_ops->node_tree_p(node); 1298 btree = dp->d_ops->node_tree_p(node);
1298 if (be32_to_cpu(btree->hashval) == lasthash) 1299 if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
1299 break; 1300 break;
1300 blk->hashval = lasthash; 1301 blk->hashval = lasthash;
1301 btree[blk->index].hashval = cpu_to_be32(lasthash); 1302 btree[blk->index].hashval = cpu_to_be32(lasthash);
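The xfs_da3_fixhashpath() hunk above is a small but real bug fix: the loop must compare the hash of the entry at blk->index, not whatever entry the raw btree pointer addresses (element 0). A toy illustration with made-up hash values:

#include <stdio.h>

struct entry { unsigned int hashval; };

int main(void)
{
	struct entry btree[3] = { { 10 }, { 20 }, { 30 } };
	int index = 2;
	unsigned int lasthash = 30;

	/* Old, buggy comparison: always looks at element 0. */
	printf("btree->hashval == lasthash: %d\n",
	       btree->hashval == lasthash);

	/* Fixed comparison: looks at the entry actually on the path. */
	printf("btree[index].hashval == lasthash: %d\n",
	       btree[index].hashval == lasthash);
	return 0;
}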
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5869b50dc41..623bbe8fd921 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -89,6 +89,8 @@ typedef struct xfs_dinode {
89 /* structure must be padded to 64 bit alignment */ 89 /* structure must be padded to 64 bit alignment */
90} xfs_dinode_t; 90} xfs_dinode_t;
91 91
92#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
93
92#define DI_MAX_FLUSH 0xffff 94#define DI_MAX_FLUSH 0xffff
93 95
94/* 96/*
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index ce16ef02997a..fda46253966a 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -180,16 +180,23 @@ xfs_dir_init(
180 xfs_inode_t *dp, 180 xfs_inode_t *dp,
181 xfs_inode_t *pdp) 181 xfs_inode_t *pdp)
182{ 182{
183 xfs_da_args_t args; 183 struct xfs_da_args *args;
184 int error; 184 int error;
185 185
186 memset((char *)&args, 0, sizeof(args));
187 args.dp = dp;
188 args.trans = tp;
189 ASSERT(S_ISDIR(dp->i_d.di_mode)); 186 ASSERT(S_ISDIR(dp->i_d.di_mode));
190 if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) 187 error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
188 if (error)
191 return error; 189 return error;
192 return xfs_dir2_sf_create(&args, pdp->i_ino); 190
191 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
192 if (!args)
193 return ENOMEM;
194
195 args->dp = dp;
196 args->trans = tp;
197 error = xfs_dir2_sf_create(args, pdp->i_ino);
198 kmem_free(args);
199 return error;
193} 200}
194 201
195/* 202/*
@@ -205,41 +212,56 @@ xfs_dir_createname(
205 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 212 xfs_bmap_free_t *flist, /* bmap's freeblock list */
206 xfs_extlen_t total) /* bmap's total block count */ 213 xfs_extlen_t total) /* bmap's total block count */
207{ 214{
208 xfs_da_args_t args; 215 struct xfs_da_args *args;
209 int rval; 216 int rval;
210 int v; /* type-checking value */ 217 int v; /* type-checking value */
211 218
212 ASSERT(S_ISDIR(dp->i_d.di_mode)); 219 ASSERT(S_ISDIR(dp->i_d.di_mode));
213 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 220 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
221 if (rval)
214 return rval; 222 return rval;
215 XFS_STATS_INC(xs_dir_create); 223 XFS_STATS_INC(xs_dir_create);
216 224
217 memset(&args, 0, sizeof(xfs_da_args_t)); 225 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
218 args.name = name->name; 226 if (!args)
219 args.namelen = name->len; 227 return ENOMEM;
220 args.filetype = name->type; 228
221 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 229 args->name = name->name;
222 args.inumber = inum; 230 args->namelen = name->len;
223 args.dp = dp; 231 args->filetype = name->type;
224 args.firstblock = first; 232 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
225 args.flist = flist; 233 args->inumber = inum;
226 args.total = total; 234 args->dp = dp;
227 args.whichfork = XFS_DATA_FORK; 235 args->firstblock = first;
228 args.trans = tp; 236 args->flist = flist;
229 args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; 237 args->total = total;
230 238 args->whichfork = XFS_DATA_FORK;
231 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 239 args->trans = tp;
232 rval = xfs_dir2_sf_addname(&args); 240 args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
233 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 241
234 return rval; 242 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
235 else if (v) 243 rval = xfs_dir2_sf_addname(args);
236 rval = xfs_dir2_block_addname(&args); 244 goto out_free;
237 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 245 }
238 return rval; 246
239 else if (v) 247 rval = xfs_dir2_isblock(tp, dp, &v);
240 rval = xfs_dir2_leaf_addname(&args); 248 if (rval)
249 goto out_free;
250 if (v) {
251 rval = xfs_dir2_block_addname(args);
252 goto out_free;
253 }
254
255 rval = xfs_dir2_isleaf(tp, dp, &v);
256 if (rval)
257 goto out_free;
258 if (v)
259 rval = xfs_dir2_leaf_addname(args);
241 else 260 else
242 rval = xfs_dir2_node_addname(&args); 261 rval = xfs_dir2_node_addname(args);
262
263out_free:
264 kmem_free(args);
243 return rval; 265 return rval;
244} 266}
245 267
@@ -282,46 +304,66 @@ xfs_dir_lookup(
282 xfs_ino_t *inum, /* out: inode number */ 304 xfs_ino_t *inum, /* out: inode number */
283 struct xfs_name *ci_name) /* out: actual name if CI match */ 305 struct xfs_name *ci_name) /* out: actual name if CI match */
284{ 306{
285 xfs_da_args_t args; 307 struct xfs_da_args *args;
286 int rval; 308 int rval;
287 int v; /* type-checking value */ 309 int v; /* type-checking value */
288 310
289 ASSERT(S_ISDIR(dp->i_d.di_mode)); 311 ASSERT(S_ISDIR(dp->i_d.di_mode));
290 XFS_STATS_INC(xs_dir_lookup); 312 XFS_STATS_INC(xs_dir_lookup);
291 313
292 memset(&args, 0, sizeof(xfs_da_args_t)); 314 /*
293 args.name = name->name; 315 * We need to use KM_NOFS here so that lockdep will not throw false
294 args.namelen = name->len; 316 * positive deadlock warnings on a non-transactional lookup path. It is
295 args.filetype = name->type; 317 * safe to recurse into inode reclaim in that case, but lockdep can't
296 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 318 * easily be taught about it. Hence using KM_NOFS avoids having to
297 args.dp = dp; 319 * add a bunch of lockdep class annotations into the reclaim path
298 args.whichfork = XFS_DATA_FORK; 320 * for the ilock.
299 args.trans = tp; 321 */
300 args.op_flags = XFS_DA_OP_OKNOENT; 322 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
323 args->name = name->name;
324 args->namelen = name->len;
325 args->filetype = name->type;
326 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
327 args->dp = dp;
328 args->whichfork = XFS_DATA_FORK;
329 args->trans = tp;
330 args->op_flags = XFS_DA_OP_OKNOENT;
301 if (ci_name) 331 if (ci_name)
302 args.op_flags |= XFS_DA_OP_CILOOKUP; 332 args->op_flags |= XFS_DA_OP_CILOOKUP;
303 333
304 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 334 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
305 rval = xfs_dir2_sf_lookup(&args); 335 rval = xfs_dir2_sf_lookup(args);
306 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 336 goto out_check_rval;
307 return rval; 337 }
308 else if (v) 338
309 rval = xfs_dir2_block_lookup(&args); 339 rval = xfs_dir2_isblock(tp, dp, &v);
310 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 340 if (rval)
311 return rval; 341 goto out_free;
312 else if (v) 342 if (v) {
313 rval = xfs_dir2_leaf_lookup(&args); 343 rval = xfs_dir2_block_lookup(args);
344 goto out_check_rval;
345 }
346
347 rval = xfs_dir2_isleaf(tp, dp, &v);
348 if (rval)
349 goto out_free;
350 if (v)
351 rval = xfs_dir2_leaf_lookup(args);
314 else 352 else
315 rval = xfs_dir2_node_lookup(&args); 353 rval = xfs_dir2_node_lookup(args);
354
355out_check_rval:
316 if (rval == EEXIST) 356 if (rval == EEXIST)
317 rval = 0; 357 rval = 0;
318 if (!rval) { 358 if (!rval) {
319 *inum = args.inumber; 359 *inum = args->inumber;
320 if (ci_name) { 360 if (ci_name) {
321 ci_name->name = args.value; 361 ci_name->name = args->value;
322 ci_name->len = args.valuelen; 362 ci_name->len = args->valuelen;
323 } 363 }
324 } 364 }
365out_free:
366 kmem_free(args);
325 return rval; 367 return rval;
326} 368}
327 369
@@ -338,38 +380,51 @@ xfs_dir_removename(
338 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 380 xfs_bmap_free_t *flist, /* bmap's freeblock list */
339 xfs_extlen_t total) /* bmap's total block count */ 381 xfs_extlen_t total) /* bmap's total block count */
340{ 382{
341 xfs_da_args_t args; 383 struct xfs_da_args *args;
342 int rval; 384 int rval;
343 int v; /* type-checking value */ 385 int v; /* type-checking value */
344 386
345 ASSERT(S_ISDIR(dp->i_d.di_mode)); 387 ASSERT(S_ISDIR(dp->i_d.di_mode));
346 XFS_STATS_INC(xs_dir_remove); 388 XFS_STATS_INC(xs_dir_remove);
347 389
348 memset(&args, 0, sizeof(xfs_da_args_t)); 390 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
349 args.name = name->name; 391 if (!args)
350 args.namelen = name->len; 392 return ENOMEM;
351 args.filetype = name->type; 393
352 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 394 args->name = name->name;
353 args.inumber = ino; 395 args->namelen = name->len;
354 args.dp = dp; 396 args->filetype = name->type;
355 args.firstblock = first; 397 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
356 args.flist = flist; 398 args->inumber = ino;
357 args.total = total; 399 args->dp = dp;
358 args.whichfork = XFS_DATA_FORK; 400 args->firstblock = first;
359 args.trans = tp; 401 args->flist = flist;
360 402 args->total = total;
361 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 403 args->whichfork = XFS_DATA_FORK;
362 rval = xfs_dir2_sf_removename(&args); 404 args->trans = tp;
363 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 405
364 return rval; 406 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
365 else if (v) 407 rval = xfs_dir2_sf_removename(args);
366 rval = xfs_dir2_block_removename(&args); 408 goto out_free;
367 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 409 }
368 return rval; 410
369 else if (v) 411 rval = xfs_dir2_isblock(tp, dp, &v);
370 rval = xfs_dir2_leaf_removename(&args); 412 if (rval)
413 goto out_free;
414 if (v) {
415 rval = xfs_dir2_block_removename(args);
416 goto out_free;
417 }
418
419 rval = xfs_dir2_isleaf(tp, dp, &v);
420 if (rval)
421 goto out_free;
422 if (v)
423 rval = xfs_dir2_leaf_removename(args);
371 else 424 else
372 rval = xfs_dir2_node_removename(&args); 425 rval = xfs_dir2_node_removename(args);
426out_free:
427 kmem_free(args);
373 return rval; 428 return rval;
374} 429}
375 430
@@ -386,40 +441,54 @@ xfs_dir_replace(
386 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 441 xfs_bmap_free_t *flist, /* bmap's freeblock list */
387 xfs_extlen_t total) /* bmap's total block count */ 442 xfs_extlen_t total) /* bmap's total block count */
388{ 443{
389 xfs_da_args_t args; 444 struct xfs_da_args *args;
390 int rval; 445 int rval;
391 int v; /* type-checking value */ 446 int v; /* type-checking value */
392 447
393 ASSERT(S_ISDIR(dp->i_d.di_mode)); 448 ASSERT(S_ISDIR(dp->i_d.di_mode));
394 449
395 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 450 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
451 if (rval)
396 return rval; 452 return rval;
397 453
398 memset(&args, 0, sizeof(xfs_da_args_t)); 454 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
399 args.name = name->name; 455 if (!args)
400 args.namelen = name->len; 456 return ENOMEM;
401 args.filetype = name->type; 457
402 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 458 args->name = name->name;
403 args.inumber = inum; 459 args->namelen = name->len;
404 args.dp = dp; 460 args->filetype = name->type;
405 args.firstblock = first; 461 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
406 args.flist = flist; 462 args->inumber = inum;
407 args.total = total; 463 args->dp = dp;
408 args.whichfork = XFS_DATA_FORK; 464 args->firstblock = first;
409 args.trans = tp; 465 args->flist = flist;
410 466 args->total = total;
411 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 467 args->whichfork = XFS_DATA_FORK;
412 rval = xfs_dir2_sf_replace(&args); 468 args->trans = tp;
413 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 469
414 return rval; 470 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
415 else if (v) 471 rval = xfs_dir2_sf_replace(args);
416 rval = xfs_dir2_block_replace(&args); 472 goto out_free;
417 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 473 }
418 return rval; 474
419 else if (v) 475 rval = xfs_dir2_isblock(tp, dp, &v);
420 rval = xfs_dir2_leaf_replace(&args); 476 if (rval)
477 goto out_free;
478 if (v) {
479 rval = xfs_dir2_block_replace(args);
480 goto out_free;
481 }
482
483 rval = xfs_dir2_isleaf(tp, dp, &v);
484 if (rval)
485 goto out_free;
486 if (v)
487 rval = xfs_dir2_leaf_replace(args);
421 else 488 else
422 rval = xfs_dir2_node_replace(&args); 489 rval = xfs_dir2_node_replace(args);
490out_free:
491 kmem_free(args);
423 return rval; 492 return rval;
424} 493}
425 494
@@ -434,7 +503,7 @@ xfs_dir_canenter(
434 struct xfs_name *name, /* name of entry to add */ 503 struct xfs_name *name, /* name of entry to add */
435 uint resblks) 504 uint resblks)
436{ 505{
437 xfs_da_args_t args; 506 struct xfs_da_args *args;
438 int rval; 507 int rval;
439 int v; /* type-checking value */ 508 int v; /* type-checking value */
440 509
@@ -443,29 +512,42 @@ xfs_dir_canenter(
443 512
444 ASSERT(S_ISDIR(dp->i_d.di_mode)); 513 ASSERT(S_ISDIR(dp->i_d.di_mode));
445 514
446 memset(&args, 0, sizeof(xfs_da_args_t)); 515 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
447 args.name = name->name; 516 if (!args)
448 args.namelen = name->len; 517 return ENOMEM;
449 args.filetype = name->type; 518
450 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 519 args->name = name->name;
451 args.dp = dp; 520 args->namelen = name->len;
452 args.whichfork = XFS_DATA_FORK; 521 args->filetype = name->type;
453 args.trans = tp; 522 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
454 args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | 523 args->dp = dp;
524 args->whichfork = XFS_DATA_FORK;
525 args->trans = tp;
526 args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
455 XFS_DA_OP_OKNOENT; 527 XFS_DA_OP_OKNOENT;
456 528
457 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 529 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
458 rval = xfs_dir2_sf_addname(&args); 530 rval = xfs_dir2_sf_addname(args);
459 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 531 goto out_free;
460 return rval; 532 }
461 else if (v) 533
462 rval = xfs_dir2_block_addname(&args); 534 rval = xfs_dir2_isblock(tp, dp, &v);
463 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 535 if (rval)
464 return rval; 536 goto out_free;
465 else if (v) 537 if (v) {
466 rval = xfs_dir2_leaf_addname(&args); 538 rval = xfs_dir2_block_addname(args);
539 goto out_free;
540 }
541
542 rval = xfs_dir2_isleaf(tp, dp, &v);
543 if (rval)
544 goto out_free;
545 if (v)
546 rval = xfs_dir2_leaf_addname(args);
467 else 547 else
468 rval = xfs_dir2_node_addname(&args); 548 rval = xfs_dir2_node_addname(args);
549out_free:
550 kmem_free(args);
469 return rval; 551 return rval;
470} 552}
471 553
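Every function in this file gets the same treatment: the large xfs_da_args_t moves from the kernel stack to the heap, and the tangled if/else-if return chains become goto-based unwinding so the allocation is freed on every path. A generic userspace sketch of that refactor shape, with hypothetical names:

#include <stdlib.h>

struct big_args { char payload[512]; };	/* too big for a deep stack */

static int do_step(struct big_args *args) { (void)args; return 0; }

static int operate(void)
{
	struct big_args *args;
	int rval;

	/* Heap allocation replaces the old on-stack struct. */
	args = calloc(1, sizeof(*args));
	if (!args)
		return -1;

	rval = do_step(args);
	if (rval)
		goto out_free;	/* every exit path frees args */

	rval = do_step(args);

out_free:
	free(args);
	return rval;
}

int main(void)
{
	return operate();
}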
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 90cdbf4b5f19..4f6a38cb83a4 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -89,13 +89,14 @@ xfs_dir3_block_read_verify(
89{ 89{
90 struct xfs_mount *mp = bp->b_target->bt_mount; 90 struct xfs_mount *mp = bp->b_target->bt_mount;
91 91
92 if ((xfs_sb_version_hascrc(&mp->m_sb) && 92 if (xfs_sb_version_hascrc(&mp->m_sb) &&
93 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 93 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
94 XFS_DIR3_DATA_CRC_OFF)) || 94 xfs_buf_ioerror(bp, EFSBADCRC);
95 !xfs_dir3_block_verify(bp)) { 95 else if (!xfs_dir3_block_verify(bp))
96 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
97 xfs_buf_ioerror(bp, EFSCORRUPTED); 96 xfs_buf_ioerror(bp, EFSCORRUPTED);
98 } 97
98 if (bp->b_error)
99 xfs_verifier_error(bp);
99} 100}
100 101
101static void 102static void
@@ -107,8 +108,8 @@ xfs_dir3_block_write_verify(
107 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 108 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
108 109
109 if (!xfs_dir3_block_verify(bp)) { 110 if (!xfs_dir3_block_verify(bp)) {
110 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
111 xfs_buf_ioerror(bp, EFSCORRUPTED); 111 xfs_buf_ioerror(bp, EFSCORRUPTED);
112 xfs_verifier_error(bp);
112 return; 113 return;
113 } 114 }
114 115
@@ -118,7 +119,7 @@ xfs_dir3_block_write_verify(
118 if (bip) 119 if (bip)
119 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 120 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
120 121
121 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); 122 xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
122} 123}
123 124
124const struct xfs_buf_ops xfs_dir3_block_buf_ops = { 125const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 70acff4ee173..afa4ad523f3f 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -241,7 +241,6 @@ static void
241xfs_dir3_data_reada_verify( 241xfs_dir3_data_reada_verify(
242 struct xfs_buf *bp) 242 struct xfs_buf *bp)
243{ 243{
244 struct xfs_mount *mp = bp->b_target->bt_mount;
245 struct xfs_dir2_data_hdr *hdr = bp->b_addr; 244 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
246 245
247 switch (hdr->magic) { 246 switch (hdr->magic) {
@@ -255,8 +254,8 @@ xfs_dir3_data_reada_verify(
255 xfs_dir3_data_verify(bp); 254 xfs_dir3_data_verify(bp);
256 return; 255 return;
257 default: 256 default:
258 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
259 xfs_buf_ioerror(bp, EFSCORRUPTED); 257 xfs_buf_ioerror(bp, EFSCORRUPTED);
258 xfs_verifier_error(bp);
260 break; 259 break;
261 } 260 }
262} 261}
@@ -267,13 +266,14 @@ xfs_dir3_data_read_verify(
267{ 266{
268 struct xfs_mount *mp = bp->b_target->bt_mount; 267 struct xfs_mount *mp = bp->b_target->bt_mount;
269 268
270 if ((xfs_sb_version_hascrc(&mp->m_sb) && 269 if (xfs_sb_version_hascrc(&mp->m_sb) &&
271 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 270 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
272 XFS_DIR3_DATA_CRC_OFF)) || 271 xfs_buf_ioerror(bp, EFSBADCRC);
273 !xfs_dir3_data_verify(bp)) { 272 else if (!xfs_dir3_data_verify(bp))
274 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
275 xfs_buf_ioerror(bp, EFSCORRUPTED); 273 xfs_buf_ioerror(bp, EFSCORRUPTED);
276 } 274
275 if (bp->b_error)
276 xfs_verifier_error(bp);
277} 277}
278 278
279static void 279static void
@@ -285,8 +285,8 @@ xfs_dir3_data_write_verify(
285 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 285 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
286 286
287 if (!xfs_dir3_data_verify(bp)) { 287 if (!xfs_dir3_data_verify(bp)) {
288 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
289 xfs_buf_ioerror(bp, EFSCORRUPTED); 288 xfs_buf_ioerror(bp, EFSCORRUPTED);
289 xfs_verifier_error(bp);
290 return; 290 return;
291 } 291 }
292 292
@@ -296,7 +296,7 @@ xfs_dir3_data_write_verify(
296 if (bip) 296 if (bip)
297 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 297 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
298 298
299 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); 299 xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
300} 300}
301 301
302const struct xfs_buf_ops xfs_dir3_data_buf_ops = { 302const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ae47ec6e16c4..d36e97df1187 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -179,13 +179,14 @@ __read_verify(
179{ 179{
180 struct xfs_mount *mp = bp->b_target->bt_mount; 180 struct xfs_mount *mp = bp->b_target->bt_mount;
181 181
182 if ((xfs_sb_version_hascrc(&mp->m_sb) && 182 if (xfs_sb_version_hascrc(&mp->m_sb) &&
183 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 183 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
184 XFS_DIR3_LEAF_CRC_OFF)) || 184 xfs_buf_ioerror(bp, EFSBADCRC);
185 !xfs_dir3_leaf_verify(bp, magic)) { 185 else if (!xfs_dir3_leaf_verify(bp, magic))
186 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
187 xfs_buf_ioerror(bp, EFSCORRUPTED); 186 xfs_buf_ioerror(bp, EFSCORRUPTED);
188 } 187
188 if (bp->b_error)
189 xfs_verifier_error(bp);
189} 190}
190 191
191static void 192static void
@@ -198,8 +199,8 @@ __write_verify(
198 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; 199 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
199 200
200 if (!xfs_dir3_leaf_verify(bp, magic)) { 201 if (!xfs_dir3_leaf_verify(bp, magic)) {
201 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
202 xfs_buf_ioerror(bp, EFSCORRUPTED); 202 xfs_buf_ioerror(bp, EFSCORRUPTED);
203 xfs_verifier_error(bp);
203 return; 204 return;
204 } 205 }
205 206
@@ -209,7 +210,7 @@ __write_verify(
209 if (bip) 210 if (bip)
210 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); 211 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
211 212
212 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF); 213 xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
213} 214}
214 215
215static void 216static void
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 48c7d18f68c3..cb434d732681 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -115,13 +115,14 @@ xfs_dir3_free_read_verify(
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
-	if ((xfs_sb_version_hascrc(&mp->m_sb) &&
-	     !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-			       XFS_DIR3_FREE_CRC_OFF)) ||
-	    !xfs_dir3_free_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_dir3_free_verify(bp))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
 }
 
 static void
@@ -133,8 +134,8 @@ xfs_dir3_free_write_verify(
 	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
 
 	if (!xfs_dir3_free_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
 		return;
 	}
 
@@ -144,7 +145,7 @@ xfs_dir3_free_write_verify(
 	if (bip)
 		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
 
-	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF);
+	xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
 }
 
 const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 7aeb4c895b32..868b19f096bf 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -615,7 +615,7 @@ xfs_qm_dqread(
 
 	if (flags & XFS_QMOPT_DQALLOC) {
 		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm,
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
 					  XFS_QM_DQALLOC_SPACE_RES(mp), 0);
 		if (error)
 			goto error1;
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
index d401457d2f25..610da8177737 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -257,10 +257,13 @@ xfs_dquot_buf_read_verify(
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
-	if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	if (!xfs_dquot_buf_verify_crc(mp, bp))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_dquot_buf_verify(mp, bp))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
 }
 
 /*
@@ -275,8 +278,8 @@ xfs_dquot_buf_write_verify(
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
 	if (!xfs_dquot_buf_verify(mp, bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
 		return;
 	}
 }
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9995b807d627..edac5b057d28 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -156,7 +156,7 @@ xfs_error_report(
 {
 	if (level <= xfs_error_level) {
 		xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
-	"Internal error %s at line %d of file %s. Caller 0x%p",
+	"Internal error %s at line %d of file %s. Caller %pF",
 			    tag, linenum, filename, ra);
 
 		xfs_stack_trace();
@@ -178,3 +178,28 @@ xfs_corruption_error(
 	xfs_error_report(tag, level, mp, filename, linenum, ra);
 	xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
 }
+
+/*
+ * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
+ * values, and omit the stack trace unless the error level is tuned high.
+ */
+void
+xfs_verifier_error(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
+		  bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
+		  __return_address, bp->b_bn);
+
+	xfs_alert(mp, "Unmount and run xfs_repair");
+
+	if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+		xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:");
+		xfs_hex_dump(xfs_buf_offset(bp, 0), 64);
+	}
+
+	if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+		xfs_stack_trace();
+}
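
Taken together, the verifier hunks in this series converge on one shape: the read or write verifier only sets bp->b_error (EFSBADCRC for a checksum mismatch, EFSCORRUPTED for a failed structural check), and xfs_verifier_error() above becomes the single reporting point that inspects the error afterwards. A minimal userspace sketch of that split follows; the struct, error values and the two verify stubs are illustrative stand-ins, not the kernel definitions.

#include <stdbool.h>
#include <stdio.h>

#define EFSBADCRC	74	/* stand-in, modelled on EBADMSG */
#define EFSCORRUPTED	117	/* stand-in, modelled on EUCLEAN */

struct buf {
	int	error;		/* models bp->b_error */
};

static bool verify_cksum(struct buf *bp)  { (void)bp; return true; }  /* stub */
static bool verify_struct(struct buf *bp) { (void)bp; return false; } /* stub */

/* Single reporting point, in the role of xfs_verifier_error(). */
static void verifier_error(struct buf *bp)
{
	printf("Metadata %s detected\n",
	       bp->error == EFSBADCRC ? "CRC error" : "corruption");
}

static void read_verify(struct buf *bp)
{
	/* CRC first: structural checks are meaningless on a mangled buffer. */
	if (!verify_cksum(bp))
		bp->error = EFSBADCRC;
	else if (!verify_struct(bp))
		bp->error = EFSCORRUPTED;

	if (bp->error)
		verifier_error(bp);
}

int main(void)
{
	struct buf bp = { 0 };

	read_verify(&bp);	/* prints "Metadata corruption detected" */
	return 0;
}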
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 079a367f44ee..c1c57d4a4b5d 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -34,6 +34,7 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
 extern void xfs_corruption_error(const char *tag, int level,
 			struct xfs_mount *mp, void *p, const char *filename,
 			int linenum, inst_t *ra);
+extern void xfs_verifier_error(struct xfs_buf *bp);
 
 #define	XFS_ERROR_REPORT(e, lvl, mp)	\
 	xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 64b48eade91d..f7abff8c16ca 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -823,7 +823,8 @@ xfs_file_fallocate(
 
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
@@ -831,6 +832,20 @@ xfs_file_fallocate(
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
 			goto out_unlock;
+	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+		if (offset & blksize_mask || len & blksize_mask) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		ASSERT(offset + len < i_size_read(inode));
+		new_size = i_size_read(inode) - len;
+
+		error = xfs_collapse_file_space(ip, offset, len);
+		if (error)
+			goto out_unlock;
 	} else {
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    offset + len > i_size_read(inode)) {
@@ -840,8 +855,11 @@ xfs_file_fallocate(
 			goto out_unlock;
 		}
 
-		error = xfs_alloc_file_space(ip, offset, len,
-					     XFS_BMAPI_PREALLOC);
+		if (mode & FALLOC_FL_ZERO_RANGE)
+			error = xfs_zero_file_space(ip, offset, len);
+		else
+			error = xfs_alloc_file_space(ip, offset, len,
+						     XFS_BMAPI_PREALLOC);
 		if (error)
 			goto out_unlock;
 	}
@@ -859,7 +877,7 @@ xfs_file_fallocate(
 		if (ip->i_d.di_mode & S_IXGRP)
 			ip->i_d.di_mode &= ~S_ISGID;
 
-		if (!(mode & FALLOC_FL_PUNCH_HOLE))
+		if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
 			ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
 
 		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
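
The collapse-range branch above rejects offsets and lengths that are not block-size multiples, and the range must end before EOF. Those constraints are visible from userspace through fallocate(2). A hedged usage sketch; the 4096-byte block size is an assumption for illustration (the real value should be queried, e.g. via fstatfs()).

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t	blksz = 4096;	/* assumed filesystem block size */
	int	fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* offset and len must be block multiples, and the collapsed range
	 * must end before EOF, or the kernel returns EINVAL */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, blksz, blksz) < 0)
		perror("FALLOC_FL_COLLAPSE_RANGE");

	/* converts an in-file range to allocated zeroes without shifting data */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 0, blksz) < 0)
		perror("FALLOC_FL_ZERO_RANGE");

	close(fd);
	return 0;
}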
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
index b6ab5a3cfa12..9898f31d05d8 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/xfs_format.h
@@ -145,6 +145,8 @@ struct xfs_dsymlink_hdr {
 	__be64	sl_lsn;
 };
 
+#define XFS_SYMLINK_CRC_OFF	offsetof(struct xfs_dsymlink_hdr, sl_crc)
+
 /*
  * The maximum pathlen is 1024 bytes. Since the minimum file system
  * blocksize is 512 bytes, we can get a max of 3 extents back from
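
Constants like XFS_SYMLINK_CRC_OFF simply name the byte offset of the CRC field inside the on-disk header, so the checksum helpers know which bytes to treat as zero. A small standalone illustration of the offsetof() idiom; the struct here is a mock, not the real xfs_dsymlink_hdr layout.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Mock on-disk header; the CRC sits among the other fields. */
struct disk_hdr {
	uint32_t	magic;
	uint32_t	crc;	/* checksummed as zero, then filled in */
	uint64_t	blkno;
};

#define HDR_CRC_OFF	offsetof(struct disk_hdr, crc)

int main(void)
{
	/* a checksum helper would hash the buffer while treating bytes
	 * [HDR_CRC_OFF, HDR_CRC_OFF + sizeof(uint32_t)) as zero */
	printf("CRC field at byte offset %zu\n", HDR_CRC_OFF);
	return 0;
}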
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5d7f105a1c82..8f711db61a0c 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -363,6 +363,18 @@ xfs_ialloc_ag_alloc(
 		args.minleft = args.mp->m_in_maxlevels - 1;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
+
+		/*
+		 * This request might have dirtied the transaction if the AG can
+		 * satisfy the request, but the exact block was not available.
+		 * If the allocation did fail, subsequent requests will relax
+		 * the exact agbno requirement and increase the alignment
+		 * instead. It is critical that the total size of the request
+		 * (len + alignment + slop) does not increase from this point
+		 * on, so reset minalignslop to ensure it is not included in
+		 * subsequent requests.
+		 */
+		args.minalignslop = 0;
 	} else
 		args.fsbno = NULLFSBLOCK;
 
@@ -1568,18 +1580,17 @@ xfs_agi_read_verify(
 	struct xfs_buf	*bp)
 {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
-	int		agi_ok = 1;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-					  offsetof(struct xfs_agi, agi_crc));
-	agi_ok = agi_ok && xfs_agi_verify(bp);
 
-	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
-			XFS_RANDOM_IALLOC_READ_AGI))) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
+				XFS_ERRTAG_IALLOC_READ_AGI,
+				XFS_RANDOM_IALLOC_READ_AGI))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
 }
 
 static void
@@ -1590,8 +1601,8 @@ xfs_agi_write_verify(
 	struct xfs_buf_log_item	*bip = bp->b_fspriv;
 
 	if (!xfs_agi_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
 		return;
 	}
 
@@ -1600,8 +1611,7 @@ xfs_agi_write_verify(
 
 	if (bip)
 		XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-			 offsetof(struct xfs_agi, agi_crc));
+	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
 }
 
 const struct xfs_buf_ops xfs_agi_buf_ops = {
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c8fa5bbb36de..7e309b11e87d 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -243,12 +243,14 @@ static void
 xfs_inobt_read_verify(
 	struct xfs_buf	*bp)
 {
-	if (!(xfs_btree_sblock_verify_crc(bp) &&
-	      xfs_inobt_verify(bp))) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
-				     bp->b_target->bt_mount, bp->b_addr);
+	if (!xfs_btree_sblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_inobt_verify(bp))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
 	}
 }
 
@@ -258,9 +260,9 @@ xfs_inobt_write_verify(
 {
 	if (!xfs_inobt_verify(bp)) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
-				     bp->b_target->bt_mount, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
 	}
 	xfs_btree_sblock_calc_crc(bp);
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3a137e9f9a7d..5e7a38fa6ee6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -42,7 +42,6 @@
 #include "xfs_bmap_util.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
-#include "xfs_dinode.h"
 #include "xfs_filestream.h"
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
@@ -62,6 +61,8 @@ kmem_zone_t *xfs_inode_zone;
 
 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
 
+STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *);
+
 /*
  * helper function to extract extent size hint from inode
  */
@@ -1115,7 +1116,7 @@ xfs_bumplink(
 {
 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
-	ASSERT(ip->i_d.di_nlink > 0);
+	ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
 	ip->i_d.di_nlink++;
 	inc_nlink(VFS_I(ip));
 	if ((ip->i_d.di_version == 1) &&
@@ -1165,10 +1166,7 @@ xfs_create(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = xfs_get_projid(dp);
-	else
-		prid = XFS_PROJID_DEFAULT;
+	prid = xfs_get_initial_prid(dp);
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -1333,6 +1331,113 @@ xfs_create(
 }
 
 int
+xfs_create_tmpfile(
+	struct xfs_inode	*dp,
+	struct dentry		*dentry,
+	umode_t			mode)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_inode	*ip = NULL;
+	struct xfs_trans	*tp = NULL;
+	int			error;
+	uint			cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	prid_t			prid;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
+	struct xfs_trans_res	*tres;
+	uint			resblks;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	prid = xfs_get_initial_prid(dp);
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+				xfs_kgid_to_gid(current_fsgid()), prid,
+				XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+				&udqp, &gdqp, &pdqp);
+	if (error)
+		return error;
+
+	resblks = XFS_IALLOC_SPACE_RES(mp);
+	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
+
+	tres = &M_RES(mp)->tr_create_tmpfile;
+	error = xfs_trans_reserve(tp, tres, resblks, 0);
+	if (error == ENOSPC) {
+		/* No space at all so try a "no-allocation" reservation */
+		resblks = 0;
+		error = xfs_trans_reserve(tp, tres, 0, 0);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto out_trans_cancel;
+	}
+
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+						pdqp, resblks, 1, 0);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
+				prid, resblks > 0, &ip, NULL);
+	if (error) {
+		if (error == ENOSPC)
+			goto out_trans_cancel;
+		goto out_trans_abort;
+	}
+
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+
+	/*
+	 * Attach the dquot(s) to the inodes and modify them incore.
+	 * These ids of the inode couldn't have changed since the new
+	 * inode has been locked ever since it was created.
+	 */
+	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+	ip->i_d.di_nlink--;
+	d_tmpfile(dentry, VFS_I(ip));
+	error = xfs_iunlink(tp, ip);
+	if (error)
+		goto out_trans_abort;
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error)
+		goto out_release_inode;
+
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
+
+	return 0;
+
+ out_trans_abort:
+	cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+	/*
+	 * Wait until after the current transaction is aborted to
+	 * release the inode. This prevents recursive transactions
+	 * and deadlocks from xfs_inactive.
+	 */
+	if (ip)
+		IRELE(ip);
+
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
+
+	return error;
+}
+
+int
 xfs_link(
 	xfs_inode_t		*tdp,
 	xfs_inode_t		*sip,
@@ -1397,6 +1502,12 @@ xfs_link(
 
 	xfs_bmap_init(&free_list, &first_block);
 
+	if (sip->i_d.di_nlink == 0) {
+		error = xfs_iunlink_remove(tp, sip);
+		if (error)
+			goto abort_return;
+	}
+
 	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
 					&first_block, &free_list, resblks);
 	if (error)
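
The xfs_create_tmpfile() path above is what backs the O_TMPFILE open flag: the inode is born with a zero link count directly on the AGI unlinked list, and the new xfs_iunlink_remove() call in xfs_link() takes it back off when the file is later given a name. A typical userspace sequence exercising both paths; linkat() through /proc/self/fd is the documented way to name an O_TMPFILE file, and the /tmp paths are illustrative.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char	path[64];
	int	fd;

	/* created with nlink == 0, directly on the unlinked list */
	fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);
	if (fd < 0) {
		perror("O_TMPFILE");
		return 1;
	}
	if (write(fd, "scratch\n", 8) != 8)
		perror("write");

	/* naming it is the nlink 0 -> 1 transition that xfs_link() now
	 * handles by first removing the inode from the unlinked list */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible",
		   AT_SYMLINK_FOLLOW) < 0)
		perror("linkat");

	close(fd);
	return 0;
}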
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65e2350f449c..396cc1fafd0d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -20,6 +20,7 @@
 
 #include "xfs_inode_buf.h"
 #include "xfs_inode_fork.h"
+#include "xfs_dinode.h"
 
 /*
  * Kernel only inode definitions
@@ -192,6 +193,15 @@ xfs_set_projid(struct xfs_inode *ip,
 	ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
 }
 
+static inline prid_t
+xfs_get_initial_prid(struct xfs_inode *dp)
+{
+	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+		return xfs_get_projid(dp);
+
+	return XFS_PROJID_DEFAULT;
+}
+
 /*
  * In-core inode flags.
  */
@@ -323,6 +333,8 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 		 struct xfs_inode **ipp, struct xfs_name *ci_name);
 int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
 			   umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
+int		xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
+			   umode_t mode);
 int		xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 			   struct xfs_inode *ip);
 int		xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index 4fc9f39dd89e..24e993996bdc 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -102,8 +102,7 @@ xfs_inode_buf_verify(
 	}
 
 	xfs_buf_ioerror(bp, EFSCORRUPTED);
-	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
-			     mp, dip);
+	xfs_verifier_error(bp);
 #ifdef DEBUG
 	xfs_alert(mp,
 		"bad inode magic/vsn daddr %lld #%d (magic=%x)",
@@ -306,7 +305,7 @@ xfs_dinode_verify(
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
 		return false;
 	if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
-			      offsetof(struct xfs_dinode, di_crc)))
+			      XFS_DINODE_CRC_OFF))
 		return false;
 	if (be64_to_cpu(dip->di_ino) != ip->i_ino)
 		return false;
@@ -327,7 +326,7 @@ xfs_dinode_calc_crc(
 
 	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
 	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
-			      offsetof(struct xfs_dinode, di_crc));
+			      XFS_DINODE_CRC_OFF);
 	dip->di_crc = xfs_end_cksum(crc);
 }
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 22d1cbea283d..3b80ebae05f5 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -128,7 +128,6 @@ xfs_iomap_write_direct(
 	xfs_fsblock_t	firstfsb;
 	xfs_extlen_t	extsz, temp;
 	int		nimaps;
-	int		bmapi_flag;
 	int		quota_flag;
 	int		rt;
 	xfs_trans_t	*tp;
@@ -200,18 +199,15 @@ xfs_iomap_write_direct(
 
 	xfs_trans_ijoin(tp, ip, 0);
 
-	bmapi_flag = 0;
-	if (offset < XFS_ISIZE(ip) || extsz)
-		bmapi_flag |= XFS_BMAPI_PREALLOC;
-
 	/*
 	 * From this point onwards we overwrite the imap pointer that the
 	 * caller gave to us.
 	 */
 	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
-	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
-				&firstfsb, 0, imap, &nimaps, &free_list);
+	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+				XFS_BMAPI_PREALLOC, &firstfsb, 0,
+				imap, &nimaps, &free_list);
 	if (error)
 		goto out_bmap_cancel;
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 9ddfb8190ca1..89b07e43ca28 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -39,6 +39,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_dinode.h"
+#include "xfs_trans_space.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
@@ -48,6 +49,18 @@
 #include <linux/fiemap.h>
 #include <linux/slab.h>
 
+/*
+ * Directories have different lock order w.r.t. mmap_sem compared to regular
+ * files. This is due to readdir potentially triggering page faults on a user
+ * buffer inside filldir(), and this happens with the ilock on the directory
+ * held. For regular files, the lock order is the other way around - the
+ * mmap_sem is taken during the page fault, and then we lock the ilock to do
+ * block mapping. Hence we need a different class for the directory ilock so
+ * that lockdep can tell them apart.
+ */
+static struct lock_class_key xfs_nondir_ilock_class;
+static struct lock_class_key xfs_dir_ilock_class;
+
 static int
 xfs_initxattrs(
 	struct inode		*inode,
@@ -1034,6 +1047,19 @@ xfs_vn_fiemap(
 	return 0;
 }
 
+STATIC int
+xfs_vn_tmpfile(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	umode_t		mode)
+{
+	int			error;
+
+	error = xfs_create_tmpfile(XFS_I(dir), dentry, mode);
+
+	return -error;
+}
+
 static const struct inode_operations xfs_inode_operations = {
 	.get_acl		= xfs_get_acl,
 	.set_acl		= xfs_set_acl,
@@ -1072,6 +1098,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
 	.removexattr		= generic_removexattr,
 	.listxattr		= xfs_vn_listxattr,
 	.update_time		= xfs_vn_update_time,
+	.tmpfile		= xfs_vn_tmpfile,
 };
 
 static const struct inode_operations xfs_dir_ci_inode_operations = {
@@ -1099,6 +1126,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
 	.removexattr		= generic_removexattr,
 	.listxattr		= xfs_vn_listxattr,
 	.update_time		= xfs_vn_update_time,
+	.tmpfile		= xfs_vn_tmpfile,
 };
 
 static const struct inode_operations xfs_symlink_inode_operations = {
@@ -1191,6 +1219,7 @@ xfs_setup_inode(
 	xfs_diflags_to_iflags(inode, ip);
 
 	ip->d_ops = ip->i_mount->m_nondir_inode_ops;
+	lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_op = &xfs_inode_operations;
@@ -1198,6 +1227,7 @@ xfs_setup_inode(
 		inode->i_mapping->a_ops = &xfs_address_space_operations;
 		break;
 	case S_IFDIR:
+		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
 		if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
 			inode->i_op = &xfs_dir_ci_inode_operations;
 		else
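
The lockdep comment in the hunk above describes a classic AB-BA shape: directories take the ilock before mmap_sem (readdir faulting on the user buffer), while regular files take mmap_sem before the ilock (block mapping in the fault path). Giving the two ilocks distinct lockdep classes tells the checker these are two different orderings on different locks, not one contradictory one. A toy pthread illustration of the idea; all names here are illustrative, and pthread mutexes of course carry no lockdep annotations.

#include <pthread.h>
#include <stdio.h>

/* Two distinct "ilock" instances, mirroring the two lockdep classes. */
static pthread_mutex_t dir_ilock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t file_ilock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t mmap_sem   = PTHREAD_MUTEX_INITIALIZER;

/* readdir-style path: directory ilock first, then the fault takes mmap_sem */
static void *readdir_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&dir_ilock);
	pthread_mutex_lock(&mmap_sem);		/* page fault in filldir() */
	pthread_mutex_unlock(&mmap_sem);
	pthread_mutex_unlock(&dir_ilock);
	return NULL;
}

/* fault-style path: mmap_sem first, then the file ilock for block mapping.
 * If dir_ilock and file_ilock were considered the same lock class, the two
 * orders combined would look like an AB-BA deadlock - exactly the false
 * positive that the separate classes suppress. */
static void *fault_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&mmap_sem);
	pthread_mutex_lock(&file_ilock);
	pthread_mutex_unlock(&file_ilock);
	pthread_mutex_unlock(&mmap_sem);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, readdir_path, NULL);
	pthread_create(&b, NULL, fault_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}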
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index f9bb590acc0e..825249d2dfc1 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -119,6 +119,7 @@ typedef __uint64_t __psunsigned_t;
 #include "xfs_iops.h"
 #include "xfs_aops.h"
 #include "xfs_super.h"
+#include "xfs_cksum.h"
 #include "xfs_buf.h"
 #include "xfs_message.h"
 
@@ -178,6 +179,7 @@ typedef __uint64_t __psunsigned_t;
 #define ENOATTR		ENODATA		/* Attribute not found */
 #define EWRONGFS	EINVAL		/* Mount with wrong filesystem type */
 #define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
+#define EFSBADCRC	EBADMSG		/* Bad CRC detected */
 
 #define SYNCHRONIZE()	barrier()
 #define __return_address __builtin_return_address(0)
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index b0f4ef77fa70..2c4004475e71 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -175,7 +175,7 @@ void xlog_iodone(struct xfs_buf *);
 struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void	xfs_log_ticket_put(struct xlog_ticket *ticket);
 
-int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+void	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 				xfs_lsn_t *commit_lsn, int flags);
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4ef6fdbced78..7e5455391176 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -499,13 +499,6 @@ xlog_cil_push(
 	cil->xc_ctx = new_ctx;
 
 	/*
-	 * mirror the new sequence into the cil structure so that we can do
-	 * unlocked checks against the current sequence in log forces without
-	 * risking deferencing a freed context pointer.
-	 */
-	cil->xc_current_sequence = new_ctx->sequence;
-
-	/*
 	 * The switch is now done, so we can drop the context lock and move out
 	 * of a shared context. We can't just go straight to the commit record,
 	 * though - we need to synchronise with previous and future commits so
@@ -523,8 +516,15 @@ xlog_cil_push(
 	 * Hence we need to add this context to the committing context list so
 	 * that higher sequences will wait for us to write out a commit record
 	 * before they do.
+	 *
+	 * xfs_log_force_lsn requires us to mirror the new sequence into the cil
+	 * structure atomically with the addition of this sequence to the
+	 * committing list. This also ensures that we can do unlocked checks
+	 * against the current sequence in log forces without risking
+	 * deferencing a freed context pointer.
 	 */
 	spin_lock(&cil->xc_push_lock);
+	cil->xc_current_sequence = new_ctx->sequence;
 	list_add(&ctx->committing, &cil->xc_committing);
 	spin_unlock(&cil->xc_push_lock);
 	up_write(&cil->xc_ctx_lock);
@@ -662,8 +662,14 @@ xlog_cil_push_background(
 
 }
 
+/*
+ * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
+ * number that is passed. When it returns, the work will be queued for
+ * @push_seq, but it won't be completed. The caller is expected to do any
+ * waiting for push_seq to complete if it is required.
+ */
 static void
-xlog_cil_push_foreground(
+xlog_cil_push_now(
 	struct xlog	*log,
 	xfs_lsn_t	push_seq)
 {
@@ -688,10 +694,8 @@ xlog_cil_push_foreground(
 	}
 
 	cil->xc_push_seq = push_seq;
+	queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
 	spin_unlock(&cil->xc_push_lock);
-
-	/* do the push now */
-	xlog_cil_push(log);
 }
 
 bool
@@ -721,7 +725,7 @@ xlog_cil_empty(
  * background commit, returns without it held once background commits are
  * allowed again.
  */
-int
+void
 xfs_log_commit_cil(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
@@ -767,7 +771,6 @@ xfs_log_commit_cil(
 	xlog_cil_push_background(log);
 
 	up_read(&cil->xc_ctx_lock);
-	return 0;
 }
 
 /*
@@ -796,7 +799,8 @@ xlog_cil_force_lsn(
 	 * xlog_cil_push() handles racing pushes for the same sequence,
 	 * so no need to deal with it here.
 	 */
-	xlog_cil_push_foreground(log, sequence);
+restart:
+	xlog_cil_push_now(log, sequence);
 
 	/*
 	 * See if we can find a previous sequence still committing.
@@ -804,7 +808,6 @@ xlog_cil_force_lsn(
 	 * before allowing the force of push_seq to go ahead. Hence block
 	 * on commits for those as well.
 	 */
-restart:
 	spin_lock(&cil->xc_push_lock);
 	list_for_each_entry(ctx, &cil->xc_committing, committing) {
 		if (ctx->sequence > sequence)
@@ -822,6 +825,28 @@ restart:
 		/* found it! */
 		commit_lsn = ctx->commit_lsn;
 	}
+
+	/*
+	 * The call to xlog_cil_push_now() executes the push in the background.
+	 * Hence by the time we have got here it our sequence may not have been
+	 * pushed yet. This is true if the current sequence still matches the
+	 * push sequence after the above wait loop and the CIL still contains
+	 * dirty objects.
+	 *
+	 * When the push occurs, it will empty the CIL and
+	 * atomically increment the currect sequence past the push sequence and
+	 * move it into the committing list. Of course, if the CIL is clean at
+	 * the time of the push, it won't have pushed the CIL at all, so in that
+	 * case we should try the push for this sequence again from the start
+	 * just in case.
+	 */
+
+	if (sequence == cil->xc_current_sequence &&
+	    !list_empty(&cil->xc_cil)) {
+		spin_unlock(&cil->xc_push_lock);
+		goto restart;
+	}
+
 	spin_unlock(&cil->xc_push_lock);
 	return commit_lsn;
 }
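
The race the restart loop above closes: the push now runs on a workqueue, so a forcing thread can observe the CIL before the worker has bumped the sequence onto the committing list. A much-simplified single-process model of that retry logic; all names and the single deferred "worker" call are illustrative, not the kernel's concurrency structure.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model: one accumulating sequence plus a dirty flag. */
static unsigned long current_seq = 1;
static bool cil_dirty = true;
static bool push_queued;

/* Queue the push; in the kernel this is queue_work(). */
static void push_now(unsigned long seq)
{
	push_queued = true;
	printf("push of seq %lu queued\n", seq);
}

/* The deferred worker: empties the CIL and bumps the sequence. */
static void worker(void)
{
	if (push_queued && cil_dirty) {
		cil_dirty = false;
		current_seq++;	/* old sequence is now "committing" */
	}
	push_queued = false;
}

static void force_seq(unsigned long seq)
{
restart:
	push_now(seq);
	/* ...wait here for sequences already on the committing list... */
	if (seq == current_seq && cil_dirty) {
		/* worker has not run yet: let it, then check again */
		worker();
		goto restart;
	}
	printf("seq %lu stable\n", seq);
}

int main(void)
{
	force_seq(1);
	return 0;
}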
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f96c05669a9e..993cb19e7d39 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -314,6 +314,9 @@ reread:
 		error = bp->b_error;
 		if (loud)
 			xfs_warn(mp, "SB validate failed with error %d.", error);
+		/* bad CRC means corrupted metadata */
+		if (error == EFSBADCRC)
+			error = EFSCORRUPTED;
 		goto release_buf;
 	}
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a6a76b2b6a85..ec5ca65c6211 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -842,7 +842,7 @@ xfs_growfs_rt_alloc(
 		/*
 		 * Reserve space & log for one extent added to the file.
 		 */
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
 					  resblks, 0);
 		if (error)
 			goto error_cancel;
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 1e116794bb66..0c0e41bbe4e3 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -288,6 +288,7 @@ xfs_mount_validate_sb(
 	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||
 	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG			||
 	    sbp->sb_inodesize != (1 << sbp->sb_inodelog)		||
+	    sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
 	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)	||
 	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
 	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||
@@ -610,12 +611,11 @@ xfs_sb_read_verify(
 				XFS_SB_VERSION_5) ||
 	     dsb->sb_crc != 0)) {
 
-		if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-				      offsetof(struct xfs_sb, sb_crc))) {
+		if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
 			/* Only fail bad secondaries on a known V5 filesystem */
 			if (bp->b_bn == XFS_SB_DADDR ||
 			    xfs_sb_version_hascrc(&mp->m_sb)) {
-				error = EFSCORRUPTED;
+				error = EFSBADCRC;
 				goto out_error;
 			}
 		}
@@ -624,10 +624,9 @@ xfs_sb_read_verify(
 
 out_error:
 	if (error) {
-		if (error == EFSCORRUPTED)
-			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
-					     mp, bp->b_addr);
 		xfs_buf_ioerror(bp, error);
+		if (error == EFSCORRUPTED || error == EFSBADCRC)
+			xfs_verifier_error(bp);
 	}
 }
 
@@ -662,9 +661,8 @@ xfs_sb_write_verify(
 
 	error = xfs_sb_verify(bp, false);
 	if (error) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
-				     mp, bp->b_addr);
 		xfs_buf_ioerror(bp, error);
+		xfs_verifier_error(bp);
 		return;
 	}
 
@@ -674,8 +672,7 @@ xfs_sb_write_verify(
 	if (bip)
 		XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 
-	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-			 offsetof(struct xfs_sb, sb_crc));
+	xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
 }
 
 const struct xfs_buf_ops xfs_sb_buf_ops = {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 35061d4b614c..f7b2fe77c5a5 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -182,6 +182,8 @@ typedef struct xfs_sb {
 	/* must be padded to 64 bit alignment */
 } xfs_sb_t;
 
+#define XFS_SB_CRC_OFF		offsetof(struct xfs_sb, sb_crc)
+
 /*
  * Superblock - on disk version. Must match the in core version above.
  * Must be padded to 64 bit alignment.
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h
index 8c5035a13df1..4484e5151395 100644
--- a/fs/xfs/xfs_shared.h
+++ b/fs/xfs/xfs_shared.h
@@ -104,7 +104,8 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 #define XFS_TRANS_SB_COUNT		41
 #define XFS_TRANS_CHECKPOINT		42
 #define XFS_TRANS_ICREATE		43
-#define XFS_TRANS_TYPE_MAX		43
+#define XFS_TRANS_CREATE_TMPFILE	44
+#define XFS_TRANS_TYPE_MAX		44
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -112,6 +113,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
 	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
 	{ XFS_TRANS_CREATE,		"CREATE" }, \
+	{ XFS_TRANS_CREATE_TMPFILE,	"CREATE_TMPFILE" }, \
 	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
 	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
 	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d971f4932b5d..205376776377 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -996,7 +996,7 @@ xfs_fs_evict_inode(
 
 	trace_xfs_evict_inode(ip);
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	XFS_STATS_INC(vn_rele);
 	XFS_STATS_INC(vn_remove);
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
 	char			*p;
 	int			error;
 
+	sync_filesystem(sb);
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
 
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 14e58f2c96bd..52979aa90986 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -80,6 +80,10 @@ xfs_readlink_bmap(
 		if (error) {
 			xfs_buf_ioerror_alert(bp, __func__);
 			xfs_buf_relse(bp);
+
+			/* bad CRC means corrupted metadata */
+			if (error == EFSBADCRC)
+				error = EFSCORRUPTED;
 			goto out;
 		}
 		byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -208,10 +212,7 @@ xfs_symlink(
 		return XFS_ERROR(ENAMETOOLONG);
 
 	udqp = gdqp = NULL;
-	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = xfs_get_projid(dp);
-	else
-		prid = XFS_PROJID_DEFAULT;
+	prid = xfs_get_initial_prid(dp);
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
index bf59a2b45f8c..9b32052ff65e 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -133,12 +133,13 @@ xfs_symlink_read_verify(
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
 		return;
 
-	if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-				offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
-	    !xfs_symlink_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_symlink_verify(bp))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
 }
 
 static void
@@ -153,8 +154,8 @@ xfs_symlink_write_verify(
 		return;
 
 	if (!xfs_symlink_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
 		return;
 	}
 
@@ -162,8 +163,7 @@ xfs_symlink_write_verify(
 		struct xfs_dsymlink_hdr *dsl = bp->b_addr;
 		dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 	}
-	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-			 offsetof(struct xfs_dsymlink_hdr, sl_crc));
+	xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
 }
 
 const struct xfs_buf_ops xfs_symlink_buf_ops = {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 425dfa45b9a0..a4ae41c179a8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink);
 DEFINE_INODE_EVENT(xfs_inactive_symlink);
 DEFINE_INODE_EVENT(xfs_alloc_file_space);
 DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_collapse_file_space);
 DEFINE_INODE_EVENT(xfs_readdir);
 #ifdef CONFIG_XFS_POSIX_ACL
 DEFINE_INODE_EVENT(xfs_get_acl);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c812c5c060de..54a57326d85b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -887,12 +887,7 @@ xfs_trans_commit(
 		xfs_trans_apply_sb_deltas(tp);
 	xfs_trans_apply_dquot_deltas(tp);
 
-	error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
-	if (error == ENOMEM) {
-		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-		error = XFS_ERROR(EIO);
-		goto out_unreserve;
-	}
+	xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
 
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 	xfs_trans_free(tp);
@@ -902,10 +897,7 @@ xfs_trans_commit(
 	 * log out now and wait for it.
 	 */
 	if (sync) {
-		if (!error) {
-			error = _xfs_log_force_lsn(mp, commit_lsn,
-				      XFS_LOG_SYNC, NULL);
-		}
+		error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
 		XFS_STATS_INC(xs_trans_sync);
 	} else {
 		XFS_STATS_INC(xs_trans_async);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 647b6f1d8923..b8eef0549f3f 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -275,6 +275,10 @@ xfs_trans_read_buf_map(
 			XFS_BUF_UNDONE(bp);
 			xfs_buf_stale(bp);
 			xfs_buf_relse(bp);
+
+			/* bad CRC means corrupted metadata */
+			if (error == EFSBADCRC)
+				error = EFSCORRUPTED;
 			return error;
 		}
 #ifdef DEBUG
@@ -338,6 +342,9 @@ xfs_trans_read_buf_map(
 			if (tp->t_flags & XFS_TRANS_DIRTY)
 				xfs_force_shutdown(tp->t_mountp,
 						   SHUTDOWN_META_IO_ERROR);
+			/* bad CRC means corrupted metadata */
+			if (error == EFSBADCRC)
+				error = EFSCORRUPTED;
 			return error;
 		}
 	}
@@ -375,6 +382,10 @@ xfs_trans_read_buf_map(
 		if (tp->t_flags & XFS_TRANS_DIRTY)
 			xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
 		xfs_buf_relse(bp);
+
+		/* bad CRC means corrupted metadata */
+		if (error == EFSBADCRC)
+			error = EFSCORRUPTED;
 		return error;
 	}
 #ifdef DEBUG
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index 2ffd3e331b49..ae368165244d 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -81,20 +81,28 @@ xfs_calc_buf_res(
  * on disk. Hence we need an inode reservation function that calculates all this
  * correctly. So, we log:
  *
- * - log op headers for object
+ * - 4 log op headers for object
+ *	- for the ilf, the inode core and 2 forks
  * - inode log format object
- * - the entire inode contents (core + 2 forks)
- * - two bmap btree block headers
+ * - the inode core
+ * - two inode forks containing bmap btree root blocks.
+ *	- the btree data contained by both forks will fit into the inode size,
+ *	  hence when combined with the inode core above, we have a total of the
+ *	  actual inode size.
+ *	- the BMBT headers need to be accounted separately, as they are
+ *	  additional to the records and pointers that fit inside the inode
+ *	  forks.
  */
 STATIC uint
 xfs_calc_inode_res(
 	struct xfs_mount	*mp,
 	uint			ninodes)
 {
-	return ninodes * (sizeof(struct xlog_op_header) +
-			  sizeof(struct xfs_inode_log_format) +
-			  mp->m_sb.sb_inodesize +
-			  2 * XFS_BMBT_BLOCK_LEN(mp));
+	return ninodes *
+		(4 * sizeof(struct xlog_op_header) +
+		 sizeof(struct xfs_inode_log_format) +
+		 mp->m_sb.sb_inodesize +
+		 2 * XFS_BMBT_BLOCK_LEN(mp));
 }
 
 /*
@@ -204,6 +212,19 @@ xfs_calc_rename_reservation(
 }
 
 /*
+ * For removing an inode from unlinked list at first, we can modify:
+ *    the agi hash list and counters: sector size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_remove_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+	       max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+}
+
+/*
  * For creating a link to an inode:
  *    the parent directory inode: inode size
  *    the linked inode: inode size
@@ -220,6 +241,7 @@ xfs_calc_link_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_iunlink_remove_reservation(mp) +
 		MAX((xfs_calc_inode_res(mp, 2) +
 		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
 				      XFS_FSB_TO_B(mp, 1))),
@@ -229,6 +251,18 @@ xfs_calc_link_reservation(
 }
 
 /*
+ * For adding an inode to unlinked list we can modify:
+ *    the agi hash list: sector size
+ *    the unlinked inode: inode size
+ */
+STATIC uint
+xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_inode_res(mp, 1);
+}
+
+/*
  * For removing a directory entry we can modify:
  *    the parent directory inode: inode size
 *    the removed inode: inode size
@@ -245,10 +279,11 @@ xfs_calc_remove_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		MAX((xfs_calc_inode_res(mp, 2) +
+		xfs_calc_iunlink_add_reservation(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
 		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
 				      XFS_FSB_TO_B(mp, 1))),
-		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		    (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
 		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
 				      XFS_FSB_TO_B(mp, 1))));
 }
343 378
344} 379}
345 380
381STATIC uint
382xfs_calc_create_tmpfile_reservation(
383 struct xfs_mount *mp)
384{
385 uint res = XFS_DQUOT_LOGRES(mp);
386
387 if (xfs_sb_version_hascrc(&mp->m_sb))
388 res += xfs_calc_icreate_resv_alloc(mp);
389 else
390 res += xfs_calc_create_resv_alloc(mp);
391
392 return res + xfs_calc_iunlink_add_reservation(mp);
393}
394
346/* 395/*
347 * Making a new directory is the same as creating a new file. 396 * Making a new directory is the same as creating a new file.
348 */ 397 */
@@ -383,9 +432,9 @@ xfs_calc_ifree_reservation(
383{ 432{
384 return XFS_DQUOT_LOGRES(mp) + 433 return XFS_DQUOT_LOGRES(mp) +
385 xfs_calc_inode_res(mp, 1) + 434 xfs_calc_inode_res(mp, 1) +
386 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + 435 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
387 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + 436 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
388 max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) + 437 xfs_calc_iunlink_remove_reservation(mp) +
389 xfs_calc_buf_res(1, 0) + 438 xfs_calc_buf_res(1, 0) +
390 xfs_calc_buf_res(2 + mp->m_ialloc_blks + 439 xfs_calc_buf_res(2 + mp->m_ialloc_blks +
391 mp->m_in_maxlevels, 0) + 440 mp->m_in_maxlevels, 0) +
@@ -644,15 +693,14 @@ xfs_calc_qm_setqlim_reservation(
644 693
645/* 694/*
646 * Allocating quota on disk if needed. 695 * Allocating quota on disk if needed.
647 * the write transaction log space: M_RES(mp)->tr_write.tr_logres 696 * the write transaction log space for quota file extent allocation
648 * the unit of quota allocation: one system block size 697 * the unit of quota allocation: one system block size
649 */ 698 */
650STATIC uint 699STATIC uint
651xfs_calc_qm_dqalloc_reservation( 700xfs_calc_qm_dqalloc_reservation(
652 struct xfs_mount *mp) 701 struct xfs_mount *mp)
653{ 702{
654 ASSERT(M_RES(mp)->tr_write.tr_logres); 703 return xfs_calc_write_reservation(mp) +
655 return M_RES(mp)->tr_write.tr_logres +
656 xfs_calc_buf_res(1, 704 xfs_calc_buf_res(1,
657 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); 705 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
658} 706}
@@ -729,6 +777,11 @@ xfs_trans_resv_calc(
729 resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; 777 resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
730 resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; 778 resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
731 779
780 resp->tr_create_tmpfile.tr_logres =
781 xfs_calc_create_tmpfile_reservation(mp);
782 resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
783 resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
784
732 resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); 785 resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
733 resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; 786 resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
734 resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; 787 resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
@@ -784,7 +837,6 @@ xfs_trans_resv_calc(
784 /* The following transaction are logged in logical format */ 837 /* The following transaction are logged in logical format */
785 resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp); 838 resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
786 resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); 839 resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
787 resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp);
788 resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp); 840 resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
789 resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp); 841 resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
790 resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp); 842 resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
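
As a sanity check on the reworked xfs_calc_inode_res(), the arithmetic can be evaluated with plausible sizes plugged in. The constants below (a 24-byte op header, a 56-byte inode log format, 512-byte inodes, 72-byte BMBT block headers) are assumptions for illustration only, not values read out of the kernel headers.

#include <stdio.h>

/* Assumed sizes, for illustration only. */
#define OP_HDR_LEN	24	/* sizeof(struct xlog_op_header), assumed */
#define ILF_LEN		56	/* sizeof(struct xfs_inode_log_format), assumed */
#define INODE_SIZE	512	/* mp->m_sb.sb_inodesize for a v5 fs, assumed */
#define BMBT_HDR_LEN	72	/* XFS_BMBT_BLOCK_LEN(mp), assumed */

/* Mirrors the shape of the reworked xfs_calc_inode_res(). */
static unsigned int calc_inode_res(unsigned int ninodes)
{
	return ninodes *
		(4 * OP_HDR_LEN +	/* one op header per region logged */
		 ILF_LEN +		/* the inode log format item */
		 INODE_SIZE +		/* inode core plus both forks */
		 2 * BMBT_HDR_LEN);	/* BMBT headers are extra */
}

int main(void)
{
	/* e.g. a link transaction logs two inodes */
	printf("1 inode: %u bytes\n", calc_inode_res(1));
	printf("2 inodes: %u bytes\n", calc_inode_res(2));
	return 0;
}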
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
index de7de9aaad8a..1097d14cd583 100644
--- a/fs/xfs/xfs_trans_resv.h
+++ b/fs/xfs/xfs_trans_resv.h
@@ -38,11 +38,11 @@ struct xfs_trans_resv {
 	struct xfs_trans_res	tr_remove;	/* unlink trans */
 	struct xfs_trans_res	tr_symlink;	/* symlink trans */
 	struct xfs_trans_res	tr_create;	/* create trans */
+	struct xfs_trans_res	tr_create_tmpfile; /* create O_TMPFILE trans */
 	struct xfs_trans_res	tr_mkdir;	/* mkdir trans */
 	struct xfs_trans_res	tr_ifree;	/* inode free trans */
 	struct xfs_trans_res	tr_ichange;	/* inode update trans */
 	struct xfs_trans_res	tr_growdata;	/* fs data section grow trans */
-	struct xfs_trans_res	tr_swrite;	/* sync write inode trans */
 	struct xfs_trans_res	tr_addafork;	/* add inode attr fork trans */
 	struct xfs_trans_res	tr_writeid;	/* write setuid/setgid file */
 	struct xfs_trans_res	tr_attrinval;	/* attr fork buffer
@@ -100,6 +100,7 @@ struct xfs_trans_resv {
 #define	XFS_ITRUNCATE_LOG_COUNT		2
 #define XFS_INACTIVE_LOG_COUNT		2
 #define XFS_CREATE_LOG_COUNT		2
+#define XFS_CREATE_TMPFILE_LOG_COUNT	2
 #define XFS_MKDIR_LOG_COUNT		3
 #define XFS_SYMLINK_LOG_COUNT		3
 #define XFS_REMOVE_LOG_COUNT		2